pax_global_header00006660000000000000000000000064146337561710014527gustar00rootroot0000000000000052 comment=69be18c0232c9dddaae6c715aa9ed1b61bfb10be cccl-2.5.0/000077500000000000000000000000001463375617100124375ustar00rootroot00000000000000cccl-2.5.0/.clang-format000066400000000000000000000150031463375617100150110ustar00rootroot00000000000000# Note that we don't specify the language in this file because some files are # detected as Cpp, but others are detected as ObjC and we want this formatting # to apply to all types of files. BasedOnStyle: LLVM AccessModifierOffset: -2 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: Consecutive AlignConsecutiveBitFields: Consecutive AlignConsecutiveMacros: Consecutive AlignEscapedNewlines: Left AlignOperands: AlignAfterOperator AllowAllArgumentsOnNextLine: true AlignTrailingComments: Kind: Never AllowAllParametersOfDeclarationOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Empty AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakTemplateDeclarations: Yes AttributeMacros: [ '_CCCL_ALIGNAS_TYPE', '_CCCL_ALIGNAS', '_CCCL_CONSTEXPR_CXX14', '_CCCL_CONSTEXPR_CXX17', '_CCCL_CONSTEXPR_CXX20', '_CCCL_CONSTEXPR_CXX23', '_CCCL_DEVICE', '_CCCL_FALLTHROUGH', '_CCCL_FORCEINLINE', '_CCCL_HOST_DEVICE', '_CCCL_HOST', '_CCCL_NO_UNIQUE_ADDRESS', '_CCCL_NODISCARD_FRIEND', '_CCCL_NODISCARD', '_CCCL_NORETURN', '_CCCL_VISIBILITY_HIDDEN', 'CUB_RUNTIME_FUNCTION', 'CUB_DETAIL_KERNEL_ATTRIBUTES', 'THRUST_RUNTIME_FUNCTION', 'THRUST_DETAIL_KERNEL_ATTRIBUTES', '_LIBCUDACXX_ALIGNOF', '_LIBCUDACXX_ALWAYS_INLINE', '_LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS', '_LIBCUDACXX_CONSTINIT', '_LIBCUDACXX_DEPRECATED_IN_CXX11', '_LIBCUDACXX_DEPRECATED_IN_CXX14', '_LIBCUDACXX_DEPRECATED_IN_CXX17', '_LIBCUDACXX_DEPRECATED_IN_CXX20', '_LIBCUDACXX_DEPRECATED', '_LIBCUDACXX_DISABLE_EXTENTSION_WARNING', '_LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION', '_LIBCUDACXX_EXPORTED_FROM_ABI', '_LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS', '_LIBCUDACXX_HIDDEN', '_LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1', '_LIBCUDACXX_HIDE_FROM_ABI', '_LIBCUDACXX_INLINE_VISIBILITY', '_LIBCUDACXX_INTERNAL_LINKAGE', '_LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS', '_LIBCUDACXX_NO_DESTROY', '_LIBCUDACXX_NO_SANITIZE', '_LIBCUDACXX_NOALIAS', '_LIBCUDACXX_OVERRIDABLE_FUNC_VIS', '_LIBCUDACXX_STANDALONE_DEBUG', '_LIBCUDACXX_TEMPLATE_DATA_VIS', '_LIBCUDACXX_TEMPLATE_VIS', '_LIBCUDACXX_THREAD_SAFETY_ANNOTATION', '_LIBCUDACXX_USING_IF_EXISTS', '_LIBCUDACXX_WEAK', ] BinPackArguments: false BinPackParameters: false BreakBeforeBraces: Custom BraceWrapping: AfterCaseLabel: false AfterClass: true AfterControlStatement: true AfterEnum: true AfterFunction: true AfterNamespace: true AfterStruct: true AfterUnion: true BeforeCatch: true BeforeElse: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false BreakBeforeConceptDeclarations: true BreakBeforeBinaryOperators: NonAssignment BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma ColumnLimit: 120 CompactNamespaces: false ContinuationIndentWidth: 2 EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always FixNamespaceComments: true IfMacros: [ '_CCCL_IF_CONSTEXPR', '_CCCL_ELSE_IF_CONSTEXPR', ] IndentWrappedFunctionNames: false IncludeBlocks: Regroup 
IncludeCategories: - Regex: '^<(cuda/std/detail/__config|cub/config.cuh|thrust/detail/config.h|thrust/system/cuda/config.h)' Priority: 0 SortPriority: 0 - Regex: '^$' Priority: 5 SortPriority: 4 - Regex: '^ - _LIBCUDACXX_REQUIRES(...)=requires (...) NamespaceIndentation: None PackConstructorInitializers: Never PenaltyBreakAssignment: 30 PenaltyBreakBeforeFirstCallParameter: 50 PenaltyBreakComment: 0 PenaltyBreakFirstLessLess: 0 PenaltyBreakString: 70 PenaltyBreakTemplateDeclaration: 0 PenaltyExcessCharacter: 100 PenaltyReturnTypeOnItsOwnLine: 90 PenaltyIndentedWhitespace: 2 PointerAlignment: Left ReflowComments: true RemoveSemicolon: false SortIncludes: CaseInsensitive SpaceAfterCStyleCast: true SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Never SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: c++20 StatementMacros: [ '_CCCL_EXEC_CHECK_DISABLE', 'CUB_NAMESPACE_BEGIN', 'CUB_NAMESPACE_END', 'THRUST_NAMESPACE_BEGIN', 'THRUST_NAMESPACE_END', '_LIBCUDACXX_BEGIN_NAMESPACE_STD', '_LIBCUDACXX_END_NAMESPACE_STD', '_LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION', '_LIBCUDACXX_END_NAMESPACE_STD_NOVERSION', '_LIBCUDACXX_BEGIN_NAMESPACE_RANGES', '_LIBCUDACXX_END_NAMESPACE_RANGES', '_LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI', '_LIBCUDACXX_END_NAMESPACE_RANGES_ABI', '_LIBCUDACXX_BEGIN_NAMESPACE_CPO', '_LIBCUDACXX_END_NAMESPACE_CPO', '_LIBCUDACXX_BEGIN_NAMESPACE_VIEWS', '_LIBCUDACXX_END_NAMESPACE_VIEWS', ] TabWidth: 2 UseTab: Never cccl-2.5.0/.clangd000066400000000000000000000025321463375617100136720ustar00rootroot00000000000000# https://clangd.llvm.org/config # Apply a config conditionally to all C files If: PathMatch: .*\.(c|h)$ --- # Apply a config conditionally to all C++ files If: PathMatch: .*\.(c|h)pp --- # Apply a config conditionally to all CUDA files If: PathMatch: .*\.cuh? CompileFlags: Add: # Allow variadic CUDA functions - "-Xclang=-fcuda-allow-variadic-functions" --- # Tweak the clangd parse settings for all files CompileFlags: Compiler: clang++ CompilationDatabase: . Add: - -x - cuda # report all errors - "-ferror-limit=0" - "-ftemplate-backtrace-limit=0" Remove: - -stdpar # strip CUDA fatbin args - "-Xfatbin*" - "-Xcompiler*" - "-Xcudafe*" - "-rdc=*" - "-gpu=*" - "--diag_suppress*" # strip CUDA arch flags - "-gencode*" - "--generate-code*" # strip gcc's -fcoroutines - -fcoroutines # strip CUDA flags unknown to clang - "-ccbin*" - "--compiler-options*" - "--expt-extended-lambda" - "--expt-relaxed-constexpr" - "-forward-unknown-to-host-compiler" - "-Werror=cross-execution-space-call" Diagnostics: Suppress: - "variadic_device_fn" - "attributes_not_allowed" # The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error. # Temporarily suppressing it, but should probably fix - "template_param_shadow" cccl-2.5.0/.devcontainer/000077500000000000000000000000001463375617100151765ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/README.md000066400000000000000000000260661463375617100164670ustar00rootroot00000000000000> **Note** > The instructions in this README are specific to Linux development environments. Instructions for Windows are coming soon! 
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)

# CCCL Dev Containers

CCCL uses [Development Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and CI. This guide covers setup in [Visual Studio Code](#quickstart-vscode-recommended) and [Docker](#quickstart-docker-manual-approach). The guide also provides additional instructions in case you want to use WSL.

## Table of Contents
1. [Quickstart: VSCode (Recommended)](#quickstart-vscode-recommended)
2. [Quickstart: Docker (Manual Approach)](#quickstart-docker-manual-approach)
3. [Quickstart: Using WSL](#quickstart-using-wsl)

## Quickstart: VSCode (Recommended)

### Prerequisites

- [Visual Studio Code](https://code.visualstudio.com/)
- [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
- [Docker](https://docs.docker.com/engine/install/) - Listed only for completeness; it should already be installed implicitly by the Dev Containers extension.

### Steps

1. Clone the repository:

   ```bash
   git clone https://github.com/nvidia/cccl.git
   ```

2. Open the cloned directory in VSCode.

3. Launch a Dev Container by clicking the prompt suggesting to "Reopen in Container":

   ![Shows "Reopen in Container" prompt when opening the cccl directory in VScode.](./img/reopen_in_container.png)

   - Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.

   ![Shows "Reopen in Container" in the Command Palette.](./img/open_in_container_manual.png)

4. Select an environment with the desired CTK and host compiler from the list:

   ![Shows list of available container environments.](./img/container_list.png)

5. VSCode will initialize the selected Dev Container. This can take a few minutes the first time.

6. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent.

7. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.

### (Optional) Authenticate with GitHub for `sccache`

After starting the container, there will be a prompt to authenticate with GitHub. This grants access to a [`sccache`](https://github.com/mozilla/sccache) server shared with CI and greatly accelerates local build times. This is currently limited to NVIDIA employees belonging to the `NVIDIA` or `rapidsai` GitHub organizations.

Without authentication to the remote server, `sccache` will still accelerate local builds by using a filesystem cache.

Follow the instructions in the prompt as shown below and enter the one-time code at https://github.com/login/device

![Shows authentication with GitHub to access sccache bucket.](./img/github_auth.png)

To manually trigger this authentication, execute the `devcontainer-utils-vault-s3-init` script within the container.

For more information about the sccache configuration and authentication, see the documentation at [`rapidsai/devcontainers`](https://github.com/rapidsai/devcontainers/blob/branch-23.10/USAGE.md#build-caching-with-sccache).

## Quickstart: Docker (Manual Approach)

### Prerequisites

- [Docker](https://docs.docker.com/desktop/install/linux-install/)

### Steps

1. Clone the repository and use the [`launch.sh`](./launch.sh) script to launch the default container environment:

   ```bash
   git clone https://github.com/nvidia/cccl.git
   cd cccl
   ./.devcontainer/launch.sh --docker
   ```

   This script starts an interactive shell as the `coder` user inside the container, with the local `cccl/` directory mirrored into `/home/coder/cccl`.

   For specific environments, use the `--cuda` and `--host` options:

   ```bash
   ./.devcontainer/launch.sh --docker --cuda 12.2 --host gcc10
   ```

   See `./.devcontainer/launch.sh --help` for more information.

2. Done. See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.

## Available Environments

CCCL provides environments for both the oldest and newest supported CUDA versions with all compatible host compilers.

Look in the [`.devcontainer/`](.) directory to see the available configurations. The top-level [`devcontainer.json`](./devcontainer.json) serves as the default environment. All `devcontainer.json` files in the `cuda<CTK_VERSION>-<HOST-COMPILER>` sub-directories are variations on this top-level file, with different base images for the different CUDA and host compiler versions.

## VSCode Customization

CCCL's Dev Containers come with certain VSCode settings and extensions configured by default, as can be seen in the [`devcontainer.json`](./devcontainer.json) file. Users can customize this further without modifying the `devcontainer.json` file directly.

For extensions, the [`dev.containers.defaultExtensions` setting](https://code.visualstudio.com/docs/devcontainers/containers#_always-installed-extensions) allows listing extensions that will always be installed.

For more general customizations, VSCode allows using a dotfile repository. See the [VSCode documentation](https://code.visualstudio.com/docs/devcontainers/containers#_personalizing-with-dotfile-repositories) for more information.

## GitHub Codespaces

[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)

One of the benefits of Dev Containers is that they integrate natively with [GitHub Codespaces](https://github.com/features/codespaces). Codespaces provide a VSCode development environment right in your browser, running on a machine in the cloud. This provides a truly one-click, turnkey development environment where you can develop, build, and test with no other setup required.

Click the badge above or [click here](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) to get started with CCCL's Dev Containers on Codespaces. This will start the default Dev Container environment. [Click here](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=296416761&skip_quickstart=true) to start a Codespace with a particular environment and hardware configuration as shown:

![Shows configuring a Codespace with a custom environment](../docs/images/codespaces.png)

## For Maintainers: The `make_devcontainers.sh` Script

### Overview

[`make_devcontainers.sh`](./make_devcontainers.sh) generates devcontainer configurations for the unique combinations of CUDA Toolkit (CTK) versions and host compilers in [`ci/matrix.yaml`](../ci/matrix.yaml).

### How It Works:

1. Parse the matrix from `ci/matrix.yaml`.
2. Use the top-level [`.devcontainer/devcontainer.json`](./devcontainer.json) as a template.
   For each unique combination of CTK version and host compiler, generate a corresponding `devcontainer.json` configuration, adjusting only the base Docker image to match the desired environment.
3. Place the generated configurations in the `.devcontainer` directory, organizing them into subdirectories following the naming convention `cuda<CTK_VERSION>-<HOST-COMPILER>`.

For more information, see the output of `.devcontainer/make_devcontainers.sh --help`.

**Note**: When adding or updating supported environments, modify `matrix.yaml` and then rerun this script to synchronize the `devcontainer` configurations.

## Quickstart: Using WSL

> [!NOTE]
> _Make sure you have the NVIDIA driver installed on your Windows host before going any further_. Run `nvidia-smi` to verify.

### Install WSL on your Windows host

> [!WARNING]
> Disclaimer: This guide was developed for WSL 2 on Windows 11.

1. Launch a Windows terminal (_e.g. PowerShell_) as an administrator.
2. Install WSL 2 by running:

   ```bash
   wsl --install
   ```

   This installs the Ubuntu distribution by default.

3. Restart your computer and run `wsl -l -v` in a Windows terminal to verify the installation.
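
If the installation succeeded, the listed distribution should be running under WSL version 2. A minimal sketch of the check and its expected output is shown below (the distribution name and state may differ on your machine; the sample output is illustrative, not part of the original guide):

```bash
wsl -l -v
#   NAME      STATE           VERSION
# * Ubuntu    Running         2
```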

### Install prerequisites and VS Code extensions <a id="prereqs"></a>

4. Launch your WSL/Ubuntu terminal by running `wsl` in PowerShell.
5. Install the [WSL extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) in VS Code.
    - Press `Ctrl + Shift + P` and select `WSL: Connect to WSL` (it will prompt you to install the WSL extension).
    - Make sure you are connected to WSL with VS Code by checking the bottom left corner of the VS Code window (it should indicate "WSL: Ubuntu" in our case).
6. Install the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) in VS Code.
    - On a vanilla system you should be prompted to install `Docker` at this point; accept it. If the installation hangs, you might have to restart VS Code afterwards.
7. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). **Make sure you install the WSL 2 version and not the native Linux one.** This builds on top of Docker, so make sure you have Docker properly installed (run `docker --version`).
8. Open `/etc/docker/daemon.json` from within your WSL system (if the file does not exist, create it) and add the following:

    ```json
    {
        "runtimes": {
            "nvidia": {
                "path": "nvidia-container-runtime",
                "runtimeArgs": []
            }
        }
    }
    ```

    then run `sudo systemctl restart docker.service`.

---

### Build CCCL in WSL using Dev Containers

9. Still in your WSL terminal, run `git clone https://github.com/NVIDIA/cccl.git`.
10. Open the cloned CCCL repo in VS Code (`Ctrl + Shift + P`, select `File: Open Folder...` and select the path where your CCCL clone is located).
11. If prompted, choose `Reopen in Container`.
    - If you are not prompted, just press `Ctrl + Shift + P` and select `Dev Containers: Open Folder in Container ...`.
12. Verify that the Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal.

For a proper configuration it is important that the steps in [Install prerequisites and VS Code extensions](#prereqs) are followed in a precise order.

From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:

13. If WSL was launched without the X server enabled and you answer **Yes** when asked to "authenticate Git with your GitHub credentials", the browser might not open automatically and the following error message is shown instead.

    > Failed opening a web browser at https://github.com/login/device
    > exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH
    > Please try entering the URL in your browser manually

    In that case, manually open https://github.com/login/device in your web browser and enter the one-time code.
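
As an optional extra sanity check of the Docker + NVIDIA runtime setup from the WSL shell (outside of VS Code and not part of the original guide), you can confirm that the `nvidia` runtime from step 8 is registered and that containers can see the GPU. This sketch assumes Docker and the NVIDIA Container Toolkit from steps 7-8 are installed; the CUDA image tag is only an example, any recent `nvidia/cuda` base tag should work:

```bash
# The "nvidia" runtime configured in /etc/docker/daemon.json should be listed here
docker info | grep -i runtime

# Run nvidia-smi inside a throwaway CUDA container to confirm GPU visibility
docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
```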
cccl-2.5.0/.devcontainer/cuda11.1-gcc6/000077500000000000000000000000001463375617100172335ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda11.1-gcc6/devcontainer.json000066400000000000000000000034711463375617100226140ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc6-cuda11.1-ubuntu18.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda11.1-gcc6", "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "6", "CCCL_BUILD_INFIX": "cuda11.1-gcc6" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda11.1-gcc6" } cccl-2.5.0/.devcontainer/cuda11.1-gcc7/000077500000000000000000000000001463375617100172345ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda11.1-gcc7/devcontainer.json000066400000000000000000000034711463375617100226150ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc7-cuda11.1-ubuntu18.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda11.1-gcc7", "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "7", "CCCL_BUILD_INFIX": "cuda11.1-gcc7" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, 
"clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda11.1-gcc7" } cccl-2.5.0/.devcontainer/cuda11.1-gcc8/000077500000000000000000000000001463375617100172355ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda11.1-gcc8/devcontainer.json000066400000000000000000000034711463375617100226160ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc8-cuda11.1-ubuntu18.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda11.1-gcc8", "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", "CCCL_BUILD_INFIX": "cuda11.1-gcc8" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda11.1-gcc8" } cccl-2.5.0/.devcontainer/cuda11.1-gcc9/000077500000000000000000000000001463375617100172365ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda11.1-gcc9/devcontainer.json000066400000000000000000000034711463375617100226170ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc9-cuda11.1-ubuntu18.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda11.1-gcc9", "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "9", "CCCL_BUILD_INFIX": "cuda11.1-gcc9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ 
"llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda11.1-gcc9" } cccl-2.5.0/.devcontainer/cuda11.1-llvm9/000077500000000000000000000000001463375617100174545ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda11.1-llvm9/devcontainer.json000066400000000000000000000034761463375617100230420ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm9-cuda11.1-ubuntu18.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda11.1-llvm9", "CCCL_CUDA_VERSION": "11.1", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", "CCCL_BUILD_INFIX": "cuda11.1-llvm9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda11.1-llvm9" } cccl-2.5.0/.devcontainer/cuda11.8-gcc11/000077500000000000000000000000001463375617100173165ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda11.8-gcc11/devcontainer.json000066400000000000000000000034761463375617100227040ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc11-cuda11.8-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda11.8-gcc11", "CCCL_CUDA_VERSION": "11.8", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", "CCCL_BUILD_INFIX": "cuda11.8-gcc11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", 
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda11.8-gcc11" } cccl-2.5.0/.devcontainer/cuda12.0-gcc10/000077500000000000000000000000001463375617100173065ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-gcc10/devcontainer.json000066400000000000000000000034761463375617100226740ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc10-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-gcc10", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "10", "CCCL_BUILD_INFIX": "cuda12.0-gcc10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-gcc10" } cccl-2.5.0/.devcontainer/cuda12.0-gcc11/000077500000000000000000000000001463375617100173075ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-gcc11/devcontainer.json000066400000000000000000000034761463375617100226750ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc11-cuda12.0-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-gcc11", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", "CCCL_BUILD_INFIX": "cuda12.0-gcc11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-gcc11" } cccl-2.5.0/.devcontainer/cuda12.0-gcc12/000077500000000000000000000000001463375617100173105ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-gcc12/devcontainer.json000066400000000000000000000034761463375617100226760ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc12-cuda12.0-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-gcc12", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "12", "CCCL_BUILD_INFIX": "cuda12.0-gcc12" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-gcc12" } cccl-2.5.0/.devcontainer/cuda12.0-gcc9/000077500000000000000000000000001463375617100172365ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-gcc9/devcontainer.json000066400000000000000000000034711463375617100226170ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc9-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-gcc9", 
"CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "9", "CCCL_BUILD_INFIX": "cuda12.0-gcc9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-gcc9" } cccl-2.5.0/.devcontainer/cuda12.0-llvm10/000077500000000000000000000000001463375617100175245ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-llvm10/devcontainer.json000066400000000000000000000035031463375617100231010ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm10-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-llvm10", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "10", "CCCL_BUILD_INFIX": "cuda12.0-llvm10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-llvm10" } cccl-2.5.0/.devcontainer/cuda12.0-llvm11/000077500000000000000000000000001463375617100175255ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-llvm11/devcontainer.json000066400000000000000000000035031463375617100231020ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm11-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": 
{ "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-llvm11", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "11", "CCCL_BUILD_INFIX": "cuda12.0-llvm11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-llvm11" } cccl-2.5.0/.devcontainer/cuda12.0-llvm12/000077500000000000000000000000001463375617100175265ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-llvm12/devcontainer.json000066400000000000000000000035031463375617100231030ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm12-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-llvm12", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "12", "CCCL_BUILD_INFIX": "cuda12.0-llvm12" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-llvm12" } cccl-2.5.0/.devcontainer/cuda12.0-llvm13/000077500000000000000000000000001463375617100175275ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-llvm13/devcontainer.json000066400000000000000000000035031463375617100231040ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", 
"image": "rapidsai/devcontainers:24.06-cpp-llvm13-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-llvm13", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "13", "CCCL_BUILD_INFIX": "cuda12.0-llvm13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-llvm13" } cccl-2.5.0/.devcontainer/cuda12.0-llvm14/000077500000000000000000000000001463375617100175305ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-llvm14/devcontainer.json000066400000000000000000000035031463375617100231050ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm14-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-llvm14", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "14", "CCCL_BUILD_INFIX": "cuda12.0-llvm14" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-llvm14" } 
cccl-2.5.0/.devcontainer/cuda12.0-llvm9/000077500000000000000000000000001463375617100174545ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.0-llvm9/devcontainer.json000066400000000000000000000034761463375617100230420ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm9-cuda12.0-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.0-llvm9", "CCCL_CUDA_VERSION": "12.0", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", "CCCL_BUILD_INFIX": "cuda12.0-llvm9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.0-llvm9" } cccl-2.5.0/.devcontainer/cuda12.4-gcc10/000077500000000000000000000000001463375617100173125ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-gcc10/devcontainer.json000066400000000000000000000034761463375617100227000ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc10-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc10", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "10", "CCCL_BUILD_INFIX": "cuda12.4-gcc10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": 
true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc10" } cccl-2.5.0/.devcontainer/cuda12.4-gcc11/000077500000000000000000000000001463375617100173135ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-gcc11/devcontainer.json000066400000000000000000000034761463375617100227010ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc11-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc11", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "11", "CCCL_BUILD_INFIX": "cuda12.4-gcc11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc11" } cccl-2.5.0/.devcontainer/cuda12.4-gcc12/000077500000000000000000000000001463375617100173145ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-gcc12/devcontainer.json000066400000000000000000000034761463375617100227020ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc12-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc12", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "12", "CCCL_BUILD_INFIX": "cuda12.4-gcc12" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { 
"extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc12" } cccl-2.5.0/.devcontainer/cuda12.4-gcc13/000077500000000000000000000000001463375617100173155ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-gcc13/devcontainer.json000066400000000000000000000034761463375617100227030ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc13-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc13", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "13", "CCCL_BUILD_INFIX": "cuda12.4-gcc13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc13" } cccl-2.5.0/.devcontainer/cuda12.4-gcc7/000077500000000000000000000000001463375617100172405ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-gcc7/devcontainer.json000066400000000000000000000034711463375617100226210ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc7-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc7", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "7", "CCCL_BUILD_INFIX": "cuda12.4-gcc7" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", 
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc7" } cccl-2.5.0/.devcontainer/cuda12.4-gcc8/000077500000000000000000000000001463375617100172415ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-gcc8/devcontainer.json000066400000000000000000000034711463375617100226220ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc8-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc8", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "8", "CCCL_BUILD_INFIX": "cuda12.4-gcc8" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc8" } cccl-2.5.0/.devcontainer/cuda12.4-gcc9/000077500000000000000000000000001463375617100172425ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-gcc9/devcontainer.json000066400000000000000000000034711463375617100226230ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc9-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc9", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "9", "CCCL_BUILD_INFIX": "cuda12.4-gcc9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": 
"source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc9" } cccl-2.5.0/.devcontainer/cuda12.4-llvm10/000077500000000000000000000000001463375617100175305ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm10/devcontainer.json000066400000000000000000000035031463375617100231050ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm10-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm10", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "10", "CCCL_BUILD_INFIX": "cuda12.4-llvm10" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm10" } cccl-2.5.0/.devcontainer/cuda12.4-llvm11/000077500000000000000000000000001463375617100175315ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm11/devcontainer.json000066400000000000000000000035031463375617100231060ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm11-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": 
"cuda12.4-llvm11", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "11", "CCCL_BUILD_INFIX": "cuda12.4-llvm11" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm11" } cccl-2.5.0/.devcontainer/cuda12.4-llvm12/000077500000000000000000000000001463375617100175325ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm12/devcontainer.json000066400000000000000000000035031463375617100231070ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm12-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm12", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "12", "CCCL_BUILD_INFIX": "cuda12.4-llvm12" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm12" } cccl-2.5.0/.devcontainer/cuda12.4-llvm13/000077500000000000000000000000001463375617100175335ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm13/devcontainer.json000066400000000000000000000035031463375617100231100ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm13-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p 
${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm13", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "13", "CCCL_BUILD_INFIX": "cuda12.4-llvm13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm13" } cccl-2.5.0/.devcontainer/cuda12.4-llvm14/000077500000000000000000000000001463375617100175345ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm14/devcontainer.json000066400000000000000000000035031463375617100231110ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm14-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm14", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "14", "CCCL_BUILD_INFIX": "cuda12.4-llvm14" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm14" } 
cccl-2.5.0/.devcontainer/cuda12.4-llvm15/000077500000000000000000000000001463375617100175355ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm15/devcontainer.json000066400000000000000000000035031463375617100231120ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm15-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm15", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "15", "CCCL_BUILD_INFIX": "cuda12.4-llvm15" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm15" } cccl-2.5.0/.devcontainer/cuda12.4-llvm16/000077500000000000000000000000001463375617100175365ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm16/devcontainer.json000066400000000000000000000035031463375617100231130ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm16-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm16", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "16", "CCCL_BUILD_INFIX": "cuda12.4-llvm16" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", 
"editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm16" } cccl-2.5.0/.devcontainer/cuda12.4-llvm17/000077500000000000000000000000001463375617100175375ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm17/devcontainer.json000066400000000000000000000035031463375617100231140ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm17-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm17", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "17", "CCCL_BUILD_INFIX": "cuda12.4-llvm17" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm17" } cccl-2.5.0/.devcontainer/cuda12.4-llvm18/000077500000000000000000000000001463375617100175405ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm18/devcontainer.json000066400000000000000000000035031463375617100231150ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm18-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm18", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "18", "CCCL_BUILD_INFIX": "cuda12.4-llvm18" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], 
"customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm18" } cccl-2.5.0/.devcontainer/cuda12.4-llvm9/000077500000000000000000000000001463375617100174605ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-llvm9/devcontainer.json000066400000000000000000000034761463375617100230460ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-llvm9-cuda12.4-ubuntu20.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-llvm9", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "llvm", "CCCL_HOST_COMPILER_VERSION": "9", "CCCL_BUILD_INFIX": "cuda12.4-llvm9" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-llvm9" } cccl-2.5.0/.devcontainer/cuda12.4-oneapi2023.2.0/000077500000000000000000000000001463375617100204755ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/cuda12.4-oneapi2023.2.0/devcontainer.json000066400000000000000000000035531463375617100240570ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-oneapi2023.2.0-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-oneapi2023.2.0", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "oneapi", "CCCL_HOST_COMPILER_VERSION": "2023.2.0", "CCCL_BUILD_INFIX": "cuda12.4-oneapi2023.2.0" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ 
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-oneapi2023.2.0" } cccl-2.5.0/.devcontainer/devcontainer.json000066400000000000000000000034761463375617100205640ustar00rootroot00000000000000{ "shutdownAction": "stopContainer", "image": "rapidsai/devcontainers:24.06-cpp-gcc13-cuda12.4-ubuntu22.04", "hostRequirements": { "gpu": "optional" }, "initializeCommand": [ "/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" ], "containerEnv": { "SCCACHE_REGION": "us-east-2", "SCCACHE_BUCKET": "rapids-sccache-devs", "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", "DEVCONTAINER_NAME": "cuda12.4-gcc13", "CCCL_CUDA_VERSION": "12.4", "CCCL_HOST_COMPILER": "gcc", "CCCL_HOST_COMPILER_VERSION": "13", "CCCL_BUILD_INFIX": "cuda12.4-gcc13" }, "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", "mounts": [ "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" ], "customizations": { "vscode": { "extensions": [ "llvm-vs-code-extensions.vscode-clangd", "xaver.clang-format", "nvidia.nsight-vscode-edition", "ms-vscode.cmake-tools" ], "settings": { "editor.defaultFormatter": "xaver.clang-format", "editor.formatOnSave": true, "clang-format.executable": "/usr/local/bin/clang-format", "clangd.arguments": [ "--compile-commands-dir=${workspaceFolder}" ], "files.eol": "\n", "files.trimTrailingWhitespace": true } } }, "name": "cuda12.4-gcc13" } cccl-2.5.0/.devcontainer/img/000077500000000000000000000000001463375617100157525ustar00rootroot00000000000000cccl-2.5.0/.devcontainer/img/build_button.png000066400000000000000000000126001463375617100211510ustar00rootroot00000000000000‰PNG  IHDRÐ ‹iCCPkCGColorSpaceGenericRGB8U]hU>›¹³+$΃Ԧ¦’þ5”´lRÑ„Úèþe³mÜ,“l´AÉìÝi&3ãü¤i)>AÁ¨à“àÿ[Á'!j«í‹-¢´P¢ƒ(øÐúG¡Ò ë¹3³»“¸k½ËÜùæœï~çÞsîÞ ¸,[–Þ%,®-åÓâ³ÇæÄÄ:tÁ}Ð }Ð-+Ž•*•&ã¿Úíï ÆÞ×ö·÷ÿgë®PGˆÝ…ج8Ê"âeþŲ]€AûÈ ×bø Ä;lœ âõWžð²Ï™‘2ˆ_E,(ªŒþÄÛˆç#öZsðÛŽ<5¨­)"ËEÉ6«šN#Ó½ƒû¶EÝkÄÛƒO³0}߸ö—*r–ᇟUäÜtˆ¯.i³Åÿe¹i ñ#]»¼…r ñ>ÄcU{¼èt©ª7ÑÀ+§Ô™g߃xuÁ<ÊÆîDüµ1_œ u~Rœ æàâ*-°z÷#°Mi*ˆËWh6Çòˆø¸æf}î-gi:×Ð9¥fŠA,î‹ãòV§>ÄW©ž—Bý_-·Æ%=†^œ tÈ0uüõúvW™â’9 Œ%/VµñBÈWµ'¤_¶tâÜÈMÛ“ÊŸ¿ŸåP““í\>ĘÉ@Á„yì0`D i|[`£§ èh¡è¥h¡øÕàìßÂ)ùþ·TjþÈëèÑ0B¦ÿ#ðЪÂïhU#¼ ~yh«uÐ fp#Ô1I/I’ƒø"“ä0!£ ’'ÉSdŒdÑ:J5Ç–"sdó¹ÑÔy#RŸ7‹¼‹èwAÆþgd˜à´ÏÅJŸ7ØÏØÏkÊ•×n^:}nW‹»FVŸ»Ösét$gj-tÈÚÔrÏÿÂ_ç×°_ç7Z þ~ëÛV·5ë4ÌV }ºo[ÄGó=Nd>¥-Ula³£¢Y5VúË}¹x»g[üä÷É?’kÉ÷’&ãÞä>áÎsŸrŸq߀È]à.r_r_qsŸGjÔyï4k± æi—QÜŸBZØ-<(d…=ÂÃÂdKO膄 
[binary PNG image data omitted]
cccl-2.5.0/.devcontainer/img/cmaketools_presets.png
[binary PNG image data omitted]
”Ü/(Cƒ®#@èÅs ýýCÔ<ɰp®ùޝã|æÌ™Î!æ<×ða€RlÆ ³k®¹ÆEèó\8å™4xYVð!ƒ÷ J>žÉ÷”‰€ˆ€ˆ€ˆ€ˆˆ‚güzá…ºñòèÑ£ÝØ§/vÅW¸Ú Î8{äÈ‘nLZ±bE×Ç<Á1äqblz($€eNAòÔÉ“'»gŒü!Ò‰æ8®ýX=²]ðœÕ²Èö0Æçã…_‰lëøê«¯vŒXÝ»aÃ'¥IÐ Á;Œí6lèæ\“ž?‘’ä@IDATü°[Å­ÏDú‹v}f•ñÁÉEÉgaõ+" " "I åZÇÈ:I€èîo¼ÑEšÇl”F‘ïL*¼ Í({½zŸÀ -GàÑæ8¹Ñ|÷ºL@X*Ë`2¸ô–DWDÔàùÇ{Ì9ä»wïLÛ¤vLò, ‡<JôߪU+÷<¾ŽgòÑG$¼â™™øzýŸ  ÔžQu" " " " ‡H™”àø:¿ÊòÕW_uÎvVbâ ö²#Œ›IþJäö Aƒ\ÐÁ-¤L‚QOPÊ×_m/¼ð‚[‘9fö÷®S§NXä¶/îé—•°ô™c~áW¾Fë‡U¬ÎûH‹õümÚ´qãuÉFšgíZÊzè!'“ÉØ8ßY…Àòmyc…nZ–HÁ¾’Éß¿o°Vøò÷S”|ŠŽE@D@D@2“€"å3@—(v¢Â‰¾I¯]B8ªq\3‘¨\¹²ëŽ=é>š‡¶ƒ`,R’s&è>údWÞiï.üÅ‚ñQ T©Ršøy´êý’aU¥J•B²;8Ûѧ'úÿ»ï¾sV¥³qãF§¥_¨P!·Ø_¯½ˆ€ˆ€ˆ€ˆÀÑM€À6oHµx#/QбÌÊKœÁÞïÛ±?ãŒ3Ü« Žo¤Wˆ¢Çyïeh‚}øãöíÛûØ{žÍË8Æl¨xóÍ7gâ¾óß5µnÝÚØ¢ó ߆wu?Þ;Ú5þZÆë÷ß¿“¥Dây"Û7ß|³!•ãsGùë#÷ñö¼.™üùxi)ÉD@D@D@D + „‡Ogåó{á¿ë®»\ä÷wÜ‘î·a@ËÀ”%“,ŸõÑðtˆž¥©h¼UR»vmw¢|°ÈWL80_ïNRùÁùOß$„MÔpÈ“Ì*Ò˜ÔøI‘ôïÙXLô Vð‚ä´2Hƽ~ìëú´Úp}jùXýmå8á üA[?Ò!ïY¤å÷íØÇÓ_°½ŽE@D@D@DàH" §|:þšÞ!OÒTä[2j8ÛÑYGƦqãÆ¡î8Fòeâĉ.*Þ/)õÚ‘Ó¦M µEÛG7o!¥eD÷“à•Hz’>-]º4­KÂêÑÂGgÞëÄSÉG¢g|¤?Qô$Ùš:uª­_¿Þ½ƒïO<Ñéd-ï£ÿÃ:׉ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€¡ä”OðëòH³5ì&¬9ä>‘j0a,ŽoœìÔÑÆ[µjÕ ™’U <ØIÖàõ—_~qÚŽ¾]j{–ùâ4ðÁ]þ3Ï<ãœê©]¬CË’ëYòûÉ'ŸØøñãí‰'žpÏJ‚$o8ÝqøóþcìоÇÏ‘‰€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆÀÑB Ë5å‰ö&(ò'©Q×Hž)~¨Ø† œd ò{ï½7iåÓ, j´U+¢èƒå0$ʽOŸ>6vìXLje·\p]}õÕ =W… \´ÿsÏ=g={ötŽußA¬e©Ô—.]Ú%Ó"ÙÓСCÝ%DÝ“0‹gö†þ£>rÑñ¾?V¼ñÆN+?­åƾíE@D@D@D@D@D@D@D@D@DàH ã #ø@V½Îdœ´åÊ•‹ë–D¥“ 4½Žy’¤&ÓHEtúUW]•Ìn3Ô²5¿ýö›‘¼)»l÷îÝöÇ8müìzÝWD@D@D@D cWbì³gcÕ&c=¶}ûöYÁ‚Ce›7o¶5jdìÆºZD@D@D@D@Žä%°9K#åqã_¼x±K`šk¤[à—,YÒˆP?,oÞ¼‡”C&DŸg§Cžg º?áO™LD@D@D@D@D@D@D@D@D@D %,uÊûÛûèmO™ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€ˆ€I”èõHúkê]D@D@D@D@D@D@D@D@D@D@i銔'9hžYOKô|®\¹RíŽ628’d©S~Ë–-ö믿ZµjÕÜ–ȵk×*QlZT/" " " " GI“&ÙôéÓí¾ûîsoÕ»woC³]»vöÇX¯^½ì¬³Î²ÓN;í°~ëåË—[Á‚­L™2)ÞcóæÍ¶}ûv#§×®]»ŒyQÙ²e]ûÈÆ;wîtÒ¢*T°9rØêիÚ)RÄñ nذÁõ, @Žƒ¶qãFûá‡Ü3ÔªUËI•ë=NvñÜVóæÍ3î]£F #Ì"yëwß}çæ¬Ç|dµÎE@D@D@D òÔ)¿ÿ~›9s¦íٳǦf Búé'㙈€ˆ€ˆ€ˆ€ˆÀÑAà矶ٳg»—Å ?kÖ,+\¸°sÊ8pÀæÎkG‚£´K—.î{öì™Â1?räH›0a‚½ù曆ƒ¾sçÎÖ¼ys»ùæ›Sü :Ô&Nœh=zôpü®]»¦h“/_>»þúë­E‹®î­·Þrö ƒ rk><ùä“î9ÕÖ¸qcëСƒåÎØ”2ÙýŸ)µãï¿ÿÞž{î¹°&µk×¶ÿüç?aï@Úz+^¼¸uëÖÍŠ+æ‹´ Hl•áÛ™s²ûAvºS" " " " " G(¾ýúõ sšI¯JRß¾}­{÷îQ#¶y×cŽ9ƈ‚Ÿ:uªµoß>¬)¦M›fåÊ•³c=Ö/^ìðœwÞyÖ¨Q#w<þ|çàýõ×]Ä|½zõ¬M›6vþùç»úmÛ¶ÙË/¿ìœìÞiOäD–ãÞºu«]yå•Ö¤I·ZaôèÑöí·ßº÷ß¿ë'žŸd÷Ï=iCÀÎv8ÝsÏ=ÎÁî?f|òÉ'öüÃuÅÇ ò¬Â¸êª«ÜÇ!¸õïßßzè¡xo§v" " " "& ·§‰H D@D@D@D@D@²‹‘òÞÙ|†U«VÙûï¿o#FŒ°•+W«Ü1oÒÇ7œÈK–, kóÅ_Ø‚ Be´7n\èh8èëÔ©ã¶Ö­[[ÇŽ]5²@N~œólD‹cÈãø2öØ;ï¼ãòmÛ¶µV­ZYÅŠ­J•*Ö©S'«_¿¾{¤pâµôö—QþÈî`|ÔàÝ‘ ºé¦›Œç׿þµ¢>ex«Ž¯ÚVpþÚk¯µo¼ÑQ€øòË/·óÏ?ßmÏ;×ôòäþZZ®ZµÊ<ìʰ®R;ëÖ­sõÊ3ìŒÑ¡löíÛ·Û|`mÛ¶uïhCt÷™Ý>`®qÊìVSà8³æëÃïÝ»7³aöÉ/))É”yÙTJGåF•­¯öÎ;ï˜~èxì±Çb¾Dμ^NýåüÜsÏ…}Ó¿—šôjúÑC?püÓ?ý“û?ã:OÿéÚµ«sÖMš4ñÝ,@@ d+(_¿~ý°4Ñî råÊ®[rõ!8²ý˜ÔÙ6•½,²ÛæTjnzYãÞV~ã+³&ó¬” 'É£eãôë×ÏeÀ¨†bN›jë'bÒ#¢z<µD‰Ù:µ>(*»ãÖ[ou¯fë$„ € €@!ÐçreowëÖÍËüq÷9¸uëÖîóô[o½åò={ö´ë®»Î[Gåùš$VÙÒ…¡éƒûï¿ßMj«ûSà8ZöråʹŒv•VQ0Ycôë›o¾1Mºªšû±šÆ+è¯æKÚÄìW°]õß/¸à‚`wh]ß÷:uêÚÞ²e‹ l«ÔM´–èù¢#»}eÊ”1½‚MÿGd¨ÿCjzbA­fÍšnéÿ¨äZ´§1ü– € €@¢Ù ÊG~PÉì¢z¼1ØÒO•¶ÙG¯°'ª»c®¨ÿÿìXRS+¿ü?­èÑý1ÇåÇ=zß}÷™>ô6Ì.½ôÒ]vãÆm"&}IP½F=jšHÛ±c‡›øJ„u¯4@@ÎFMôê³·V¹eÆ«œ‹’\¾ýö[—]¯ ¼š‚Ú ~kn«hAïH#eMËÝèsöáÇ]?VÁð={öØÖ­[]—²¬ÕTêÄW 8òûøS½zu7‰í¸qãL°ªÄJ´¦Œm½?âõ]Az•ÌIIIÉ0\¥v”µ®¦`¹‚Ï ¢ë¸x›²êuœî/ž¦ïIòðÉY‘Ç$r¾¼öŸ3gŽÍš5ËÕÑoÓ¦»UM¨«¦'‚Íóýþà>Ö@@ì d+(¯ºŽYÕTmBï?ÿüs—ÍáopG‹û-½B|yŒËšoùKúæO¾«À–zœU->0ŸH¶I¬›Vv†WUæ¼êZêö¤I“L“.ýÛ¿ý[¬ÃèG@@àœPP>Øü¶é>#;òÉVïÛ·o<,溞@}úé§Ãöë³y°oìØ±.°;yòä°qšU/5Õi9rdØþh ¬ë•£‰8WBž¨õc>ûì3·­<‘MäJ•*¹ ºž*P`ýÙgŸ +ËyLä¶Ž×÷žà“½‘c‚Ûzê7V@^ã9_^úë»–~iܸ±Ý}÷Ý¡· +µ;Ã˧j’X5¿ßmð@È¡@¶‚òÊIOOÏôÒÊÚPSÆHh½I¯Óù¦™k§ó‡N_JÙ(ˆîó ÎûìŠX÷žU¿&Lòµ!äÿ—ùôŸ={¶5mÚÔ:tè:…²O4y”jOª®¦Êé^"›²€ô㉷Ôx}öMY3ª+éëgª_[é‘Í` MeÜ+Gjj¬&‹Ò‡ý€àãôçd‰ € €@~ DÖw÷ßMxWp¸T©R.Á%«ûñuÅ5.¸®ìö!C†„×Ó­ 8k‚Sß ×gç‹/¾Øué;Ò3Ï<ã>»û’(¥K—öó\þá°xÀ^zé¥Ð9ƒé3¿ðóæÍs“¯ªžûW\õ»€jëû{1b„˰_¿~½ 
Dϙպ2þõA?tÈ5Øô]AçTiÓàwà˜ÈõxÏ—WþúÎ3zôh÷4Å£>fçŸnГÁæ”ðûƒûXG@²+1š›Ý3eqœ&uU)šX­X±"6ÿñv¶ê¹ööûªu˜Ž×y CS6ËÀ]ð\ÇæfÓ#µƒ r§T`Ý7}ð¾÷Þ{]fÇ´iÓ\¦ŽÓÕ‡ä`{óÍ7]&Œ‚úzVÇhÒ)ß4©•ö›&µzã7B]úð­ãþú׿šïÔ—e©f½îƒ† € €@A ¨Œ‹Jœø¶páB·ê'âT X“Šë›÷Ýwîó¬ÿ\¬lò5kÖøS¸³ß(Y²¤›ÔSçÓK™æ 4ëSºbÅŠ¡¾äädw¸–þ8%ßÄÛôƒÂ=÷Üã’šÒÒÒ¢æKö¼øâ‹aõУþßÎ>}ú¸µ×^{-³aQ÷)qHIVú~Ùþò—¿¸§ô´¯oÁuß\Æ{¾¼ðWö½~ PrÑСCÝ7Á{Ów0ýð2þ|÷žý>ÿ$„ÿ¿åûY"€ €9ÈV¦|v.x´FÆÇ*ƒçqs3»¨vÙÓfÍj†O§ó”^?5ØU`ë;w6ÕLMMµ)S¦äê}¨î¥êêË„š²4ùSÕªUÝ*T°éÓ§›&(zÿý÷í†n]_™íÃN×¼×#™Ê^Ѻ²vþë¿þ+4&«=Þª6xð`—­¯Iž‚ÙBYÏ~@@òJ@¥d”œÒ½{w÷©>‹ë³³úT“»ªlŒÆ\sÍ5.Ȫòª‘îå*y£d“W^yÅ=ù™!W÷žÙy5Q­&OUÂL´¦Ï÷ ¯ZµÊe§ë©Ú¬šæ«RÙúõãE¬ò8ÑΣ'xe4cÆ W¾T%”¤¤Õ«W»'h}Yé•ÅÿÇ?þÑM>›ÓóE;>»}zÊxøðáîß_Oûzû:Ÿ‚ñz²@MsüéOrß®½öZ÷„’¤ô½LyGÄ@È%|Ë”?Z墘·Ü¸f9ëש¦ýçgá F; ³óDŸ—}úp÷÷¿ÿÝ}ÐÏ‹ë(#ÇO(¤Zó*Ô·o_÷…C™9ú‚¡1*Ulú©ìj O?zÛ£GÔ×ýÆÓ”Q¤29z×ÐWFÉM7ÝÏáŒA@@ Û‘eR‚'òû$U{õÕWÝ\LšËJÁ`?)§Ê<*ÀªÌí¿ýíoî3»2Ñ•d¢²3jÚ¯€²2£ÇŒãJÆD+ ©±*ã˜U¦´Î«’>H­ã²Ó4§Tfå`~õ«_¹Ó*xÙbÝÿï~÷;WªEOÑF6oíXõ©ÌKË–-Ýwý€¡à»‡ôƒˆ&wõ­X±b~5æ2‘óO’S}_òOV|ôÑGöî»ï†^ï¼óNèRºÎõ×_ïÊwêiï5WÚ­·Þà  € ù–)¬B½˜÷ûî½-mÏ¡öÇ·VÙÝ]jŧ™'Ósy§Ü÷ß¿Ë: N”›—Ùºuk¨v»Ï˜6QÔŽ;Â.ëò¾ÓO|¥sÄó%aãÆîPŸEäÏ£? @@¼PŠ^¾)8꛲šƒeM©`°ÆûqZvìØÑ½bQà[Ÿ­UžEÁ{•¢‰Õn¿ýöX»Býº·Ç{,´ÕÊ믿uˆ‚ûÁ÷9èŸÿùŸM¯hMß‚>~ŒÞk¬ëé}G;Æ«'x~øa—!¯§pUZ¦víÚjÌßqǦR9rȬÅ{¾à9rê¯úú¾Æ~ð¼ÑÖ{õêe={ötå?õ”A´ÿ[ÑŽ£@HD óOL‰œ)‹±'‹•‹:âÆNµ¬Ùy¥í7/,;ý8aÔ!a±Î6(7|@^-'{ÊÍËê*Y£GDÕôáUMY.ÁÀº&”Ò£ºÁ¦/Áæ'ÂRÉß‚Y©/¸íƒñ‘ðª, @@Â" ì÷¬ZVct%ðš•¢¹ ¼Oö‰5:«€|ð8eègu¾àøü\×½E&:åçõ¹ €œýEóë-=q0ꥆôhp:ëÂì¥[šÚ’ÿSw¾OûóìÎkDë]<&þ¦r)íÛ·ÏrŸÈ3î=UÞþq¨{dwÂÛUÓ´¢‡vÆuœ2ÐÏÆ¦€¼ÊÉT®\ÙÝc½GQšXµ3›Ôêïÿ»«©¿j*ªi¨™3gÚ!C\Ð>ÖuéG@8ó|ID-õySK½ô§rô:qâ„•-[6Ô§Ï›JÞ !€ € €@Ö«V­r1Ý„2åuÚ½{÷ºZãY_"ãˆòMŽÚúÿ/ãŽ8{Êoœw@>ÎSž‘Ô5_µjÕ,ï]AûÌšžfðO4DŽS9Ù³gÛ7ß|ãð?þø£i¢WeÕûÚô‘ǰ € € € € ¹@ÂAùÌO—ùÞ2kþaÇ’šF­-Ÿù‘f¥öü`:ž–?}úô±jÕªÙ_|a_}õ•û§[·nö»ßý.”9Ÿ?wÂU@@@@@³G _ƒòb+¿ü?ÍZþ!¡À¼òǽùíoë^…þf¹A@@@@@à 8=µjþ¶¢G÷[Ò72•¢‰§iœÆë8 € € € € €g²@¾gÊ{,•¢)µù3;Zãr;Zå";V¡ž,VΊž8h%öÿd%w-³’Û¾¤†¼c‰ € € € € pÆ XP^rEí´Ò맺×/É@@@@@@ |/_“Åý°@@@@@³V€ üYûOËC@@@@(lZ¾¦°ap? € € Px:dK–,±ôôtkÞ¼¹Õ©S'ìæöîÝk»wï¶úõë[Ñ¢áùF'Ož´7ZåÊ•­bÅŠî¸U«VÙÁƒ]Ÿ:J”(aµk×¶"EŠ„7Þ ]cÅŠî:Mš4±äää ÷ï¹wêÔ)Û°aƒ•/_ÞªU«vhfû² € P`å Œž #€ € €@4cÇŽÙˆ#lõêÕa»kÕªeƒ Õ?ùä››0a‚UªTÉRSS­]»v.K_ÙöÆ s§˜2eJ†S)ûèС¶xñâ û‚*±£q{öì v»õ·ÞzËýðÜsÏe(““Ù¾ '¢@@ @²—^R ·ÌÅ@@@àlPPZ­oß¾ÞâUW]eeË–5és«íܹ3ìT> ®Àw´lüüÑïÝ»·©\M°©æý<`'Nœv»õãÇÛºuë,òz‘Œ×¸£GFî²>}ú¸¢ÕÀÏl_†Ñ € P åäWÆÈÉ”hÿüóÏîKE™2eBý…ueýúõööÛoÛ­·Þša’­ÂzÏÜ € €À™%°víZ7«Ÿœ5x÷ÊtïØ±c°+¡uÕÿæ›o\­ëZ*;sÞyç…γ}ûv—¯ÌöhAù5kÖ¸±5 \iÓ¦Mp3W×/»ì²˜çËl_̃Ø € P å`ß¼y³«5©ì=¦šÓ¶qãFWoRY-ݺuËééòüxý ñý÷ß›ÙU @@ÜP&¹JÃäe›4i’;½Ÿ¤µB… ¡ËiUõÇJšQÐ^-ȱ²{÷nÛºu«ëUÙµ+V„>Këª=¯Úöû÷ïí×Êwß}ʪ×$¶ñ\Ï€? 
€ €…^€ |ÿD Dßwß}öøã»z‘—^ziG3@@²¨U«–›ä5«qÙÙ¯Lû‹/¾Øî¿ÿ~w¸‚ï+W®´_|Ñâµ?V0Þ_¯^½zöå—_švbeËû±³fͲɓ'ûM·\´h‘饦@ûÈ‘#mìØ±.kßuþïŸqãÆ…6»téb·ß~{h›@@3[€ |‚ÿ~š JÖ}`þ’K.Ið ‰W]I}pß²e‹éKЉõ%t”µ>þ|ÓÕ«W\}ªq©þ#GŽØìÙ³­U«V.çÛo¿µºuëÚå—_žá‘\]kÉ’%.[G_24ÁVä}yùꫯÜÕóÔSÑ/Ý + € €Ä) Ï  ”ûϾÁÃN:eË–-³*Uª¸Ï³þ3¨²Ñ•Ml>C=)))ض®Ïõšôµk×®¦ZñM›6 Ûm#99Ùu¯^½:jP^ýª)ß¼ysÓwý vøða{æ™gLÂú§dK—.íöiB[Ÿ)¿páBûøãíÎ;ï eÇW®\Ùã € €ÀÙ!@P>ÿŽúíó ÎçeÝH}¨1b„©¦e‰%ÜRuÝ{ì1kܸ±{¤u„ ¦GnƒAù÷Þ{ÏôDAy}ÀטÏ>ûÌÕ°×½+°þꫯº/þQX}ÁyöÙgˆ£ó2$l2­ñãÇ»Ì ?æý÷ß·§Ÿ~:Ï3ÎÆ?‡ € €œa­[·vAé1cÆØðáÃ-8©éôéÓí­·Þrž*ɤeË–îÝÍ™3ÇZ´höN•”¢æÇ„íŒØP‰FæÕøWP=Z=yí×gp}~çw\rJ0`®24O>ù¤)qGAyýhà8ðåkÔoÒ¤‰NjJtñ“×îÚµËõkŒÿœÈ  € €ÀY!@P>›ÿŒ)))ay}yÈ‹¦/ ®?üðÃ.K~ݺu¦šöS§Nµ{î¹'¡KîØ±ÃÏk×®í2ïõEçóÏ?·n¸Á8`Ï=÷œ î?ñÄ.ÓhñâÅ6jÔ({óÍ7mÀ€¡k¥§§›Ž-_¾¼ýãÿpäΛ7Ï~ûÛ߆ư‚ € €@v\ïÞ½»)Ÿššê&vÕS¢zšsîܹîóª>‹«é)R¹•]®20W\q… â§¥¥ÙÒ¥K]&»>ûú¦ÄMô:mÚ4×¥øòåË]‰ϴ×ç^}6Öç]=yÙô©JZêsòàÁƒ­sçÎ.È®Ïé¾TM¯^½"c@@AùEâ+ú®É›ôeaÊ”)‰Ÿ Ž#þýßÿ=lTƒ Üeá$Úô¨¬Ÿ õ²Ë.³råʹ/% Êëa}Iù·û·Ð£¿zàÑGuÚ¯Õ£G÷Cú~ó›ß¸/6la@@l Ü|óÍ.[]Áu=ñéÛ…^hýû÷wÉ1¾OO*€® ½ñjÊdWÙ˜{ï½× [Ï©IeÛ·oÚ¯c³júœ¬ïúü 'WuMÕlzêõüóÏÏ2û]™÷Íš5³’%KFž"l»X±baÁ™í Žc@@ `ÊçÀ}ûöíö÷¿ÿÝ®¹æšœ%óC÷îÝkÿýßÿmz¤V¼*p®æ—™¾·fÍšaÊ8RÖ»Úš5kÜ2òQZÕØŒlÁó”-[Öe©¼ @@ܸí¶ÛL¯mÛ¶¹Ï¬ tG+)£>Mܪ²37nt—¯_¿~XÙOú\USƽZ°lN´cºtébz)«^“¾êš‘s1Ó}ꄬšêÚ+é'«öÚk¯Å’Ù¾˜±@@ ßÊg“Zy}øWVÍÝwßͳd}˜JÉìܹÓ:uêäêRªÆûóÏ?ïJÇŽ Š«f"ÍO€¥‰cƒu1£#žì¡hÇч € €@¢¾ÖzVÇ)ˆ®§JsÚ² ÆGž_%•OC@@ ^¢ñdÜÿ ø€üå—_îŸý¿=¹»¦ÌxÕ×ã±wÜq‡©üŒ&ŒRÓ#©j Ò«­ZµÊ-õçàÁƒ¶gÏžÐv<+þ‹„öMÙFùË_ìoû›ïb‰ € € € € 2åÄóùK.¹ÄMð”àáQ‡kBU_FÆЄTº†UmÌ™3gºuM,]£4š€J“[)Ã]õú ¦ü¹âY6jÔÈM„¥ d?-ZäjsöìÙ3žS0@@@@@ ‚òYwÿòË/®d2×Ü•­u_FAw?)•?‘2â”×DQ/¾ø¢«+¯}íÚµsõ-ƒÕ0À^xáûðÃÝámÛ¶µ£G†&~ Žõç×R×÷÷ 1ƒ ²Ñ£G»ø×¾nݺÙõ×_ï‹uíÌlŸ;˜? € € € € €Vät†ô©³ÕAè¹Ùèž4i’õîÝ;7O×¹TŽFñ™MϘx.¦€¼&˜Íª¶|<çb  € €À™! Ò…jZêó –ziî"=I©—æ-*[¶l¨o÷îÝÖ¬Y³3ã r— € €°€J+æJ¦|ÿ%K–,€¼n±R¥JYÞiAùÜÖå| € € € € €)pPÞßìŠ+ìèÑ£~3×– 4pAù\;!'B@@@@@ d;(_HîŸÛ@@@³T@óL-Y²ÄÒÓÓ­yóæV§N°wºwï^Û½{·Õ¯_ߊ-¶ïäÉ“¶qãF—ð£§qÕV­Ze %•(QÂj×®íù²t %+é:Mš4±äää ÷vS¹´qêÔ)Û°aƒ•/_ÞªU«õ¬²Û¶m›{oJ|¢!€ €‚ò…çß‚;A@@ÓÇŽ³#FØêÕ«Ã_}õU+V¬X¦'™3gŽ ÈëI€¼˜,Ó‹³@@ Kðg<³Î@@@ ïÒÒÒléÒ¥Ö¶m[»ë®»\Öy5¬gÏžvË-·˜Ê²Ì›7/[7 7*3sÁ¸×ÕW_3Ó<ÖöíÛg&L°J•*Yjjªµk×Îeé+Û~ذaî°)S¦d8\ÙÿC‡µÅ‹gØìP‰Û³gO°Û­¿õÖ[çž{.f™ÝŸ²øUJ§M›6ÎA € €@Á d/½¤àï›;@@@³P@Aiµ¾}ûfxwW]u••-[֤ϭ¶sçΰSù€¸ßѲñüñG7¾wïÞ¦r5Á¦š÷<ð@ÔìôãÇÛºuë,òzÁãµ®`¼Æ=z4r—õéÓÇýP¤H‘ û|ÇØ±cÝõÿð‡?Ø;ï¼ã»Y"€ €"‚ò…èƒ[A@@à\X»v­©ìŠŸœ5è¡L÷Ž;»ZWý÷o¾ùÆ•ÁѺ®¥²3çw^è<Û·owÙøÊl”WvµF…Ž ®äevúe—]¼T†u=a ‰qõƒAõêÕ3ì§@@ pº ü©S§ ‡ w € €ù. 
Lr•†ÉË6iÒ$wz?Ik… B—Ó$ªê/S¦L¨/¸¢ ½Z0Ü\ß½{·mݺÕu©ìŽÚŠ+Lõjº†jÏ«¶ýþýû]Ÿö«}÷Ýw¡¬zMb›Õõ”‰ÿ—¿üÅãûÛߺsð@@ p º üO?ýäê#úÈ…“»B@@¼¨U«–›ä5/έLû‹/¾Øî¿ÿ~wz}çX¹r¥½øâ‹.¯ý±‚ñþ~êÕ«g_~ù¥mÞ¼9f¶¼;kÖ,›C=)))ض® ¼&}íÚµ«©V|Ó¦MÃöGÛЪj«W¯Ž”Wÿ‰'ܵšHV?¨>|ØžyæÓ„°Ýºus}¥K—vKMhë3å.\hü±Ýyç¡ìøÊ•+»q™ý™6mšKn Ö‘×úááÉ'Ÿ´_ýêWÖ¹sçÌNÁ>@@|(šO× ]Fžõ¡Wfýc©ú0ªš‡­[·6ed•:+ € € pV è;Ú˜1c,²´åôéÓ]f¹‚öj-[¶tË9sæ¸eðÏìٳݦܹþý÷ß»@¿úuM% Åj7¿Už&ØT†Fpݧšêâ7iÒĽ|0_Kß§ïFjš¸Ö÷i©ÜŽüÁÁ ˆø£ó*‹ÿÈ‘#¡—š4–'‘#ÀØD@ P _3åõ¡S™"úP¨ ’_~ùÅeÈ+CEÙ.5kÖtôW^y¥-X°ÀM°T€6\@@òY E‹Ö½{wØNMMu»V«VÍM`:wî\Sý÷””wWʤW0ZÙå*sÅW¸Ò-iii¦IO•u_»víÐ;P`Z½*«\m×®]¶|ùrWŠÆ¾ ྣèGR¥J…Žõ+J ºï¾ûlÔ¨Q®TŒ²Ï@_·n]¨TM¯^½üð|[ªlMd{饗ì믿v?Dîc@@ àò-(¯ÉŒ|@^w•«ñMwîØ±Ã½ÒÓÓÝ#¤ ÌÏŸ?ß=æéDZD@@³_àæ›ov%`\Ÿ0aBè _xá…Ö¿—©î;{ì1—U¿dɈW¿¯ï½÷úaaËà9õônûöíCûulV­M›6®Æû”)SL/ßôô¯®©ŒõÈV¢D ;ÿüóC%i"÷ûm•ªiÖ¬™•,YÒwE]+V,Ì!ê ÓGC@(\ENgœJä–T‹Pug̘á2Þã9VlUOQKe·òÁãUg^µ5ᑲMT[^™+Ùm‹/Î € €À9%àKÅh©Œr-õ:vì˜+ç¢R(J¦ÑüP¾Oå[@Î˶mÛ6SâŽÝÅ‹ÇÎ)Ò½nܸÑÝJýúõ³=٩Σïd©ún£Úíº&e8@@b¬ZµÊ”„‘uHŒ$Ò­‰Ê Ù¾}{Ì€¼>d+óEmÆ n©¾x2UÜ`þ € € €ÀY' ï*C“Y@^oZAt%ùèo@=–ŽMäøòåË» xòÑ4éC@ˆ&/AyeÖ«©Dš>°¶jÕ*pWf¼&túòË/]]ÇC‡¹l=jɇ[GÆ@@@@@³@ öóŸY¼¹† ºÇX3¦‰\÷íÛš iÏž=n¸}­X±¢µmÛÖM´¤Ç^÷îÝëùÔºÕX=«É•<˜Ùe؇ € € € € €À!pP^5ß•ùOíÈüÑå9â0ˆW^5(¿øâ KII±-ZزeËܶ&wÕ£©k×®5M¸¤vôèQ·ä € € € € €gº@ÂAùo¿ýÖ~þùç¸Þ·/=£@¾ZõêÕí§Ÿ~rëÊ´WF¼ðj Ô/X°ÀM¥ã”%¯‰¤4© @@@@@³A á üÉ“'mÛ¶mq½÷¦M›ººñ¯ »jÇ—+WÎT+^Ayქi4F­~ýún©¾®GC@@@@@àlH8(íM—.]ÚMÔ¹¯B… .¿~ýzûî»ïìâ‹/6•¨IKK³9sæD­IŸœœl æ>|ØTþ†† € € € € €ÀÙ"+Aye¾«4M´vþùçÛæÍ›mÓ¦M.k¾U«V.0¯²5»vírº/^ܪT©b5jÔ°zõê¹€¼²è:í”ô!€ € € € € pF äJP^uß§OŸ@µáÕ6nÜè– Ì7oÞÜ­«®|‘"Eܺþ(C^yjɇHXA@@@@8Kr%(¯Àº¯Ÿ•‹óÊoРU­ZÕÕ˜?~ü¸8pÀM «¬zmÓ@@@@@8Ûr%(Ÿ(Šðª1OC@@@@@à\(z.½YÞ+ € € € € €)@P¾ õ¹6 € € € € €À9%@PþœúçæÍ"€ € € € €¤@Ô”/È7̵@@@àÌ8tè-Y²ÄÒÓÓ­yóæV§N°ß»w¯íÞ½ÛêׯoE‹†ç ”Ïgp.‡ € €™ ;vÌFŒa«W¯X«V-4hP(¨þÉ'ŸØäÉ“mÔ¨QVµjÕ°± Ø2Äzôèa7ÜpƒÛ7|øð°1~£C‡Ö¯_?¿×rüøñ6sæL÷”*UÊè~@ð}¹½üå—_ìù矷M›6Y«V­ìÁt?Lè½Æj}ûöµ”””X»éG@Èg‚òù Îå@@@ sÑ£G»€|ûöí]0YÙî ,°‰'º üË/¿lÅ‹gï«LRR’Ý}÷ÝîØž={¶ÍŸ??¡ ü´iÓlƌ֠Aôoܸ±ýðÃöúë¯ÛSO=eO<ñ„5lØ0ó7™½ºÏW_}ÕŠ+v´žxøá‡Ãú´ñÙgŸÙ—_~i5kÖ̰@@‚Ƴàîƒ+#€ € €–––fK—.µ¶mÛÚ]wÝå²ÎkÔ¨a={ö´[n¹ÅTÒfÞ¼yÙ’R‰•™¹à‚ Üëꫯv™æ‰œlß¾}6a«T©’¥¦¦Z»ví\–¾²í‡ æN5eÊ” §TöÿСCmñâÅö;TbGãöìÙìvëo½õ–û!à¹çž +“£(Z¶löR¹9Ö­[×½× '£@@ À²—^R`·Ë…@@@àlPPZM%W"ÛUW]eeË–5és«íܹ3ìT> ®Àw´lüüÑïÝ»·©\M°©æý<`'Nœv»õãÇÛºuë,òz‘Œ×¸£GFî²>}ú¸⩯l~ý€qÓM7e8 € €+@P¾`ý¹: € €Ö®]ë&`õ“³v¹ìðŽ;»Zׄ¨ß|ó+ƒ£u]KegÎ;ï¼Ðy¶oßî‚ÙÊl”_³fÛ¨Q£Ð1Á•6mÚ7suý²Ë.‹ë| è«Ö~½zõ\ö|\1@@ ßÊç5B@@¬”I®Ò0yÙ&MšäN¯À¼Z… ÜR4‰ªúË”)ê ®(h¯ ä÷×wïÞm[·nu]ÊZW[±b…)£^M×PíyÕ¶ß¿¿ëÓ~µï¾û.”U¯Il㹞;ðôŸ?üÐŽ9B–¼a‰ €2‚ò…ì„ÛA@@à\¨U«–›ä5/ TSþâ‹/¶ûï¿ß^Á÷•+WÚ‹/¾èñÚ+ïïGÙ皪 € €@á(rºfcø3¡…ëþrt7‹/ÎÑñŒ € €À¹"àKÅh©Œr-õR}r•sÑKò²eˆúT¾Eä¼lÛ¶m³ôôtè.^.øØ¦0@IDAT&!€ € € € € s‚ò97ä  € € € € €Ä%@P>.&!€ € € € € s‚ò97ä  € € € € €Ä%@P>.&!€ € € € € s‚ò97ä  € € € € €Ä%@P>.&!€ € € € € s‚ò97ä  € € € € €Ä%@P>.&!€ € € € € s‚ò97ä  € € € € €Ä%@P>.&!€ € € € € sâ9?g@@@r_àСC¶dÉKOO·æÍ›[:uÂ.²wï^Û½{·Õ¯_ߊ Ï7:yò¤mܸÑ*W®l+VtÇ­ZµÊ<èúÔQ¢D «]»¶)R$ì¼ñnè+V¬p×iÒ¤‰%''g¸xϕȸ;wÚòåËíÔ©SÖ²eK«V­ZÔÃúé'ûᇬmÛ¶VµjÕ¨cèD@È\Ê7lØÐ.¼ð¸ÞÉ_|a»víŠk,ƒ@@@àÜ8vì˜1ÂV¯^ö†kÕªeƒ Õ?ùä›|xØùüF‡¬_¿~~3®åøñãmæÌ™¦À¼o¥J•²º|_n/ß~ûm›:ujØi{öìizù¦3RSSmÇŽ®ë7ÞpßÑyä‘lÿáÏÍ@@ çáé$9?Ÿ>|ØÚõAZÙ*ú@¨Ì½Ž9âúNœ8ᶃ`sáÒœ@@ÎÑ£G»€|ûöííü£=ûì³.è¼eË”?~üx¶ßeRR’=úè£îuë­·ºìûùóç't¾iÓ¦ÙŒ3¬^½zÖ¿÷£€ê?õÔS¶~ýú„Îïà… º€üå—_ncÆŒ±çŸÞÝÃĉî9vìX×û“e·nÝìûï¿·>ú(ÞK1@@ r=(¿uëVSüöíÛÝmÏ›7Ï>ýôS÷Z¼x±ëÓ‡TõíÙ³'ß§F@@3M --Í–.]êJ®Üu×].ë¼F.(Ë-·¸¤}ÇÈNSÒÊÌ\pÁîuõÕWÛƒ>˜Ð©öíÛg&L°J•*¹lôvíÚ¹,}eÛ6ÌkÊ”)Ω¤¥¡C‡šÿN”aÀÿv¨ÄŽÆEû®ôõ×_»$'¹èÇ•­éÓ§;ò›o¾qK}[´h‘]rÉ%¦÷§ò=sÞyçÙ{ï½ë²ô#€ €ä£@®—¯ÉÇ{çR € € p– ((­Ö·oß ï쪫®²²eËš‚ô¹ÕTŸ=Ø|@ü¹çž³âÅ3~]úñÇÝðÞ½{›ÊÕ›jÞ?ðÀ¦'ƒ#›²û×­[g‘׋§`¼Æ=z4r—Ý|óÍö¯ÿú¯auëìW+_¾¼[nذÁ-»víê–þOçÎíÝwßuôÕ«W÷Ý,@@ @ ã§Ì¸ .‰ € €H`íÚµnV?9kPE™î;v v%´®ò™Ê(W¹­ëZšUYä¾)Ó\%8쎔_³fÚ¨Q#HزM›6aÛ¹¹¡ìø`Ó{Pm{¹¨¤š&wU‹üáÂooÚ´ÉÊ;"þ € €&@P¾Àè¹0 € €D 
(“\¥aò²Mš4ÉÞÏqU¡B…ÐåT§]ýeÊ” õW|™Î` ?¸?¸¾{÷nSyO5úÕV¬XáêØk]×hذ¡)P¾ÿ~u¹ýZ~÷Ýw¡¬úªU«†ýp ýj/¿ü²mÞ¼Ù=UàÄðeoüöÿŒ4WîFë»víò],@@ H€ |ÁsY@@È(P«V-7ÉkÆ=9ïQFùÅ_l÷ß¿;™‚ï+W®´_|Ñâµ?V0Þ_]“»~ùå—.+[Þ5k–Mž<Ùoº¥ê½ë¥¦ÀþÈ‘#M³*k?ØÆÚìÒ¥‹Ý~ûí¡m­(C^ç¹öÚk-%%%´ÏÿXðË/¿„‚ÿÚéËæøŒùЬ € €ä»Aù|'ç‚ € € K@nÊ·lÙb ÐÛ©S§lÙ²eV¥J«[·®ùlpe£+›<Ø|†zdÉ—àá5é«ê¯«V|Ó¦Mƒ»£®'''»þÕ«W[´ ¼úUS¾yóæn¢Uý vøða{æ™gLÂvëÖÍõ•.]Ú-5q«Ï”_¸p¡}üñÇvçw†²ã5Yk°i"Ù3f˜êÄßxãÁ]¡@¼üTãÞ7m«É† € P°E öò\@@ø?Ö­[»1cƘ‚ðÁ6}út—Y® ½ZË–-ÝrΜ9nü3{öl·éÇ÷E®ÿý÷.Я~]S“²Æj7v5ÜßyçSyš`Sš'Ÿ|ÒtŸjúÑ I“&îåƒùZú> WöºïÓR-¸üÁAïU¶¶oßÞî¸ã76øÇÿPðÑG…ºõ~”µ¯§òº4P袬 € €ÄÈ“Lyeœhò Mޤ,ß´­üPé÷±D@@Z´haÝ»wwíÔÔT7±kµjÕlÉ’%6wî\Sýw_®E™ô r+»\e`®¸â +R¤ˆ¥¥¥ÙÒ¥K]&{íÚµC¨*W£‰^§M›æúT_}ùòå®ÿŽ2`À;pà€éGR¥J…Žõ+ lßwß}6jÔ(ØtŒ‚òj½{÷v?4èGŒÏ>ûÌtík®¹Æ.»ì²à!¬#€ €@‘Ó7†?šÃ)^¼¸«É¨ ="ùè§áìÔ©“íÙ³ÇæÍ›—ëe~øâÅ‹3À^@@p¾TŒ–J¬ÑR/=íªÏôzé)زeˆúT¾EÁà¼lÛ¶m³ôôtèÖwXM÷ºqãF·[IBú>’¦ó¨Å{¼²ê7oÞlºfV“Äfç~rrÌÑ£GmÆ îi‚x~lÈɵ8@@ kU«V™’0bªÍúQG4hÐÀeu(K>2 ¯öîÝkú`­º‰Ê$Ù±cGÔóЉ € €è{Cƒè¸U@@@@@ÈQP¾B… V§N;pà@ŽJÏølùóÏ?ߊÍÑ-%öî € € € € €@> ä(Þ´iSw«?üðC¶²äýûW(9  € € € € €@!ÈQP~ãÆ®–|©R¥rå­:tÈNœ8‘+çâ$ € € € € €6åõfŽ;æ^…íq? € € pf ?~Ü6mÚ”éMW©RÅTFó7Þ0•À¼ñÆ3_Xw*!iÉ’%–žžnÍ›7·:uêäË­îÞ½ÛöîÝkuëÖµâÅÿêž¶mÛfÕªU³òåËçËýp@@à\ÿv. ðž@@@ Pìڵˆ ’é½\uÕUvÛm·Ù·ß~k¥K—. üúõëí³Ï>³=z˜æÄJ¤)©iĈ¶zõê°ÃjÕªeƒ Jø|a'ÉbcêÔ©ööÛo»QÏ>û¬Õ¨Q#tÄwß}g/¾ø¢)0ë­·ÚÕW_ÚÇ  € €@î ”Ï]OΆ € €ÙP€û¡‡ =qâD[³fMX_0˜Ï+6l°Y³fÙå—_žp}ôèÑ. ß¾}{KIIqÇ/X°Àô^”ùå—3d°çôí8pÀtÝ•+WZ™2e\àÝŸóÔ©Söæ›oÚŒ3Ü>ßÏ@@ ïÊç-gF@@J”(a]tQèˆÏ?ÿÜåƒ}¡ÿ»²cÇ[´h‘íß¿ßZ´ha-[¶ Q6»ÊÝT¨PÁQ6z«V­lþüùv饗ZõêÕCc¿úê+×ç;Ö®]ë²ñ9bÉÉÉÖ®];·kéÒ¥¦—š‚éëÖ­³k®¹ÆŠ)âú2û“––æŽmÛ¶­Ýu×]¡¡={ö´råÊÙøñãmÞ¼yÖ¥K—Ð>­(`¾|ùr0`@XäÆ+¯¼âÞS¯^½Âvé½)3¿oß¾Îé½÷Þ íß³g;¿®Ù©S'{òÉ'CûXA@È‚òyãÊY@@@ T]Ùå'Nœ°“'OÚG}d×_½ù ´Jµ+VÌöíÛçîä’K.qµÛ'L˜àõÁ ü”)SBAy­¿ûî»VªT)+Z´¨©ìK³fÍ,55ÕXÿúë¯ÝùÜW]våÜ~ä‘GÜýû÷úÎW­Zåúl*ËS¶lÙ°’2~ÌO?ýä²Üýv¬¥ÎðàÁ »5jdO?ý´éG‰É“'‡í×5xàkÓ¦©, @@ ïÊç½1W@@@<Pv¼¯¾}ûv{üñÇ]`ÞåuIäUƒ¾k×®.À®Ìö̚ʹ|ðÁ¦@öСCÝ1*-3sæLÓ5úõëç&f}íµ×\YMÒª¦ZìzýüóÏ1O¯ì{= P±bÅ cüïØ±c†þÜèhРAÌÓè‡äi € €ù'@P>ÿ¬¹ € €䢀J¾ø IÏ;ï8^{Õv§ ><Ã0• ñM%rt½`Ÿöéé`ߨ±c­dÉ’þ0– € €…@€ |!øGà@@@ ‚ûÈ++³¾C‡¦ R—,YâjÈ«ŽüSO=euëÖ÷¶Jâ¬\¹Ò¶lÙâê»TÙœeË–¹‰iu'žxÂŽ?7ÎTW~È!¡CêÔ©ãêåûtŒ~„Ö´W¹ € €@á(Z¸n‡»A@@òN B… îä~ÒUm'GMOOwAx-;uêäÜ>È­ y°iL°eè׸֭[»ácÆŒ1áƒmúôé6räÈЄ® 6´&Mš¸—Jè”)S&´­~m+ÞÑRת~|°Ò5AeÖ@@Â!@¦|áøwà.@@@ ªU«æJÒ,\¸Ðe•«6ûäÉ“C%^NžÜƒ-@@@@@òL€ |žÑrb@@@@@Âʇ{°… € € € € €@ž ”Ï3ZNŒ € € € € €@¸@ñðM¶@@@‚8~ü¸mÚ´)Ó‹W©RÅ’’’ì7Þ°’%KÚ7Þ˜éøÂºóСC¶dÉKOO·æÍ›[:uòåVwïÞm{÷uëZñâÑ¿ú1åÊ•³êÕ«çË}q@@à\ˆþ)ì\à½"€ € €@¡صk— 2$Ó{¹êª«ì¶Ûn³o¿ýÖJ—.] Aùõë×ÛgŸ}f=zô°Ê•+gz¿‘;;f#FŒ°Õ«W‡íªU«– 4(áó…$‹©S§ÚÛo¿íF=ûì³V£F GèG‚‡~ØŽ9bÍš5³ÔÔÔ cè@@È™Aùœùq4 € €ä’€Ü=ôPèl'N´5kÖ„õE $‡ȧ• 6جY³ìòË/O8ˆ>zôhoß¾½¥¤¤¸ã,X`z¯ Ê¿üòË13سûö8`ºîÊ•+­L™2¦,ýXíµ×^sù¢E©tˈ~@@ §ås*Èñ € € +%J”°‹.º(t®Ï?ÿÜåƒ}¡ÿ»²cÇ[´h‘íß¿ßZ´ha-[¶ Q6»ÊÝT¨PÁQ6z«V­lþüùv饗†•fùꫯ\Ÿ?xíÚµ._ãÉÉÉÖ®];·kéÒ¥¦—š‚éëÖ­³k®¹ÆŠ)âú2û“––æŽmÛ¶­Ýu×]¡¡={ö4•Š?~¼Í›7ϺtéÚ§•3fØòåËmÀ€aý‘¯¼òŠ{O½zõ Û¥÷¦Ìü¾}û:§÷Þ{/l¿ßX¶l™sºîºëlΜ9¾›% € €@. 
”ÏePN‡ € €ù# ÚçÊ.?qâ„úè#»þúëÍ¥Uª¥X±b¶oß>wC—\r‰«Ý>a¨ÖKŸ2eJ((¯õwß}×J•*eÊWÙ_ÊEõ¯¿þÚOÁ}ÕeWP~Ïž=öÈ#¸ú÷ï`ÕªU®_ÁñȦ²‡§ ²*­³yófW"§sçÎîýúó±D@È< Ê—/_Þݵóåóæ³"€ € p® T­Z5ì­×¬YÓm«‹oš<6‘¦ö-·Üâê»6Ì•°Qú=z¸ìùÌÎ->8^{Õv§ ><Ã0• ñM%rt½`Ÿöéé`ߨ±c­dÉ’þ°¨K•­Q©ž›nº)ê~:@@rW Ï‚òÊ’¿ï¾ûBwK›+ € € PÀÁÀ}ä­(³¾C‡¦ R—,YâjÈ«ŽüSO=euëÖ÷¶j»¯\¹Ò¶lÙâê»TÙM´ª‰iu'žxÂŽ?7ÎTW~È!¡CêÔ©ãêåûtŒ~„Ö´W¹œÌšžذaƒËðî¹çBC8àúŸ|òIWþ'ž8¡ƒYA@ÈT h¦{sq§/c“‹§äT € € €@B*Tpãý¤«ÚNŽšžžî‚ðZvêÔɸ}[Aó`Ó˜`Ë,Яq­[·vÃÇŒc ÂÛôéÓmäÈ‘¡ ]6lhMš4q/•Ð)S¦Lh[ýÚV¼£¥&®Uð<Ø—UéGÇT«VÍŽ9ziâ\½Ž=šá^ƒ÷Í: € €@ây’)¬%¼%²åƒ¬#€ € €@~ (ø¬R- .tYåªÍ>yòäP‰¢GmIII®œ‹‚øS§Nu·yÑE¹¥ŸÜõý÷ßwåbT‹]õÜ5ñkrr² <8êÛjÑ¢…uïÞÝ€OMMu»ê~”?wî\ÓµRRR¢›W5jÔ0eÃG¶»ï¾Ûeóë>i € €¹+'Ay‹ÿñÿá&[ºæšk\µå½ K@@ȉ@´ pßW´hæ0À^xáûðÃÝ-´mÛÖöîÝëÖ5/Ö½÷ÞëæÄR­u5ñÿû߇J×(Ý®];[´h‘÷«_ý*4™ª¿w`”?7ß|³›À5--Í&L˜qá…º¬üh÷®¬ùȬüÐýäϬùûóËÌÆF»—ÌÆ³@@ >"§ë†?7ßq1G³äõ˜åСC­OŸ>6gΗõ¡@}~Múºxñâ˜÷É@@ø?_NEKe‹k©—J²¨¶¹^'Nœ°²eˆúvïÞmÍš5û¿“œak{öìqw•p‰ÖTW]MúhÍ›(h¯&·DÙÛ¶msÁveë/ž§ùRÑnŸ>@@|P EÍ”y InJÁw5=Æ©ö׿þÕ-©-ïøƒ € €,P©R%W›=Öm(+ ¯cH÷ym'×xeÜkòWòÒ !€ €çŽ@®åƒYò>~ûöíNsóæÍ.[^GC@@@@@à\ÈÕ ¼ÇóYòÚþå—_\wÕªU]ÝFmDfËkÂ$? ¬–í@@@@@Î2\+\¨@º¶û,yYù ¼&š>}z¨¶|0ïÓø””-ܹT“ž† € € € € €ÀÙ"ë™òÁ,y!íÚµËY)S^í…^pKâ} _“À^wÝuîåvžþ£míW= @@@@@³A W‚ò>¸.`–¼¶?®…Õ©SÇ-—-[ª-ï3äu¼Ú¤I“ÜRùË.»ÌeËßÿý®? € € € € €œé¹”÷‘YòuëÖµ/¾øÂí®Q£†Ê–W‡?&Z@>t+ € € € € €œ9ÊÇÊ’oÙ²¥}þùç6~üx[³fU©R%Ä¥lùÅ‹»í}ûö¹¥JØèå3äµ®€½¯1:˜@@@@@ÎPåýûöïÚîÔ©“Mž<ÙFmC† ±­[·ZÍš5ýP·6l˜[>úè£n©Zó À«¤ò ö_yå•n@@@@@8ŠçäMDË’ïÑ£‡+O“ššjo½õ–;ý¶mÛ¬cÇŽa—RF¼Ú’%KÜRÙó7v»j]MAú`°ßuò@@@@@3T W2å}à¼oß¾¡zñÇwÙîrQP^­T©Rnéƒù*asûí·[ïÞ½Cc¿è¢‹BÆFNëNÀ@@@@@Î@lgÊûÀºÞ³ç>ø )(ÓM7Y‹-\é•¢ÑkÖ¬YަråÊvíµ×†•¨Q@þOú“Û¯±¾©„Íu×]ç7Y"€ € € € € pÆ d;(ïß¹²äŸ~úi»âŠ+¬W¯^¶råJûâ‹/\ ^ï~âV_°`;Ì׌×~ןq¿páBó%lÜüA@@@@@à,ÈVP>˜%¯`|ÅŠíw¿û›Ð5h¢Àº^ʤÿë_ÿj]ºt±={ö¸R5Á y•±iÓ¦;”r5AAÖ@@8wŽ?n›6mÊô W©RÅ’’’ì7Þ°’%KÚ7Þ˜éøÂºóСCn~­ôôtkÞ¼¹Õ©S'_nu÷îݶwï^«[·®/^ÜV†üÒ¥K]P^åkÊÇR¤@@³[`×®]6dÈLßäUW]e·Ýv›}ûí·Vºté ʯ_¿Þ>ûì3ëÑ£‡©Dg"íØ±c6bÄ[½zuØaµjÕ²Aƒ%|¾°“d±1uêT{ûí·Ý¨gŸ}ÖjÔ¨azJyܸq15j”U­Z5æ~v € €$.pP>˜%?}útûóŸÿlMš4‰ye•´QÐÞäƒòü±û ¨ý¾éüæ½K@@θzè¡Ðž8q¢­Y³&¬Oä‚n6lpóf]~ùå ÑGíòíÛ··””w¼Ê|ê½*(ÿòË/» öÜ|0]W¥FË”)–P¥'–«W¯žárûÛß\}ùòå3ì£@@ g 僗ëÞ½»é•U‹V²FÇ\sÍ5%[>  € €œ%J”0Í;åÛçŸî‚òÁ>¿Ï/UzeÑ¢E¶ÿ~kÑ¢…µlÙÒïrÙì*wS¡B7FÙè­Zµ²ùóçÛ¥—^Œþꫯ\Ÿ?xíÚµ.ÿÈ‘#–œœlíÚµs»ô”¯^j ¦¯[·Î}¯)R¤ˆëËìOZZš;¶mÛ¶v×]w…†öìÙÓÊ•+gãÇ·yóæ¹§ŒC;O¯Ì˜1Ö/_n vgXå•WÜ{RbT°é½)3¿oß¾Îé½÷Þ í®T©’élúÑaÛ¶mî»^©R¥‚»XG@È„ƒòW^y¥ËzÏêÚ5²† ºòúМÔU5ä¨Õ4–‰^céÐ € €P}te—Ÿ8qÂNž0²‡êŽQi™™3gºkôë×ÏM̪ï;*µ£IZÕ4Ç–^?ÿüsÌÓ+û^OT¬X1Ãÿ;v옡?7:4h÷it?üðƒû¡A?Ð@@r_ á |¢·¬AO@>Q=Æ#€ € €@,•|Q@^í¼óÎ3eÂúé§.kÝ—d©]»¶) =‘¦Ì{eákBׯ›2ƳÊWúK/½äÊÐĺÖÎ;3”Љ5VYïº5­+à¿bÅŠÐðzõ깿‚辩”ÏáÃ‡ÃÆéGƒxJëøs¨„Ž~ Èêýúñ,@@ȳ ü /¼à&.Rx5ò‰ÿãp € €Ä¨ZµjØÎš5kºícÇŽ…ú5yl"Mì[n¹ÅÕw6l˜+a£:õ=zôpÙó™+Z|p¼÷ªíO>|x†a*Aã›JäèzÁ>íÓÓÁ¾±cÇZÉ’%ýa™.ü×ĺ¿ùÍoÜ„°™f' € €@¶ò,(¯šðÊ*¡!€ € €…M ¸¼7eÖwèÐÁ4Aê’%K\ yÕ‘ê©§¬nݺ‘ÃãÞVIœ•+WÚ–-[\}÷à*›£ïPš˜V×xâ‰'ìøñãnȸqãLuå‡ :¤N:®^~°OÇèGˆ`M{•ˉ·ùZòJ¨¢!€ €ä@Ѽ;5gF@@ —@… Ü )+Ü·àä¨ééé.¯e§N\€Û¹46 ¶Ìý׺uk7|̘1¦ |°MŸ>ÝFŽšÐUós5iÒĽ”ìT¦L™Ð¶úµ­ x?FKM\«úñÁ¾xKר4Žêíÿú׿&K>øÃ: € y–)Ÿ÷Ê)@@@ T«VÍ•¤Y¸p¡Ë*WmöÉ“'‡J¼œMT­õë×Ï4Ö¸qã\·öíÛÛ¶mÛÜzÕªU­oß¾6bÄ{õÕWݾŠ+Ú­·Þ*]S·n]ËÈȰY³f¹~]»v M¦êï!¯ë_wÝun×ÌÌL5jT¨[ëÖ­]V~¤{WÖ|î¬üЉýäÖüýù¥ú.Z´È•ÇùÍo~C–|4<Ž!€ €q(s¸®aøï&ã4p2 3oÞ¼d¸ î@@¤ðåT´T¶¸–z©$‹j›ë•““c©©©¡}YYYÖ¢E‹¤¶¼npëÖ­.k^¥`"µ;wºÝ ÔGjÞDA{5¹E ªG:Wû6lØà‚íÊÖOI!_*/'ö#€ €%E@%5ïüJÊ_”ç@@@ ¤¥¥EíŸW0ÞŸ¤@z0˜^€¼ÆPÆ= @@ ô Dÿ]géóà‰@@@@@H˜Aù„Ñ20 € € € € €áåÃ=ØB@@@@@ aåFËÀ € € € € €„ ”÷` @@@@@„ ”O-#€ € € € € .@P>܃-@@@@@&@P>a´ Œ € € € € €@¸Aùp¶@@@@@H˜Aù„Ñ20 € € € € €áåÃ=ØB@@@@@ aåFËÀ € € € € €„ ”÷` @@@@@„ ¤$ldF@@ pàÀ[³fMÔ3jÕªeÕ«W··ß~Û*T¨`½zõŠÚ?YîÞ½ÛæÏŸoÙÙÙÖ²eKkذaÂoU¾K–,±åË—Û‰'žhÍ›7·””#?:tÈ.\h[¶l±N:YåÊ•~o\@(MG¾+MOϳ"€ € €@Ò(üðÃG½Ÿ=zXïÞ½íÛo¿µJ•*KP~ÅŠöå—_Ú¥—^j5kÖŒz¿¹î߿ߞ~úi[¼xqØ¡úõë[ÿþý <^Ø Q66nÜh<òˆíÚµ+Ô+--Íy×®];´O÷5hÐ Û»w¯Û7bÄ»üòËíÊ+¯ õa@@ 
h勿ÇÙ € € '¸ï½÷ÞÐhcÇŽu™ÝÁ}uëÖ /®••+WÚäÉ“íŒ3Î(p}èС. ß¹sgëÞ½»;ÆŒ¦gUPþ¥—^Š˜½^Ôg8p íÛ·Ïî»ï>kÑ¢…éšo¼ñ† >Üxà7¼2é‡ feË–µ»ï¾ÛêÔ©c Êøá‡vúé§[“&MŠzœ € pX€šòü7@@@¤(_¾¼µiÓ&ôJOOw÷ܧ@q°mÚ´ÉÆoÿüç?]É•à1e³÷Ýw¦Ìö÷Þ{ϾþúkÛ¾}»M˜0Át^°Íž=;¸iË–-sòÑ£GÛ¬Y³BÇ,X`z©)°­±Tî%––™™éÎmß¾½Ý~ûí®l¾d¸âŠ+ìøƒ©¤î1wûì³Ïì¹çžË½ûˆmØÇŒsÄþmÛ¶¹ç½ð íÔSOueºuëfÍš5 ËØŸ:uªmݺծ¾új„WI¾}ûº ½ÊÑ@@â#@¦||@@޲@VV–Ë.ÏÉɱƒÚ'Ÿ|VjEõråʹ@¼n­C‡®vû¨Q£¬ZµjvÜqÇ…îøã?¶Ž;ºm­+ˆ_±bEVÐ_Ùå>ø )°>gÎ×oúôé.«ýâ‹/vÁìûï¿ß½ûôé7¸òÓO?¹Í›nº)¸Û­«,OjjªEú%ÀêÕ«íÇ<âœÜ;4~°#z € € €@ (\y5•µQ&¼/Á¢ oµ X0ðìvæó2ï•…¯²7M›6uåeTb&ZÓD­ªÇ®{Ê«mÞ¼Ùü}åÕÇïWÖ»îCMë øÿý÷þ°üñ.À¯@¿o*ɳgÏž°~úÒ L™2¾‹[*ëý…^põì{õê:¦27U«V mûýª@M&¹ƒû¾K@@ØÊÇnEO@@H"_sÞßR½zõÜêþýûý®OĪ¶ê»¿óÎ;ö裺ò.ªÃ~饗æ;Ñi¤ øÐ^Qà~ñâÅÁ]y®?ñÄG{ê©§BûT"G× îÓAýz ¸ïõ×_w5äý‰ Ükl•­ñÏçÕ®]ÛÕÒ÷Û~©/Ôr{ûã,@@ &@P¾`^ôF@@  Üç~eÖŸ}öÙ¦É_çÏŸïjÈ«ŽüßÿþwkÔ¨Qîî1o«$Žjï_¿Þèƒ'ªlŽ&¥­U«–»Æc=æÊŨÏÈ‘#]©™‡~8tŠ&aU½üà>S³fM Ö´W¹ß” ¯@¼ÊÐø¾þ˜–SÙø{÷î «5¿aÃW>wY›à¹¬#€ €Ä.^,0öóè‰ € €s¾‹ŸtUœ5;;Ûáµ<çœs\€Û¹46õ ¶h~õkÛ¶­ëþâ‹/š‚ðÁöé§ŸÚ³Ï>šÐõÄO´“N:ɽTBGµÞý¶–Ú®P¡BØ>•–iܸqØ>_ºFåoˆWyò€ÏÝ4®Ú¸qãB‡Tg^%rt?4@@ø*S^?kÌÈÈ8¢6aQoIoL5á21h € € €@¼ôYF¥[fΜé²ÊU›ý£> •x9xð  :Ôe†_{íµ¦ þøñãÝm´iÓÆ-ýä®|ð+sî¹çšÊÂhâ×fÍšÙC=ñ¶O9å»è¢‹Lø|ÐMìªûQ6þ´iÓܵºwïñÜ¢ìTfüã?îîQŸã,Xà^~LÝõêÕí¬³Î² &ØÇìêÕ«\Íûï¿ï&~½ñÆ}w– € €E(TP^o\õSIMôã'*â}¸¿Þô)ƒ |Q59@@’-à3ÀƒOé÷åWf¥_¿~6dÈPFxûöíM¥]Ô4Ñiß¾}mĈöꫯº} âßzë­¡Ò5uëÖuIJ³fÍrýºvíJXò÷àNŒðÏu×]ç>CeffÚ¨Q£B=Z·ní²ò#Ý»>#åÎÊXQÐ_AþÜMŸÛ”ñ®¦{Ö+ØT3_Ay5=»lŒWÓþÛn»-⸮ÿ € €X ÌẆΊŒaˆÓO?ÝýÜqòäɦŸAÆ£©Ö¡²F”aòÍ7ßÄcH›7o^\Æa@@Jº€/§¢¥²ÅµÔK%Y”i­—rRSSCûìmÑ¢Å1K³uëV—5¯R0‘ÚÎ;Ýnê#5o¢ ½šÜ"Õ#«}JFR°]IO))…Ê—Êkè"ï—ž¿(5ô‹| € €”0•PÔ@Éõί„!ó8 € € ¼iiiQo.¯`¼?Iô`0½ y¡Œûdm²ÉÏ'YïûB@’]€‰^“ý/Äý!€ € € € €”‚ò%æOɃ € € € € €$»Aùdÿ q € € € € €%F€ |‰ùSò  € € € € €É.TAùXNNN²›q € € € € €J ¥Pg%à¤ýû÷Ûĉ02C"€ € € € €  ÊW«VÍÊ–-x2¾ôÙÙÙÉ!Ä] € € € € € '„å5jdíÚµ+Ôm:tÈf̘aYYY…:Ÿ“@@@@@HF„åÓÓÓÝó®\¹ÒdµU¬XÑêׯoiiiåcE£ € € € € €À1!° ¼ú~øÁTŽ&ÖV³fM”µ?ý@@@@@8VŠ”oÞ¼¹8pÀ@@@@@ˆ—@¡‚ò›7o¶† Ú 'œºJ•*ÙüùóCÛ¬ € € € € € €@¸@¡‚ò«V­²M›6Y™2eLÁø³Î:ËÊ•+çF.[¶¬Ëœ¿ [ € € €@ÁvïÞí²³³­eË–.1(¯âÕwûöí¶eËkܸ±û¼“×õ‚û÷íÛgëÖ­³ºuëZåÊ•ƒ‡¢®ëWÇÿú׿lÙ²eÖºuk»òÊ+£öç  € €@É(TP^®7½j*]ã[­ZµL5áUʆ† € €F@sR=ýôÓ¶xñâ°Óëׯoýû÷wŸ9üxõ8p rÒ¤Iöá‡ÚàÁƒí¸ãŽó—‰ºTÒÒc=f7ÜpƒþùQû¾ýöÛ6eÊ÷ äêÕ«%d}ìØ±–––fÝ»wOÈø‘ÍÌÌ´åË—Ûµ×^é0û@@R)Pè |n­5jØ™gžyÄçÜýØF@@¢ :Ô}®èܹ³ +ñgÆŒ¦ ²‚ò/½ô’¥¤üòQ&^}•µîÇŒvoñ<öã?º ùO<ÏaókòäÉ–žž~TƒòsæÌ±™3g”Ïó¯Â@(q ʧ¦¦Ú¡C‡,+++̱Zµj'€UFËž={\_•¼©R¥Š[÷˰AØ@@@R! Ìê XûöííöÛo=óW\á>3¼óÎ;öõ×_[·nÝ,ž}ý˜¡ aE¥l¾ýö[ëÚµ«}÷Ýw®GÍš5Ëôë€.]º¸óV¯^íÊùlÛ¶ÍN<ñDëСCX¹ýšYpeî+øž‘‘á²ýµêÔ©î×Î?ÿüs¾×;í´ÓlúôéÖ±cǰ_ h_ÅŠÝ~ϳwï^[¸p¡}ÿý÷V¯^=Ósú_èWk×®u]õŒúrE‰\sçÎ5§/]|SyÔÙ³g»}Êæ×˜òѽèï¼k×.ëÙ³§ë®sõä§Ÿ~²æÍ›[›6mÂü˜,@@d(rP>''ǽ¡*_¾¼{Ó¨7yjÚ¯¦zóy5½‘ÒÆvíÚYƒ ºùóÃv² € €%Z@Vµ›nºéˆçìÑ£‡)HµÛÕâÙ7ž%8 5j”}õÕW.(­$¤‰'º@ùóÏ?o[·nµ1cƸÏQ 0kýÜsÏuAùÑ£G»ùºTÛ^Mwå?ûì3Ój~/ÞŸ|òIg¢1~øa7¶‚ÚÚV½ú¿üå/V§Nw %Fé¥ëýêW¿Êóz 6t÷¯/|€]×}ÿý÷Mev¬WSB–~¹  ¿>jlÝ[Ÿ>}Ü}òÉ'æ?êšúâAAùñãÇÛÆÂòò—YÓ¦Mݯ”Õ?oÞ<÷EÌŽ;Üu”×sÝÿý¡k~úé§.âD°U~IDAT ¯òCzn € p,9(¯7^zä e/ø¦Ÿbªi2ØHMYÊzЛ6e‚¨6½2?Ô4Q’_t.û@@@ d hÒSxU3wÓgŸ5®cñ쫌ëx7ý’xøðá®,λï¾k_|ñ…û"Añ¯½öšÝu×]î³Ð#<viä{÷îmçwžû¼¤qt®îQiù(à­ »²éÐWæ½Öýë_­U«V¦1Tçþ›o¾±[n¹Å]ïÎ;ït_ 0 êõT>–6hÐ ¿ï¾ûìÔSOuv=Ë믿î~éðì³ÏÚ‹/¾è²÷õ¼múŒ¨qo¼ñÆP&ü3Ï<ã¾Ìx衇œ‡æÐC† ±G}´ — ? € P,EÊë®õf)×>ëõ“üšÞ@ëçŠz“¥¦@¼~ÚIC@@Ò+ ’%±f<'ªo¼ô¯ºêªP0ù׿þµ ¬«¼Œ‚òÑš~E¬_ø¦D'M|lšPVAù•+WºÝª‰¯¦ÏT*m£ŒvÅci¹¯Ë9ÊðWišn‡ËùÏtÊÈðÁݯ¡õy°B… ± µÏÍ7ßìJ樓OÞºà‚ \@^ûôEÅ9çœãÊþèË‹¼’ÂÔ—† €$‹@\‚ò…yñ5Y“óʲ§!€ € €ª¡®ìçXZ¢úÆríXúèþ|óe`Xί©özî¦ìueÛ+¯€xî¦@µ²âÇç^¾½¾ Ð/ ¢µH׋Ö_Ç”d¥Ö¤I·ôÿ¨ô^ñhúE€jØûæ¯ùù矛^¹›jùÇëÚ¹Çf@ˆ§@±åõÙÙÙ.0¯266lˆçs1 € €ƒ òªæúõëÝ$§ÁGP&´2ÁUþ²Q£F. 
¯¾ÊúÖ˜ÉØTŠFeaªT©b ¾+^‰M*åâ[ÕªU]6½‚÷sæÌq“£¾÷Þ{.x®29…iÊv¶à¼_¾¼&­-L Ž¥ósoGSϨ¦Ì|ýRÀ·={öX¥J•Âêßûc,@@dˆž2qîX“)£cÅŠGáj\@@’Y mÛ¶îöT‹\Aø`Ó¤ž*Éâ篊g_?fðzɲþÃ?¸[¹úê«M¯N:¹:õÁûS0^_XèKßýîw6xð`7!nî’¢ `ç×4Á«šŸHWëú¥³jÖû¦’7ÊÀ×g¹àßI“¸jÒU_NÇ÷þB@ãkòÖ`@É’%¾kžKeÁëš*Ÿª pýK_¦hüü~çÀ@@޲@±fÊågår € € ä§œrŠ]tÑE¦¼ê“kb×ÚµkÛüùóÝd¦ èvïÞÝ=E<û¾ùæ›a2*âƒÓþ@›6m\–ºß>ZKeÆ«)à­Œp³UO>ئNjS¦Lq“ÃvìØÑýÒ`Ó¦MÖ²eËP7³`Á;v¬›ˆµqãÆ¡cÁy«lŒêß«´ÍñÇo}ôQ°‹[¿ä’KÜ„³ú¢äÌ3Ï´Õ«WÛĉ]÷””_>jž|òÉ¡‰^»víj2TM}eóë‹ÕÎׯ"&Ož|Äø¹w¨^¼&¿4i’ 6Ì222Üä²cÆŒqت4* @8Ê %î@@R$pÝu×¹r&™™™6jԨГ·nÝÚúôé–¯¾>ËÚOªàr¤æäÁcþ\¿/÷¶ß¯¥_ëêÜÖ¾HM¨öìÙÓ>øà{å•WÜyʘ=ztèük®¹Æ²²²Üd²_|ñ…Fñ;î¸#4äoû[S ^Ayeª÷îÝ;t,÷J¿~ýlÈ!®>½Žµoßþˆlô^½z¹úöÓ§OwYúê§kËå(P®€»2ê´×„µ_|±-Z´È}A°téR÷€ÊѨ_~ºgeáëÿƬY³tIkÖ¬™Ýyçn@@cA ÌáŸ7†ÿ&ôX¸ëïqÞ¼y1ö¤ € €¥[À— ÑRåA´ÔKuÅÕKu¿SSSCûVÖs"›æžÒ\TÊÖöÙ×y]/Q}óºÞÑÞ¯¿Ç–-[LõÜó²ÐßN}Tw?¯/ô7ÕùùÀõ|*Y£¬yÕ°ÖôA÷•×559m¹råÂî[÷¡zùº×Xî%÷õ7oÞì~ÍP¡B…܇ØF@’R@åõKD2å“òÏÃM!€ € €€êÖ­3D¢úÆ| î¨ÀuzzzÔ«((®ò3ÑZùòå£;–––¶×†>\Fk ìçnºüž'÷9Áí¢œ‡u@@àh ûD¯Gû¹ € € € € €Å%@P¾¸ä¹. € € € € €@© (_êþä<0 € € € € €@q ”/.y®‹ € € € € PêÊ—º?9Œ € € € € P\å‹Kžë"€ € € € €”:‚ò¥îOÎ#€ € € € €—Aùâ’çº € € € € €¥N€ |©û“óÀ € € € € €Å%@P¾¸ä¹. € € € € €@© (_êþä<0 € € € € €@q ”/.y®‹ € € € € PêÊ—º?9Œ € € € € P\å‹Kžë"€ € €@‘Ž € €I%@P>©þÜ  € € € € €@IH)Édz!€ € €À±'°uëV›5k–­_¿Þêׯo§Ÿ~ºÕ®];n²aÃ7þ®]»¬S§NGŒ«¬ù ØO?ýdÍ›7·6mÚXåÊ•]¿éÓ§›2Ö»tévÞÔ©S­jÕªÖ±cǰý~Cç,Y²ÄæÎëÆêÚµ«?¶Œví`G•ëÉÌÌ´råÊYFF†5jÔ(tX×Ð8;wíÛ´i“Íž=ÛíKKKsû÷íÛg*Ýóý÷ß[½zõ¬[·n6mÚ4·ÞªU«Ð¹¬ € €ÄW€ ||= @@Š °xñbW7}ÿþýV¾|yÓrôèÑöÀXÓ¦M‹0ò/§*¨>|øp·Q¶lYûä“Oì„NÓõ…Àý÷ßo»wïv×ÿôÓO]}àÀ¦`¶JÎhŒÓN;ͪW¯îÆQ 7ÞxÃÎ?ÿüÐ8¹oôÿø‡;OûuÝ1cÆXzzzX·ü®í;ÏŸ?ßÝ·ÆQý±cÇÚïÿ{»ä’K\ժ߸qcXP~éÒ¥6jÔ(g¨çPоÿþ¶yófw?g„ n»]»vFPÞk³D@â/@ùšø›2" € €R`Ê”).Ø­ øˆ#lÀ€.0¯@sQÛ¶mÛìµ×^sã:ÔÞ|óM»ãŽ;lÕªU¡¡Ÿyæ°~衇ÜõµT{È!®ïj Ìû¦ìrµ=zø]aKeý«¿2îuý×_ݺwïnÊ^¶ü®íû*K¾wïÞ6räH{ùå—­nݺ.à¾nÝ:ß%ߥ¾DP@þúë¯wã苊5j¸ ¾'Ó@@ Hå‹ÄÇÉ € € OÛn»ÍžþyW¶Fã6nÜØªU«fšŒ5–¦~*Ç|© ŽšÊÑ(#üÆo´š5kº}gžy¦Ëz׆ʹhUÞU¶FMËsÎ9Ç–-[æÊÖœtÒI.c~ÆŒî¸þùÏþãã 6 í ®¨DŒÚwÞi+V´””T÷%qt,–k«ŸZ“&MÜeÊ”q%sd¦¦ ocmòQÉ› .¸À4Žî¥oß¾±žN?@@"P¾¦xœŠ € €ñP6û[o½åÌ*!£ ºš_æw5e|¯\¹2¬ÛgœaúÓŸ\`]š5kvÜ—ÅQà]íóÏ?w¯°N‡7”‰®Àû¹çžkÿþ÷¿mûöí.˜® ÿ5×\“»{h{ùòåî‹…`^på¿0ˆåÚuêÔqc*(l~Û<i]õíU*çä“O;¬²6*DC@H¬AùÄú2: € €@à±ÇseU”Þ²eKÌ~î¹ç\Fx,è&p 6•eQSƽšŽûLy¿­¥&jU;õÔS]¶¼Û8üÏž={¬R¥JvÜqǹ]çwž Ê+[^_¨å5q«ŽU©RÅôµlþ\í‹õÚê›ûù²³³µ;4†Ösrr´µà¶¾P=|ÿ…€ï¤/T߆ € X‚ò‰õet@@ˆQ@AjÕYïСƒÝrË-î,•uQ+W®œ[æ÷Oýúõóìâ3â¿ùæWºE•¯‰S/¾øb—¹î'OÕ=ø¦Ì{eÉ똚úÊNWxÝŸ&EõAuNp©Ì|•–Qùœ-Z¸CšVAqŸ=¯¬ùü®­ w5¥:÷*…£6sæL·Ti5}ù kíܹ3t_K–,qÇü?Ê’W­û/¿üÒeþ+ÿÒK/ùÃ,@@(@P>¸  € €± (@­×‚ lÒ¤Iný£>йtM~WRÐZ“¢ªôŒ‚Ð „+(í'\U¹²àuíaÆYFF†mÜ¸ÑÆŒcéééÖ¹sçÐ%ÔOѪ]uÕU¡ý‘VTîfâĉ6xð`»ì²Ë\‰˜qãÆ…=W,×öYì;vì°ÇÜ.ºè"wü±Ë|×D²j üÏ™3Ç]O“Ï*ø?yòä°[»ùæ›íÇtÏ rA[Ùó”¯ cb@HˆAù„°2( € €F@“¾ð ®®¼ÎW`\k½"µ¼öGê« Vÿö·¿¹`õøñã]é/¼ðÂP÷Þ½{Û,33Óe’ë€2Ý5Ik°)@?räH—Áß±cÇà¡#ÖU þÏþ³½ñÆ6zôhw¼]»vV¯^=[µjU¨~×öϪ ¿êÔ¿öÚkîÜ ¸IZ}Ö½²þ-Zä¾ÜXºt©Ë¨×äµ Ìû1TRç™gž±¹sçºà¼îQ}r?gèæXA@ˆ›@™Ã?»üå7q2yš7o^òÜ w‚ € €@ øÒ(Zª¤‹–z)ƒZAj½T—<555´/+++TŽ%Þ¦‰HUžÅšã=¾Jåøìð¼ÆÞ¼y³+S¡B…¼ºx¿ê¶+=¿çŠåÚ*O£²>y¥çÓõjÕª ÆûÖ/41lÏž=C%nôùéù矷Ë/¿Ü®¼òJß•% € €@œTbP¥É”(à € € €@üÒÒÒâ7X„‘ÈÎ+˜í»«dM¼›JÄÄÒb¹v´:öº†‚ÿy£s§L™b³gÏ6•½Ù²e‹«C¯sÎ>ûìXn‘> € €…øe¦¢BžÌi € € €Çž€&²½çž{\ý… š2óµï©§žrû޽'âŽ@@cG€LùcçoÅ"€ € €qhÛ¶­éEC@8ºdÊ]o®† € € € € PŠÊ—â?>Ž € € € € ptÊ]o®† € € € € PŠÊ—â?>Ž € €@,eÊ”±²eÿÿÑAÛzÑ@@@ `î½uÁN¡7 € € € € €Vàÿé.…ó@@@ D ³âƒFv|Pƒu@@ '@P¾pnœ… € € € € €@ʘŒ@@(½dË—Þ¿=OŽ € ‚òñqd@@J„€ºûe¤‡Ò±hÇ#Ã>@@0÷>š <ÿ@@@ j]5æ} ž`<ÿY@@(š@JÑNçl@@(É Â:t(ìµÏêu,úÍ›7ÛâÅ‹-++Ëš5kfM›6µ””ÿ|Ù¸q£egg[£FÂöotëÖ­¦W:u,555tèàÁƒ¶jÕ*[³füñv 'D|]{Μ9V£F kÕª•U­Z54+ € €DÀ¿oÖûèÿ¿«-ÈôE@@/àƒíZ_ú ¡À¶‚ä»wï xEVÞ}÷]ûì³Ï†ª^½ºõïßßáuà©§ž2îo¸á;ÿüóÃúúgŸ}Ößo»í6ëÒ¥‹Û½lÙ28p »o߯J•*öÊ+¯øM;pà€ 0ÀV®\Ú§g¾ãŽ;¬S§N¡}¬ € € «€Þ7ëý³ÞWS¾&V5ú!€ € P ‚y¦õòúòåËÛŽ;âª2}útoݺµ=ùä“6lØ0»ùæ›mçÎ6hРеjÖ¬éÖÇÚ\Y»v­ 
È÷i]y}¡p×]wÙ‹/¾h}úô±}ûö…u{ë­·\@þ²Ë.³¡C‡Ú£>júRàå—_v÷Ö™ @@bÐûf½&(]@@( ¾[0ïƒòÊ0ׇ‹]»v»i}æÌ™îü»ï¾Û•–Qé˜nݺY¯^½Ü‡•” ¶M›6Ùòå˃»Üúĉا@½2”.¼ðBëСƒ ´gddX÷îÝMã¨mذÁ¦NjÚÕUW™‚ÿ*sÏ=÷ØYgeëׯ?b\v € € M@ï—õ¾YïŸ ÊG“â € €¥T@Ô܆@f¼ß.W®œù—>X¬[·.nyáÕ-Zä–þŸK.¹Äì‚ä~_zzº Ô7ÎïrKe¾O›6ÍÕŠ¨_¿¾ë¿téÒP|•ËQI›Úµk»®>À饗š‚øS¦L±yóæ™ÎUœæÍ›‡d@@¨ Èëý²Þ7û÷ÐÔ”JÆA@@J€‚î R›öùýú¡Œyõñ™ô~UeW«Vͽ*W®ìÎ Žëz=\@}È!.¨Þ¾}{kÓ¦MÄ`¸jr*{]xeÀëºj_}õ•+Q£@þðáÃC—Ö=ßtÓM6räH»÷Þ{­I“&. ¯É`õŒj«W¯vKã¿øâ ·®tn¿~ýì´ÓN íc@@Hz¿¬÷§ÊŽ×«jÕªn&ÿ~ºÌá7¢áïº#rŒîSF @@bðAy¿Ô¤§ZÏÉÉqKeøhŸê²kŸ_j]Y@û÷ïwÇýù±_™ž € €” %|(D5äƒòJôP`žLù’ñwæ)@@ˆ«€ÏŽ÷äÁm}ˆÔ”)¯`¼^ Ö«åÎçÞŽ4û@@8ü{e¯~[Áw­û—Ï÷ËÿK‘Só–xIEND®B`‚cccl-2.5.0/.devcontainer/img/cmaketools_sidebar.png000066400000000000000000001252431463375617100223210ustar00rootroot00000000000000‰PNG  IHDR˜PcúiCCPkCGColorSpaceGenericRGB8U]hU>›¹³+$΃Ԧ¦’þ5”´lRÑ„Úèþe³mÜ,“l´AÉìÝi&3ãü¤i)>AÁ¨à“àÿ[Á'!j«í‹-¢´P¢ƒ(øÐúG¡Ò ë¹3³»“¸k½ËÜùæœï~çÞsîÞ ¸,[–Þ%,®-åÓâ³ÇæÄÄ:tÁ}Ð }Ð-+Ž•*•&ã¿Úíï ÆÞ×ö·÷ÿgë®PGˆÝ…ج8Ê"âeþŲ]€AûÈ ×bø Ä;lœ âõWžð²Ï™‘2ˆ_E,(ªŒþÄÛˆç#öZsðÛŽ<5¨­)"ËEÉ6«šN#Ó½ƒû¶EÝkÄÛƒO³0}߸ö—*r–ᇟUäÜtˆ¯.i³Åÿe¹i ñ#]»¼…r ñ>ÄcU{¼èt©ª7ÑÀ+§Ô™g߃xuÁ<ÊÆîDüµ1_œ u~Rœ æàâ*-°z÷#°Mi*ˆËWh6Çòˆø¸æf}î-gi:×Ð9¥fŠA,î‹ãòV§>ÄW©ž—Bý_-·Æ%=†^œ tÈ0uüõúvW™â’9 Œ%/VµñBÈWµ'¤_¶tâÜÈMÛ“ÊŸ¿ŸåP““í\>ĘÉ@Á„yì0`D i|[`£§ èh¡è¥h¡øÕàìßÂ)ùþ·TjþÈëèÑ0B¦ÿ#ðЪÂïhU#¼ ~yh«uÐ fp#Ô1I/I’ƒø"“ä0!£ ’'ÉSdŒdÑ:J5Ç–"sdó¹ÑÔy#RŸ7‹¼‹èwAÆþgd˜à´ÏÅJŸ7ØÏØÏkÊ•×n^:}nW‹»FVŸ»Ösét$gj-tÈÚÔrÏÿÂ_ç×°_ç7Z þ~ëÛV·5ë4ÌV }ºo[ÄGó=Nd>¥-Ula³£¢Y5VúË}¹x»g[üä÷É?’kÉ÷’&ãÞä>áÎsŸrŸq߀È]à.r_r_qsŸGjÔyï4k± æi—QÜŸBZØ-<(d…=ÂÃÂdKO膄 a/zv7«]»ǰod«}¬€©sìn¬³Öá?TF–'|¦ãï3Nnã„#I?"…m»z„íõ¦v~K=Ú¯Æsñl<b|_|4>?Âpƒß‹¾QìñÔré²ËâŒi´µšêŠÃÉäãb ¯2* åÀ (ëºè»Ѧµ—hå°{28ÂoIþýÛy¥esŸ8ü';÷Z¶9à¬ÐûhË6€gã½ï¬>¦xöRx'Äbß8ÕƒÃÁWOÏ«ëõ[xn%ÞØ|½^ÿûýz}óÔ߸ ÿÿ%x ÅcÖËleXIfMM*>F(‡iN!G°ï;â ˜ ?„¹ pHYs  ùð-?@IDATxì ¼USÿÿ—Y”¹RB… (Š’4ˆÌã“È<$É” )ižŒ…~Q!eNšŸh (B™§ôôïýý[çÙ÷ÜsÎ=Ó½çžs?ß×ëœ=­½öÚŸ}Îúìï°Öw»¦M›n]»v­Ëuùè£rýuB@b…ÀöŪ5jŒB@ä "˜œy”º! „@ñB@S¼ž‡Z#„€ÈD09ó(u#B@â…€¦x=µF!3ˆ`ræQêF„€Å LñzjB gÁäÌ£Ô! Š"˜âõ<Ô! „@Î °c&î¤téÒ®lÙ²n÷Ýw·Ïßÿí~ùå÷믿ºo¿ýÖ±-B@ìF H f¯½ör‡~¸Ûÿýó¡V¡BÛ÷×_¹+V¸/¾øÂmÙ²%_9íB@ì@ ÈæàƒvµjÕ2T¾ÿþ{Çügh-›7ovÛo¿½+Uª”i5•*UrGu”;äCÜìÙ³Ýü‘Hª•B@!"!´–jÕª¹ÿüç?îÃ?tß}÷]žF°Á±üÑ­ZµÊÊzè¡îøãw³fÍÊ*’Y¿~½[´h‘i`t«_¿¾óÚ÷9þ|·iÓ&תU+6óÉÔ©S&Ć æ9öé§ŸW\qEžýðôéÓ݉'žèöØc;†‰‘zöÝw_wì±Ç:Ú´páÂ<ç±qØa‡¹êÕ«çÛ¯B@t Pè³Ï>ûaüþûïnæÌ™F$±þßÿþ×Ñ™þüóÏ®N:ÖÑΘ1ñ¿¸Ëœ9sܘ1c̯T¹re#:úË.»ÌÕ­[ך?mÚ4÷å—_:4:4µ ¬\¹Ò½öÚkF áóì³Ï¯Y³Æ´;¤ÚLeÊ”±c“&MÊG0o¾ù¦Û{ï½óÜêºuëŒ\vÜqG÷Þ{ïå!˜<·mŒ5ÊAT;wvp@è0Ï¢[·n¡m­!  B SÆ™¹‡·nü-‰Ê²eË,¢ dq–Å‹[Ûµkj&mþ׿þeZØ?üÚÉ|þùçî§Ÿ~ í#‚Žûõ:°mR)W®œ;묳ym¢EÙ½þúëFä×]w]L Ö­u! r ¬$·Þz«õ1Á;{ë­·\—.]¬?îÇÊÂþÕ«Ww§e½P{m:E„Î4!ŠŒs «O¦¾Â<‡ -#(haçwž™½ü~|øKÞ~ûm¿Ë½ûî»MVhA ,0’Ÿ¹àã HˆP›6mÜG~Ø´HÌdÁÏÆó•Ó! ²7ÞxÃ^Dßÿý<7òÊ+¯˜/–¾"(˜åñÑâŠH·*Á”/_Þá{ᓬàGÐ„Š³à3Ф}DkóÉ'Ÿl>)|&|xè'tR>MmÉ’%îÏ?ÿtÇwœ-äþÃáDÜùe4 3eðƒÿF"„@n!pÕUWÙK-/›A¹á†ܹçžëÎ8ãŒàn‡Åƒým۶ͳ?y_·ÓQã?u`Úi§ò˜’©Þ‡)w‚ÁOôñÇÇ}‹7v˜´ˆªÛe—]Üo¿ýfÃ[FPxAú÷ïoKoj£<U½sÌ1®Y³f®OŸ>æó!° (<{î¹'¸KëB@ä …q›5jÔ°Oø-G+^.™íBÓ`x+GRþò燛ž’¹ÙÂ<‡`´üMAÁ®ùÀä ͆Tˆ’#Ä›)?œD!× 68ƵnÝÚ>˜Û·ƒ‚FÉñ&òïÿÛ´£àq­ ! ŠB%H&ø–ÌÍáA輋³ AàƒyôÑGÍgB{Ñh†n$˘”piÞ¼¹EÇ}öÙgîÔSO ?ì{Fn¼ñFóÁºŒ 2"ä;’4iÒÄÞ^^xá ‡–ü‚¯ Ëh]!. ÍDFñŸì·ß~nçw6?B2öÎýd¢Ð’¹^²ç aŸ e:tèàºwïîyä×£G; MpàÀ¾ [6jÔÈ]xá…yöiC!.¶kÚ´éV"  C0íÔ¬YÓN2B?QÁo€é_f$æ)KVï-*¡_ýµ‘ä*B@”D ÍD˜t²¼3íË®»îš0¾DLq~ŒTÈ%á §x„X±bEÓÜR¬J§ ! ²B%Fîã‡à-žîh$ñ äRµjU#–Âo;TN! ’C þ?¹úm¢Eü'øRN8á„|þ„HÕ¢ñ0 £d˜MÚK¤ûÑ>! „@ID PüÊ¤Ž @„,öÜsOG”sk1·Ó£ø0dÊ21&Nq&eÄ´6wî\§H'‘! ²B%"ÈpòC„ÕâOaP3 óAåÏ”0Ìì»Ã;Ø>&ºdÂFH"„€Ù‰@¡ Z cCyóæÙd—˜Ê˜…)dÐVÐh\ˆCŽŽ£Ù0J]"„€Ù@¡ š sg¡‘0 ,½@&LmÏG"„€¹‹@Úü 8dÖ_H†Äa"’Üýñè΄€±H+Á†L82SÑ3Í~2ƒ+c5VÇ„€B {H+Á0GVÙ²eÝ·ß~ë–.]š=(¨¥B@!vÒF0 Œ$$™,ø]‚áÇioµ*B@b@Z†Üï Œdzy"ÆK–! „@ÉF e‚!ܘ¼&Œ¶g`$ÓÃH„€B@¤D0DŒù±.¤âÕÀHý „€BÀ#Ò8Laø[ïëâ+×R! J.) °mܸ±ä¢§;B@¨¤d"‹Z«! 
„@‰G@Sâ@!P8ˆ` WÕ*„€(ñˆ`JüO@!D€ˆØ „oi=6);ùcW¯£B@ìA2¹óÎ;Ýyç箽öZç·ûôéc7ñÌ3Ï书Ǣ\a ×^³}ûö¡¡!…uÍtÖ+ &hª.! ÒŽÀìÙ³]»víB:Ù%K–ä¹C&–/_žg_2Œëƒ\Æï† "?ÞkóX?þØÊ%s­hç@*| Ø’k† íK§pÍHuFÛϵwØ–¢¸û?þOÙ¬.S˜oY Œ/Š9ÌÒÎ íGq„ûóÏ?­Sg‘ºuë†Z¾víZ×±cGG6\²è’.$Y©_¿¾uðï¾û®]óÞ{ﵪ<ð@ç?[·nu‹-r<òˆ{úé§-Q"ç¥*tð=ô›6mšM½E}åÊ•smÛ¶u“&M²k’”ñŽ;îpŸ|ò‰•IÇu!47î«uëÖynƒ”+6l°ky䑆Až16d"‹N"‡HñüÒK/ÙdŸ¤,ØyçÝ¡‡êªU«æ˜ñ¡ ?J/ Pe‚Pþ8¤Œ ¤ÿÑGÙŸ‰óùÓ}ôÑ–4X.¸þúë¯Û°U«Vn÷Ýww³fÍŠ˜‡tÕÁ%NÌ~öÙg–m”ëÔªU˪޴i“›1cFð2yÖ)WµjÕ<ûØ 6êä>Á¢yóæn—]v±²¯¼òŠ¥ÊÎwâ¶*Tp'œp‚úá‡ÜôéÓ ·:¸m/D¶Ÿ\C±fìnÖ¬™•ã\° þiH%ÁŠ™¿O:é$+Ç×âÅ‹ ‡¯¿þÚ1¿y˜cO’YèÔÏ:ë,û <õÔSùþ+´®J•*Žctø—]v™{òÉ')Û“~ü¾ø_²¤Óõ‚Ê5o¼ñÆËÔ®ƒæ„…°Žøm´HÁd‡vCú¯aÙÁ$¾¨Ë›ÃO§n>hk˜ë¹–&Í$·éœx£@x‹òBgyß}÷ÙŸ‚2@¸”*UÊÝsÏ=î ƒ²CœÏ[ó»yÁLiôîÝÛíµ×^~wžå‹/¾hÛtΔ:ujÄœ<¼ùy‚Fª‡ûš4iâ.¿ür÷Í7ßDl³?a»í¶ËG0÷L l/ü€iO¯^½Œ$!˜hy‚áœ7ÞxÊr·Ýv›­C¾S¦L‰V…«]»¶½7¤$Hýàí †?ê„ Bõñ,Áᢋ.r¶$³ðÌî¾ûnwÅWD}ááyòÿ \ÿþý]ß¾}“jtÐã;ý‚:ñt $$ ¶!;ß¡³„Xü¶7Ÿ%u“a'ñ¿d† ãxÐ<m½Ä 9oH *bùòåj,¡3»é¦›ìÍŽ‘óÐl°!{Ak¹ä’K,oÇÐVÆŒco'ÌHÝ£G#Úuî¹çšV2vìXÇÛüý÷ßï쫊kIúê 9û@–-["—Æ»bÅ 7nÜ8û3‰)ýu×]gå™ÌM‡·{Tv„ãá‚Ö¹@®`}µŸYІڴi㮿þz#ðyõÕWfk®¹ÆªÚwß}CUΜ93´ŽÆÂdª` 1xmâ¡n4:„Îfݺu¡sc­Ð¾‰'ZþD`¡ƒ9u·lÙ²Àç«~K~“hËgŸ}¶ûàƒì÷©VþŸ¼œ]pÁö³KT¼Cßk.ÑÈíÆkÊþÞ7ã·]¢EµO&¾žà6dDùà>_®¸,K4ÁЙvîÜÙuïÞ=æCZ³f»å–[ÜÍ7ßzãöÉ‹ƒ9‹dÁ[6?Ä ”)SÆÔooS=z´e¥ mâ¼vÚÉuëÖ-d£#¿õÖ[Ýwß}g¤Eç¯Ð7lØ0_q¡î«®ºÊÖ1w‘Óçí·ß62dü¹ƒå÷ÙIa_^ƒÛ{løÓßu×]f®ÃvŽ4hÐÀ–˜ !ÌháubÊ‚T¹„Á[Öœ9sìx[8• å@0•·™þÂë°‹ðÅXÐn4žJ•*Ùþ-[¶„ÌœT¥Ã…€~ÞäyñB ó*^c}ÿý÷íå fáé°1yy²ˆÔ³© @HÉ ÷‡ð§¿j.¬C:Áˆ5_žs¸n4äx<ÞGÅsNÉ ?ö… šÉðÊ+¯4ïӨQ£¨XDºíF¨òàÏG§áÍZ‘ÎIfØòŒ~þùg{ûeßí'If bŒ™Ú}‡Okxnh¡Ù"AÓSÐ yð»çÞYzâà¼oM†c‘ˆ0žû÷ÄXÐù§”å“ ^³D @ BóöŒé ¿AdÞ !ŒAWÀðuÌ8C‡ud€„¿USæÒK/ ŠyèôÓO·m?+õž{î:îW êä­>Aóàã…Hþ€t¨HÈ|™T–N~ÌN\B䃙­ìðëzßA íT¬XÑ1Â4¼ß~ûÍ|LqUg!ž=õã{áÅ€dù¢¹J2‹ZAæé` Ñzý ^pa®ÇÛéFkæÙ`ÿC9O.ÔMÇ4Ÿ¯G¹dM íÅ“UAu@t¼~´óJ<Á ÑMtòA’Á~¹`óMômÞ;ßøSà ãÆO'O9L3”Á)`b#úkýúõ¶üòÊ$"øq¸G/˜:mÈ*üZ8é¹LEhÉ÷ÄbÃ!ÿÚk¯™éçþí·ß^`•˜ !“ÏÇ øïÇï¶ KP†PW$ØaQ†sZ¤7bŒ×Og•ê+íà³Ä§ÇKKx´e´‹ZËÿ¯(ÅkÉ^òð}‡ï¼½c=¨Éøú½éÌo'³$8Á×ÏùÞïãÛWÐ9ÿû×T2Çã¬ÃÏ‚—h"È…Ž9QrÁnÏC>|¸{ôÑGÍ¡6¦9¯µ­ä3ú·{:fÒO{!`ß BÞXp¾CŠÑ^^µœ‰TóßYûðbï~,„‡F ÎÔdäñÇ·H45Lyø30Ë!›7oŽ«Jo£0¦AÈÎKð˜ßi "Ü”HáY!#òèᇶœÉ'xœèØ$™Cm„—"ßùÆÓž±÷óÅS>Õ2tö¼ ÐY'#‰7Ò‰S‚FãÍc”ñ»¿eØÏ'Y Ö«ß&_>VYlG¿¢¥s'Ÿ|²u*„7KO$J¢‚¹)šÏ%¼.¢©<¹ð†þùç[§wüñÇ›QÑIâdæ ÁQàôæ ÕªUæ@÷æ8:âð·<ó½÷Þ ]³×B xçwÌì¡B\\²¡s=õÔSCç$²‚†Æ`0BEñŸ þÇîï‰T/N]O®¨ðD‡!hp:P7m¤ã‰%àÄ=á® ¡¢±y2g¢¡N>]»vµè44(ˆ‰`‹D5ÆXmÒ±äà%eäÈ‘¾î‰?ZM˜¶ù]?øàƒÑФ}?DÀoÌSI¤¦1œÏÇë^"ÕÅq‚„ Ÿ`YNa-¹f"" & ­&Mš¸É“''E.aU¸É[$ƒ@4^C!\—°aŽcª¢Ó§³¤,&;„Εq*üÙ0qq.»'b…þù‚ˆ¨#øáuÒI‹ ÿƒï¸ñIàìNF;‚߉bñä¡vÚiyª š©üŽÌ6Á^h×dâÕ®0Çñ'ôZ 䮄Iû7Fp€XÀsuCf\ |"µÑ·IË¢A€(GÂÉÑ2y‰&D@B,¼<YXÂo«GðãÉ%¨$r}êô&(žÿïDÚFkIöZ¾¾Â^n×´iÓ­k·9¤s]ðd£@Ì ­³Gû„К¢•‰ç¾ùó®^½ÚŠÓåô§NÚ‰Ž(¹LvÔ´ óapœM86,äB{!IñA€‚D%ÇÄôúPfüuD`2 · M'Öù0åHÚƒ?/¼óg?~œæÁH0_¾ %õAR8ÜÑüÃëB»Pï”÷„«]—ãÜ/‚Õ$šxí*‘k‰`¢¡©ýB@;ˆJÄ¿ùÖ[o™ùxÿý÷7Ƈáãë#4Óf*Á„×ï rñuù:؆D|t˜wÆûrA¢I¤Ã÷ç‡/ƒä©>ðH†8E0áHk[bVH…°~´NÂî™^(]Z²£÷fÔX€x?#ÚG*äâ¯ÉøNžŽM†v„k+K·ÿ%xmß®C€÷™èõD0E-…€ÿ @G뵇x@Á’hçO½ž`|ÙÂºŽ¯?ÝKLºU}B@!`(ŠL?! „€(D0…«*B@Œ~B@!P(ˆ` VU*„€"ý„€B PÁ ¬ªT!  •`MË´ôLÌ(B@’…@¡Î¦ ±T©RÅe}ñâÅ–Ó¡dA¬»B@”L Uƒñ’Oc¿ýösÌTÌDu©LBçëÔR! Š7EB0LÏ”d$Ÿ¹Î ÊåQ¼aSë„€B   ÕD¼8y8HŠU½zu›f’Y³f¥§e*õ’./ùZ" ©_cM1<‡IÇŽëÚµkgSÎ×É:I9I„€…@‘ ‡H˜@ޤN¤¥%o)uñÍø|ó…q“ÙP'¹JH×I å»îº+Ò¡|û6mÚd‰Ã˜Öœœ&Ñ„”²ä’ÁDCHû…€H"%ßXÒÖ¾ÿþûP­Z5K‰ËÛ{0“¡/[Ò–dÈ O+o æ’†•îWâ@FHHaûÙgŸ¹ 6¸Úµk[&E22B2dh,Ž‚æK+ðm&¨ÄdOLTˆ¶«Y³fÔÓÀ|ñK–,±Œ‹`WP›Ð-Zd ƒÊ—/oÁ‘.@^ 2¢M¢aBþ> mçí·ß¶ýç9wÜq–ó>R]Ú'„€ÈÁxèÉ?{öl+CÒ :LÒ̱â"FçÎ-§<9¢ äÒ¥Kó5uêÔ)Z±¤öC.˜ÑHmLÇÏ6þ–3Ï<Ó}öÙë¤Ý䛇ü9'N´sƒ'`ºìׯŸí¢eˆøó9éùå÷üóÏ»iÓ¦¹7Z]ÕᇬFëB@'ZB>m´:AHæ€p't’½©Ó™Ò‘fZ4J¶:ȉD2üñ‡»í¶Û&­n¸!©&ñùçŸç9—Ž|—]vqcÆŒ1r9ÿüóè6oÞì† â^}õUÓ6À.\Fm¸’~µM›66©wïÞpáËB䃶@‚ûï¿ßTðæ›oºçž{νôÒKîÜsÏõE-{ þ üB! 
„@,Š$L9V‚Ç0Ãà|ž7ož…4Ó‰5nÜØí½÷ÞÁb[gÐèÀ­C'iPh»'—^½z¹wLŽ»ßxã wÏ=÷äù`JD0‹‘ƒ¢ 5l©R¥\ÇŽíØÂ… mþ…) Óãé§Ÿnçì¾ûî¡s|ÙåË—›IïòË/7r¤î–-[º²eËÚ5}9–¿È%ˆˆÖ…€ˆ†@r½`´ÚÒ´MfÆŒ¡frm~›H Ó45%_5 oû·Ür‹C“ ’ &¬dÉ… #9ù‰¶CË#Ô;<-kéÒ¥hùÎÁœ~$å}+œ³jÕ*;uÀ€áUØùÁ ˜•! âA X ÷!͘œY¹res,ÓifZhË AƒÌ\F;Ñ,0‹uïÞ=%rá¾b9ù1“EòKApM¸ ‰`Ú ?‡6ÍŽh5ÈE]dÚŽ¯œ"Ø<Z !(…J0ëׯ·—-Q¡cÄÃ`@_Lq  þïЇ\‚Z/—ÎeÅŠMÛ `·Ýv³ª1A‡vXÄKAXh(Ás˜U!(ÞQ¾õêÕ Bc Qè€V„€q v‚ÁAÌ1#Óúé'·råÊ8š‘·NmÂdwÞygÇ€ABl©«¸ þ¸q㊬Yø^ µž={ºSO=Õ4“×^{ÍLd˜#I«V­Üc=fÑg-Z´0'=~ž -‘F´f@´2Ì”M¶Ídz! E mƒùæè£6ǰo¦Féã;ˆGpZ3§4oμQUÞ¢ãÑ~€Ù¡C÷òË/»‘#G¤Œ¹þúëC˜Å‚Ò Aó£@*þ¢Â&L˜*Ƶ»uëfä5yòäÞÖe—]fåÂë ¬! „@¶kÚ´éVèÉóc¡Y@Ì-†½Ÿ ¦1&³$ ³Ëœ9s,,â`*“p ó†¯e‡v°b¢®t sI~ýõWÓðÐòâfP`ðg,2ãÙPŽÈ½Xåâ½¦Ê !PrHZƒ@x;†`¹0Ý ¹÷•qD’Žq Ì‚Ñ`œÇàJ:>Ìj˜Ã¢MúXrSþ;äÔÏ_*ïžx½!•x'ÖÌ[»¶„€yHš` &hDóðSïCž\¸ Ú oÚLf>œM¥jÕª6Zœ²Ôõé§ŸÚx ¶%B@!Ý$M0 Ž`Æ|ø¹°êÔ©“ o¡ü¬Y³B!³ÎeBhÑZ¾ÿþû|çj‡B@d/ILø-Ši€!6}Ì.øY0…3>á>™`­ ! „@v"4Á ™9æ5”wß}7ªS !½„0C2˜ÍÐX¤µdçF­B@ă@Òaà´ODŽ?þxÇèpfä•! „@n#ôd— €dŒŠŸ†$VH«?†9ÌÏ{•ͰúÉ'³ùÔv! „@a#´Ã$Š|*o›2¥FfþòdÞh•!$fí•! „@É@ i ÆÃã÷„{MÅcI3¾š-[¶D<,«u! „€ÈR"˜}öÙǦ‡L2-<Ó½ ÓÌ£Ý òc*M9bpèK!ó$m"ƒP¥0 Tb.#Ÿ Ž|´È„Y|,X`DC¦J4žàHþœGX7(„€(¡$M0„ýõ×ní¶Ñû„3jÿÇ4¢aª|.ìÃ!N^ø¹sç:&kdľD! r¤ †Á‘¤7 “\ò‰$ Ä$²D! J)ù`JD%ó.Gí^xá…’yóºk! Ò‚€&-0æ^%ÌÇ\s! „@²ˆ`’ENç ! „@L’öÁĬU Ò‰Gz¢÷˜ëänA!àbáÂ…æ#\œ BîË}ùå—–¿‡H¿Ã?\aä-…€H "˜´ÀXt•mòÎ;ï´‰EwÚi'‹ÐcÌQÇŽ]½zõ¬!˜¶ú÷ïoë# ƒ‰Fï½÷^·ÿþûÛþñãLJÒ&û2‡v˜ëÚµ«£^‰B Ud"KÁ">¿_¿~F.·ß~»1b„0`€i&O=õ”%{#||àÀF(ƒvO?ý´ëܹ³ûå—_ܳÏ>k­%õ„ ,ÓèðáÃç¶iÓÆBÊ™["„€HÒ`ÒbÕi ³V“&MB3$”+WÎÝ}÷Ý6¾ˆñFÌ÷†ÆrÙe—…R“Í„°_‘N:…Ìfçwž ”eÞ8‰B ˆ`ÒbÕ±zõj»>• QÔgõ3=cî J0µõ`2 úd˜uáØc ž¢u! „@JÈD–|E{2i¦Ì`Ñ„)|fRˆ&”ñÚL´2Ú/„€HLªáùÌå†Cžiw¶nݺò¤I“\ß¾}³%T«VÍöÏž=;tœ²O<ñ„{æ™glÚ æ´¥K—†ÊüôÓOÿÎ;ï¼Ú§! „@*ÈD– z8·U«VnâĉæÜg²ÑuëÖ¹)S¦X˜ñŽ;îh3XcBƒt Ò%ÌŸ?ßB–Ï:ë,k1>Î2dˆkÛ¶­Ûu×]ÝÔ©S݆ ÜÙgŸ»Ò%…€ÈED0YöT/¸à‡³æÌ™¡‘öh-8ì|)„1:ÔRSãðGëa–ë3Ï<ÓÊ:á®»îrÆ sãÆ³}Ì~}õÕW;ï»QZƒE_B@¤€ÀvM›6Ýʌȹ.}ôQÚnGºïˆÓVi1&¿ I L_J4¬øxßM´rÚ/„€Hi0‰"VŒÊÇ"š ñTÍ…D!n"¿ú¦û*ªO! J"˜÷ÈuÃB@¢A@S48ë*B@‡€¦Ä=rݰB hÁ κŠB Ä! ‚)q\7,„€(D0Ià\ÆÀ$Ñl"„€(RD0E ·.&„€(9ˆ`JÎ³Ö ! ŠL‘­‹ ! J"˜’ó¬u§B@"E@S¤pëbB@’ƒ€¦äèzôèá*W®œH•–]ºt©•¡nHéÖ­›ûí·ßD.††¾„€H¶åoïþã?&{¾+S¦Œåt÷‰«Ò½$i¹åS•k¯½6¥*þúë/K) y}ôÑQëâ8ÚÆúõë]ƒ ¢– ?ðóÏ?»~ýúY'ÿÀ¸ƒ>Øí¶Ûn¶D³ÀŒõ믿ºãŽ;.t*×øàƒÜ¢E‹Ü;ìàöÙgK™L²TN:Õêøê«¯ì|²[pÀ!su¢­pmÈeÓ¦M®bÅŠnÅŠެÛ~¡kQföìÙv=ê&‘Ù´iÓ,féÒ¥ÝÚmYQ9Ž9ϧ[ÆÔ7eÊ3³QžvÐÞý÷ßßÍ™3ǽûî»fÜyç­½d1c†Û¼y³ÝËN;íº¾V„€È>RÒ`Ž:ê¨t°˜p|ø0䂟#YÁ9Ž`ŠD0ìÇ I¨2äBÇ.ø0ð]¤S<±lذ!OµD’ ©4¸©,û@jÔ¨áš5kfë|áèßu×]]ŒBµ"„@V |˜·W¸DcÀçY,r¡•˜Þ¨çý?ü§átæøup~#ä£!ª ÒñB„‘\˜Ó)D—AhDƒaÊBhßsÏ=—ç2{챇mVí%Ø>¿/|‰v∖óˆ óYª¸†_OÛB@Ò`Šë˜WBóèÔ©“ùwpvŸtÒIF$ x$B ÁÏ‚œx≎13 pmÛ¶5SéœvÚiV&_D°:3žv¢)A ÁÀ‹jÕªÙ%‰ƒì È· !âíä“O¶È²‡~ØsÌ1QmÕ&B ;Á£çF¤X‡lª¦kñ‚™è¦›n2? ûð£Üu×]6vìX+F´ÝÕW_íjÖ¬iÛ~,Šm¾èøƒZA´rSŒè 3¢Ð0•özAã 2 b=z´í>óÌ3-²Í_Ï/ý9~y饗Z¨2ãh¸‚ÉðÆoôE´B ØnÛX‡­˜V’Þ¤10.¢°¢Èè0‰zŠäsH¤Í„öf“ àðgÀe,¿ lVX‚&A žFeZáÊÁÒD}}÷Ýw6~%™€ÎÅ$Çø‰Ù@±Ö`è¬Jªàüöf§Xø™b•IõQgo¿ý¶=dB~!Bª}p‚¿„†•¬ø±1Éž¯ó„€(>k‚Á©ŒY…è’Ì!ÀXHƒé]˜Zs_]tQH£É\ëte! Š+'æÛ"ò‰7WÌ<˜{0a㓬ù®¸žíB+iÓ¦}²±ýj³™A £ƒc88q$QPŒ}Àç€cgò¬Y³,j)3ðèªB@!,#O.0^‚™|}x+³3ÍæøãÉ$ûtužB ƒdd %c(Ð\ 4ŸÜ §>£¿™ ‘éW˜ã m’A³‘! „@ö PäƒÓžiA È­%’ ½0bœY†!™`(l¤òE¹ÏÏ|\”×Ôµ„€Ù†@‘ 3ñ’óƒÑÚÑÈ…A|Gy¤aI΄}ÑêY} ! „@±B È ÆÏY… A;!¯‰'ÆVÔ®]ÛfþþûïÍÁÏ8 26RV"„€Ù@Zœü•·Í,\PZcœøäñSÌû4ÍœGb±ºuëÚ@>ÌbdYd;ëøe(K83ç’+^"„€Å”†ÉÑH'.Hý Á0µ ©°Mh2øHÏK fœûlãØgÚzò…0£/RÓÑXÅúB@´#Á=1Þ9¼yËÏÀË8?ý;šŠO>é@ 硽°žH¢¬´#¥ …€B !R"¦p!?{ߘ#¦íAøÉ„ G@H¹àµ'¶‰øc ”Q?ùä“`‚!æÑ8?צ¾à ]¶}$¢o?û|D#ëHpbÅä‰õÿ—p¡éü¶–B@DG@›bwdÙ²eÖ&ü.øFÌ8‰H… ¢g`+&$"Ãüür8çéd½™ íÄ;¼‰^ó‚V9q Ù{ï½M‹ÀƒƒC<عûóü2]×Fó@¸4ú¨Õ£/!  
F@S0FŦ BÚã¸qãl_:¾0kM™2Å 8еmÛÖLAo¼ñFž)zx³G;™6mš{øá‡-ü—h,mŒÊ¢AÈ9眳‰éº¶‘gλž={º-ZX´Ø„ L#!€@,X`÷Н>}zž6vèÐÁ-_¾ÜîaÔ¨Q~V#Y˜´!¢" ‚‰ Mñ;€á¼óÎs/½ô’{üñÇM[@›;v¬9¡#µBˆW¨ÿæ›o¶è)êDÇeü‡Ë>ÿŒObŒ‰^C!8 ( ‘nŒC‰šì˧ëÚþ~!,ü:DÃ!p€9è½&Ö²eK g,áÉh:˜ü _f;ÌŒ™hh#eÂïÓ߃–B@äE`»m¦–­ÌW•ëòÑG¥í1¥xSKÚ*M "Ì@ PÅwB¤WaþÞÔ}‡í„ñbn"ª-]’Îk£åApÑî‡ë1=‘'˜Æ €Ô½ßÑàÁƒÝ™gžéÎ>ûl_TK! " P8½S„ iWú # ü˜¾ÚÿM~€dAõF;ÒymO ÑîvœËD¬ŒîÇ´©ã·áœN8!Z•Ú/„À?hŒ~ B 1téÒÅÆï0;Úûz÷îmû¢œ¦ÝB@üƒ€4ý„@ j×®íøH„€Hi0‰c¦3„€B D0q€¤"B@!8"˜Ä1ÓB@!Û3;o¼ó3ÅQŸŠ! „€0¶×Ôã‰ÿ29&ñÖê ! „@fØ6u”¬d™^WB@ä6b—Ü~¾º;! „@ÆØÞÏ>›±èÂB@!“l>ÿRNÞ¥nJ! Ši0E¹.(„€(ÈS2ž³îR!PäÈDVäG¾ Y™ ž •¤Oà³qãÆÈ… ØK.ùîÝ»»>øÀJRWŸ>}Ü—_~óLrÌÌL"„€H;âä—&P¦V ½È3.L ñÅ[¢«ðcѶ¿ùæ<»hÑ"רQ#GÚcR“÷„”ÇÑ„ÄZ¤nݺu´"Ú/„€ˆE.qce@¬Ú×ôûï¿[z]‚%*Çwœ¥$f,Y§NêžyæKñ[±bŸª#—™³Î:+Tø–[n1óœ'…ºuë)ù:BÿY LÛ AóÃp>þäÜsÏu&Løç -„€©#°Ý6§ïVª!:ª0ÄL¯^½\½zõ\Ë–-C—)¬k†.Xaž¯\Ìbh3¥J•JúÖ~øáÇŒÑÈ)éŠu¢%B SöäÊh+Ì‘…¼óÎ;¶ä¸$yöÚk¯”È…+cf¹$ÿ t¦Ñ(T‚ñ—E{AüìÀÿ÷ÿgÛ¹Qf7£/! „€ȃÀö…š®½pU¦&A˜6^ZŒA¡/! „@Î"°Í:R¸JŒ×^@ÐÏ«Å,¿Œ!Aµ˜š5k:È “K™Ñ &} ! ²B Sޤ½€Œ'Bm ˆÃ@ ¡ ‡cûŠ2 À.ª/! „€H BU_‚Ú ­üþûï­±>OIP‹ñ¤é´mÛÖ>þÎØæ8ÚD! ²´ƒñDÁí‡sa:Ä'íb/¯ÅxÍÅ >"˱ÇkZ e9žŒG%]BöɃ>8]Õ©! „@N"PhSÅ„k/LO2gΑ‰½x-†mN$rñåµB@ì@ ­&²hÚK5Ü{ï½gÙ?ûì3·Ï>û„ÐA‹ñƒ ½–¦Â'¨¹@>Þ':Y+B@!PlH+Áø»ôšÛ7vÌ•Åtó÷Þ{¯ûúë¯]ùòå}Q[2Í=B–E­2ÁlæÍbǼӗB@di‹"‹¤½œqÆFwß}·{î¹ç Fó7jÔ(:h*ÈÂ… m‰VCÔN}Ö'H\¶S_B@!PlH»ãIàòË/uyàBÎy?] sh!ž˜0“uèÐÁ‘Æ×;ò!HÆ øe±ES B@d¿þú«ëܹ³Mx{óÍ7;¶Ó%iÉïI‚FA·Þz«}.¾øbÓ:0saîZ½zµ«V­šµ9°üy?çœsŒ\úöí*Kù Ã?]7­z„€BÀ™`a"aÿþýÝæÍ›Ûé"™´Ì¦ì‰íå°ÃsLíµ×ºåË—‡ž!šˆwÚ‡vn[ñ>Ž{2á¸×„È{âÍdÁó]÷Ýw=%jùL†)óC>|¸ûgl37Ö©SÇ}÷Ýwyˆ‘ ,Lõfb ©ÙŒ3ÁW]u•ͺ£x܇^|ñEËþyæ™gF<-—Dn^Î>ûìPh»ß§¥©!‰@& yðÁ-3/ýʾóK.¹$µ l;;eŒ'Z±0õûE]dÎü`ë >h8LvI-¦›ç|Ìbh.¦2:LD&1ƒ!ÏsÇ‘>ù€°N×›Ÿ~úi·fÍšáÇ÷ÐCuÔQ.<§L"u…—ýøãÝ®»îÚ½`ÁisÞì±ÇFl¼¤ð;ñc§B'iE¤øå—_Œ\HµîÉ…ÊH§¾ãŽ)ÓB¨]Ûû7áОVÈÄØ¾}û|ä¬B\Ð\H”…æâÉ…rdpDüÀKÛÐW>Ž9æÃúÈ# ƒPn¼ñFû\pÁ®zõê-dÀ€¡T ¡Â¬¬]»ÖJð²€öB‡_X²hÑ"7}úôPõŒ“âwÔºuëÐ>­! ú×prÁRq×]wÉD³.$Ú‚Sy3 j/Ì-6lØ03‘EkoÒtÞ,Ô\¦L™â|Çâϧ~i1‚—¼y@*KÔ\´FLj?ü°#ØÂ /«V­rhLÝS«V­ Œ}óçÏ·¢ÍüáN<ñDËÃh?ÌXúè£nàÀîÊ+¯tøEFŽiäÁóC ›2eʘ í¿I¿~ý̼Eˆy2Ò½{wóÉ}øá‡Ò“!~¼†B͑ȅM«ÒI.\+e•$h!Œ'-ÆD–"‚ÐKýxÙ²e­? 
í3–×( Ì—_|þùçyÌ[Á«ÿë_ÿ n:æ—ã‰ùK"„@ñDà§Ÿ~ ù´‹’\@#m&²x  úlD.ñ –ž2ž˜õØ á‹`pbJ#p€xyŸ Õ/ÃëÒ¶™ErÁq¯=¹°Í…—Ì”4˜xœ[ŒsD.ñ"—žrŸ~ú©U„©ËuqvÙe—….€?ò‰•Øh´~øÁü1˜ÝÐ^ìw2V'(D«H„€(züqóäã#Aƒæ²Â$î¶H4Ì`±:®¢‡½ä\ñí·ß¶ÐïŠ+:?fså·ß~k‘f ÂŽh¯hòŸÿüÇsâÃa†Ä ÎA/Œoñ!Î~¾šx„úˆÃ—!:K—.uØ0¹0>V4¬ h4HP£Iýj‘k( &ò¥µ7Ý`¶zë­·¬ZÒSC7n´ÐßN:….wê©§º'žxÂÆÇœ|òÉ?iÒ$‡ ,Ê:aÛ äyð£%™mBÓÃÍc˜á:jÔ( aö!ÍÁºÂ×=ô7QÍ@IDATP‡“̘1®aÆ¡PëðrÚB >°F<õÔSö_ºï¾ûÜ=÷ÜcÄÂÿ•c½{÷i4ñÕ˜\©´´L® :+`úâÇCj>D•*UÊrÊ)îöÛoÏ3rž1/  üí·ß¬SþùçÍt†OƫѾMD—xaÀ%áÉøa x;òf7Ê]zé¥n¿ýösLñCÐÚ ce‚õËsNýúõ­"טFF"„@jL›6Í, æ@(˜´×­[—Ï“ÚU >{»mÒVþð¹nÂÊ•¹È0UuéÒÅüXÍ›7/ø P‚ež¸%Öi˜ÓÐ`¼É-RYÚÉ̉8)¹Í›Ü«}ܬY³²¢Å6lpëׯØÖXÇ"žä¹!ÅA,ÁLÍ3ùòË/cKøØèÑ£ ¼v•fÑ 1õëׯPëLƒ?uêTKÈWÜ aâX’}üñÇiZ‰“t‹éEÉÁd’¤P.ά°#FŒp³g϶Ü6Ç57Ö±P¡V^yå·×^{Ù¬²áÇRÙ¦]kÖ¬ eøŒVI×;uÁD+b3GóL˜èóÀŒZ.Ñ‹-*ðÚ‰Ö™MåI×MçZ©R¥´7û?þpwß}·%õ£rÈüÈ#twÜqGžIi¿p’Žƒ$ä\bæõ¢i0E¸®¼Ñßzë­F.ái”c‹TùôéÓÝŒ3"Ji Ò¦L™’R:9{ c$cÛµkç†êš5kf/n'NÌÞ›JsË¥Á¤Т¨Ž©úy+ûôÓO-©PíÚµ]Ù²eíҼϜ9ÓÞXü>°-¼É“ oâ$!:æ˜cÜA<œo“Ê+Ü’%Kì͘D`ÁduÌZ½Ï>û¸Š+º÷Þ{Ïñ–W·n]ËYã§%úꫯoÖMš4±?$÷Áui›ÏOC’$Þ>Iõ:nÜ8ËŒéë˜/Ã’kC,,É-3yòdGzæªU«Z1p$¯÷søá‡»š5kš¦¬cõêÕÖVÊr.!ä«ñæ,êÝ{ï½]ƒ ‚§æ[çm’öðŒêÔ©cél=&ù oÛÄšü<çHÚ íøè£Üï¿ÿn÷A’©hõRçÛo¿mÏ›!õB>±†ëuîÜÙòøP7š„vÛm·±™OÀ­GŽŽÖc ¶d$$3¡Úç3úr$ ëÚµ«Ý—/Ç’:ñG`Š»þú냇b®cbéß¿¿•á¿'rŽì¹çž¶¿ v¼õÖ[–ž›Â¾ûëÕ«—ý^üo‚—:PÊ€eÈjM ª7Òy<òˆ‘ǸÎk¯½f× ã%~·Ã<‚Âo,Ðl‚ÿ?_mš—þ‹$ äwÉ}¶lÙÒ–þ?MÚs2×B²/B °uÿ{奣wïÞ¡ß+þHž;Ï™2`S¦AkDœ_2‘Å Tq)FA§xûí·›‚o’¨ë¼™%"ü‘Ét7räH÷Øc¹ý÷ßß:~Þ&#ÉsÏ=gäB#’=þøãÖáÒÙòãöŸ«É6í„6=ùä“®V­Z¦AñÆ®Ã?Ë]wÝeŒZÎ CöçÆ:æË°,W®œ]Ÿ?3oˆ´…¶#t˜h%hH\Ÿ%ÛC† ±ã`ùꫯÚyø~øp.÷ÉÛ+¸{²¦^H?–ðvLg&\¯aÆö¶K‡I €\Î?ÿ|{>\ÿˆ#ް6Ñá#t: vŽS7oΟ}ö™ ÏÄýÐfÈ¥S§N1É0¼=tÀh¸tÌ|èÄ^P‚ÚÁµÑšÐi#¿7î‹—¡ðße¹0¥ó¦ f$I¤^þ|`×D¥-üŽéÜùÍ&"hMÿ™ øíhA)”…xáwÀ3ñdÚ¡CÛÇñ¿ ´’Aƒ)aŠã<^L (ÿ¡<ëÔÍó¡ õ|ýõ×ʘˆ`2}⦄輽ÎN”7jˆ‚·¡D„N÷”SN1•œ7Ó«¯¾ÚN§Š$¨ç\Ž–?oP7ß|³»îºëÜŽ;þO†0°·´+„Ž((çw^Èäƒ:O‡Bç ¦ @ÍœNaÙ¸qcëÔé¸LthŒk·ùƒîûÑG5 lG_àEžtÒBóv‰6Ç>0$˜ é¬ L&`êqô;> Âà8xƒé7ޘǼÂýÐù`ž¡cÂ4•ˆÐiyg9ë8ð¶ŽÔÚO„$Îý³ öˆ×lcÛ×9çœc÷FhÆÈ¼yólþ•H½þ\pEøÍÒžÁ…^˜ðäµh¤ˆ×àlcÛ—Ÿ÷ûï¿÷»".yNÿ'´ þ'M¶ý¯Ì h?üRƒûå—_n¦Xö¡ýb?¢BùÏPž:x>õ@ä™”ÿõ ™l…®¼Ñ"áÑ_ü"Ùæ ª4¼¿í¯<ŸNê›o¾ ù ü1ˆ‰·ñ æ˜?¿þ<üñˆ¸ f¥ øëÓ¡û?IðxºÖýýaêá.hVàù¯ýËÌ:Ý»w·R?ãŒ3òá~~¤m:Œ Ó±A þM8xN,¬!#÷)”}^ÀýØcõ›¶äþ:ÉàzñÅÛÛ0Zî°aÃŒ¼N=õÔO;hó³Ï>k"/JÑ$è3ð&&^¢I¼õúóÑ,ø-ŸÇøÍzŒ|ÙXKˆÁLüï¡q!^“±°/È“Ú|†ì‡üü‹&i$Ü'Ã6¾UÚ½yóf+ãÿC¶±í+|Ûï/ª¥¦¨NÃuüÛ*sA®Íð.¨ØAÁxG{ðD_£ ‰¤Ð©„×ë¯çëóí /ç§kéë‡0ü›4uó'%œØwlhw¼%â4EkàM¾‡‚‚!ÂÛ öAáâ±öxq¤£ŒfÎ žÃ:&!ÌQ˜Ÿn¸á†ðÃ1·éÈ8m‹dCûÀ¯ v`¼ï¾ûLãÅ!^y›ƒŸÕûub^<ÆÁdê…ÀèÑ y/‰š’<©`2õëÔåMʼnþ>|;"-ýï¼ &ñâûŽ{ ÿ…ÿÇü¹EµüÂEuE]'ixËâO1wîÜÐÛ •á|Æ9ªì;¬ Ýš¦WéƒÇ|›ô¦œÄ^‚o|°ù`øc{Á€­×¿U³Ÿ?«w€³MÄ× ›š3g‡CBGޤúç ¶ÙWîßðئCG:ÌEþÃu9—cü1!–˜Î0Oy•7³øºÃ¯N ]bà™„¿•ú:‰ÂãÍ5ØA€5möχ%×ò>Î%B ?Qn^¨‹¶sŸüvpþ&"˜6‰²"òí²Ë.s=ôþá‡Ú² v,[¶ÌÊa®áƒ†®AÄÛž ¶ÉÔë£Òžxâ Ã-à–HÿXmò¿eü^øÿá7<} õ³?ñf\žêå÷ ±ðBäI.ܧþãü †l‡ÿ~÷)“¬HƒI¹ G„?jœƒ8 y# …!ÚýöÛÏ4 È‚·:"R" ÚQa-Z´0û-cÞF±ï"ü qŠâ€ÇGÒ¶m[».çàåm 0^ئÌé§Ÿn?^ÊÐi7ùÇÆìÛ›0êæsšÎA^óF»¤ ÃàÅ_Ë/}ûü¶_úýþ¼d—Ûmcô­°V® ±çéÞ´ý8]u&SNøe¢ýPûùñC‹%¼QB&‘ÊÑ G2gD;ÇwšL—¢åxso2…/ƒN“ˆ:©h÷áÏ‹w­Í˜¸ÿ'òõá˜ECÁ±I‚¶îHǹOðó8ÑD»ˆžN “Hõú}\Ÿ¶EkåhŸxÃ׋VWP§ÊKÄÐvš—Šh«˜uxÖüf=VÑꉶ?¶‰Ô‹é˜·z4eozBëàs%oï˜ý3Ö´Uÿr.àÈ Jøs§mHøo.Z½í§>Ì ±~?`DßÀ€çH× Ç0üÿ¾]P›bOYƒÁ$ƒ:éFb]¸ c £šK"#àßd#u1„Ás¼1¸Ï¯GëbãÏ…ÜøÄ~7±:¬XçF;­ÍA-+xnA×/è^Ãï1¼“ ^« ¢–õë]Ÿrñ`íëóKÞ¼ƒv}¿?¸äEÊ m/«XíHÇ³Ž„m"õ–/_Þf˜ÀIp.Ú ®‘¦ô3ñð뼬x‚OVþ¸_R:…úb‘ ×£XÏ)ÃðÿKøv*íO™`Pýy{ƒ1#E*%Ó8ÿƒáŠ`’A0sçðÇ[o,áÍ ŸMxÇëK?¼µó)I‚†‡/ŒiÌüÈB¢ø61#øK$éA e‚ñÍ@Û‘óÇ]ò¦‰ãY’}0AeAB4‰È„*Çó;ÍDÛríšÑ=k¹v§º! „€(RD0E ·.&„€(9ˆ`JÎ³Ö ! ŠL‘­‹ ! 
J"˜$žuq“D³uŠB H(¶Ã`Ÿt…=)¢º˜B@i SN'ž bbŽ%‰B@d/EF0LÅ>‚4Ø ›àŒ²ñœ£2B@!yŠ„`µÍl£É#m™ù”™$B@!= Áøyq˜ˆW˜J„Q·Ì½#‚‰5•B@Š„`ü­’ “W¼ÂdŽÁªñž§rB@!yÒF0Ì&Jä³’’‰/"É< ™oÙÁ/šì¾ûî¡T¾ÑÊh¿B 8!2ÁKƒœ$SòB²)’ùHâG€Ì…±& e†bZ¥*$ÝÂäHf?‰B 0H™`HÛI¶@¦Ø‡XŽ?þx›¾ŸF5†F“+Bö:Ÿû:Ö=õ†Gb¥x…Ù]}ÊVșԷd«#=*‚“!Ù>1L:ÐTB@ÄB e‚¡rÿæ$r~àCÁ\– atîÜÙòp·oß>ê-A.¤d%mp0qÔþ9LXDÖ?„Ü*GqÄ?%þÿ‚,•dç$‡<ù+H— ÿæYÌ›7ÏòuC$$ƒ+[¶¬=£ÛRs|Ó¦MnòäÉVÕªUóÔ¯ ! „@ºH Á„7†7÷ ¸•+W†ÊÚmrÔ`¢‚¬·nÝÚ‰` N} !PÊT1»í¶›uxá¡Å ¶ŒôÁ´æ…ŽÓ—I—YÈ×ê’œÝt¯¾úªå”ÖG.rO.½zõJ:çx°Îà:yÃlæ-2î9Ò]|ñŦ!’â!.¤ÒµkW÷ÐCYf>RZ“‚yòÉ'ü9ä[?묳‚—кB ­¤Uƒaî0:ZÞöçÏŸo¦ZëçÃ?M/^lfd†§ÜõçG;·(÷C2tô·Ür‹]M&H.=zôH;¹p¡åË—›æqùå—I°¯eË–æ«Y²d‰;÷ÜsCQhl“nšô°  ¨D!P䤕`0½àD.Uª”ãÛ #‚'’”/_Þü h/ønðå¬[·ÎŠþù矡õHçfb÷ AƒÌ\Fh1:y÷îÝ …\¸GïËŠDY Í›77må7Þp|CÔ¨Q#óý4VX_B@BF ­C[!‡ ¹°âÁMV¯^í6lèjÔ¨aE :íâ,Ì!CB}È¥0;qo.¼è¢‹ÌÜå±!ò rCJ—.m~¢5kÖ¸ ¸Ù³g»ñãÇ;ðM$àÀ×­¥B ÒN0É4Bb¾1Hí'[¤bÅŠæD/Šöú(3¦Ú©W¯^è’·ÞƒX~ýõWW³fM‡Ÿ³~¡pr÷áСJ´"„€(ŠÁp_„÷B2„Þ~óÍ7…p«Ù]%!ËL:vìX÷Ë/¿8|Ahy„7iÒÄpcýwÞq'Ÿ|²«_¿¾Û°aƒQ"dÚ šþ.\Ö­[74ÎÆ×R!.Š ÁpC„ùñTÒśڂ>+öuëÖÍÌrŒañZË 'œà.»ì2ƒì /´IAß~ûmÇaœÌõ×_oë|~úéF: Du饗†ŽiE!N¶Û6¢{ëÚµkÓYg±¬‹Á‰¹" !à dõd¼7Ž3X“€‰HÇ)‹_lÇwŒx¬OëB@d(VL27PÏ4| „H÷ÏqÆ¿ÄBÉ%B@ÂD PZfƒU·B@d"˜ìxNj¥B ëÁdÝ#Sƒ…€Ù€&;ž“Z)„€È:D0Y÷ÈÔ`! „@v ‚Iâ9}öÙgIœ¥S„€% LÉzÞº[! „@‘! ‚)2¨u!! „@ÉB@S²ž·îV!Pdˆ`Š j]H!P²Á”¬ç­»B@"˜"ƒZB@”,4Ùe–=ofAùä[Š`<2Z ìA@“=Ï*OK?þxצM›<û 4 „¬ K—.uäÁéÑ£‡}*T¨§|¬•+WÚáË/¿Ü5jÔ(VÑ”-X°ÀÍ›7/D0´óÚk¯u+V¬0‚Iùª@Œ LF`/œ‹’@¬fÍšö9å”S\§NŒpþóŸÿ¸>}úä¹èÖ­[ݲeËÜ /¼à¦OŸn Ì|H‰ôÕȪU«Ü”)SB&9´Š©S§ºQ£FÙrÓ¦Mþ4[N›6Í-_¾<Ï>®Ã9‘„Ï_~ù¥"Sçœ9s"Ó>! ²i0YøÐirݺuMùàƒÜÏ?ÿìöØc¹ôìÙÓLj$'#&>Ûo¿ÝÕ¨QÃQváÂ…v3Ù‰'žhDQáb˱cÇZ*ç*UªXù1cƸãŽ;ÎR5ûv¾ÿþûŽOóæÍý®ÐrâĉΓÔË/¿ì=ôP× AƒÐq­!½HƒÉÞgwË«U«fe19!Ï?ÿ¼‘K»víÜÈ‘#Ý#gíÚµ¡ã±V¨˜! rLnë‹i)„@–#°]Ó¦M·ÆkÂÈæ{eð`ºþŒLZÊ5×\ãÎ?ÿü|#ù“i$ƒ„ûQbÕ…9 ÓZL4¡˜¼öÜsÏhEòíg@(æ6H !¬úp]ºtqµk×ÎW^;„€(ÞÈDV¼ŸOÔÖáWùí·ßL;ðã\¢Žq bñÕ„k9~péMjÁ}­{Ð?üàÞzë-›À³ st\â‹ÀÿìÅ·jYLM˜¦6oÞlãYðµäšÍX|7Ü+ãt$B@d2‘%ñÌ2i"K¢¹:E!¤Ádv]T!ûˆ`rÿë…€A@“ØuQ! „@î# ‚Éýg¬;B@dL°gj LMÕ)B@Œ! ‚Éôº°B ·ÁäöóÕÝ ! 2†€&cÐëÂB@ÜF@“ÛÏWw'„€È"˜ŒA¯ ! rLn?_ÝB ch6åŒAŸ÷ÂLm¿téR·ë®»†¦ÁgÖâxf.Î[SÁ[ï¾û®%ø"ÿ‹D!PXˆ` Ùëýâ‹/ÜàÁƒóEÞûk¯½6­YW­Zå–,Y’ïZÚ!„€H'2‘%€æ—_~WéßÿÝ1å|2rÌ1Ǹ;î¸ÃÝzë­n[28KÚÕ¿·~ýúdªÓ9B@Œ! &Nèÿúë/×¹sgwÚi§¹öíÛG= r!cõêÕ]§N¢–‹vàÈ#tGu”®U«–«Zµª¥'ž3gŽ;÷ÜsmÿºuëÜÂ… Ä*W®ìHž]2$‹'í9üðÃ-1Yxªâ`æÏŸï6mÚäN<ñD·ûî»;²V²œ,*TpuêÔqûí·_ð;¶`ÁK|vì±ÇºvÚÉ4£SN9ÅÖ)¼uëV·|ùrËïÂùuëÖu{ï½wžz´!„@n" ‚‰ó¹ÒyöéÓÇȃS"‘ i‚ñk ë†nˆ³æØÅèLh™ÇŒcë$ûïÿëöÝw_׫W/·Ûn»ÙþñãÇ» &ä)Ãô6]»v uüvðŸ/ÊrN³fÍŒ\V®\i÷ ©rß,ÇŽëºuëæªT©bg½óÎ;näÈ‘¶N™‰'ºƒ>ØÚY¿~}W¶lY#—ž={:òçø¶>óÌ3îöÛow5jÔ6AëB@ä 2‘%ðPé\è^}õUGGòÉ{r¡³÷yåƒe’YGsA ´·ß~Û4’§žzÊ:øóÏ?ß}÷Ýw¦mPnñâÅF.hBÇw”kÓ¦uò8÷Ãå7Þ0riÙ²eˆ4!H²oß¾nĈîþûï7’™4i’Îõ ˆíᇶ2hkžý5žþy»n»ví¬ü#ÿüsÛÉ ¡Ã÷f³óÎ;ÏaJ 7qAœÛ¶m[wÎ9çØy|]}õÕ¡uV*Uªä6ð~ O?ýÔŽwèÐÁí¹çž¶Ž™î¸ãŽssçεm¾H{|àºæÍ›Û>Hë’K.qÍêÕ«ÍŒ*¬! rL”ÎzРAf.ûûï¿Íï@çÙ½{÷”È…¦`žâã…œôtøåÊ•³]kÖ¬qÏ>û¬ Ä.tÜ'ŽCLøH³‚ß%((Œ5Ê}òÉ'³f8Ä/½¦R±bÅàif"󃶅ÿ¹ôÒKó”cC“í9‡€&ÉG É 2$äЇ\ð3¤*gžy¦ÃIŽàS šÚ~þùgwß}÷™Ÿ­€6@$D™yè¾úê+¿sÙ°aC7{öl÷àƒš9lçw¶òha˜Á7nlZ„¡–.]ÚŽ£• Çéx ‡c> mŒ0k/[¶lq|^!Û¤Þ#æ6>1ïŽ7xLLtÈé .†É ’à$Ž-[¶Œ…Ãï­$¼ ¾Ìk Úô‚FÒ¯_?‡o%(×]w»üòËLzè!;„ƲqãF‹»âŠ+Ü 'œ2eí°ÃVæˆ#ްåÓO?m\² Ê¡‡jíá ˆ%ˆÖ…@n# &‹ž/ ‚ï„ÿ¿þú«œíüç«I“&nÊ”)¦]á[¡ÜÔ©SÍ\uöÙg‹Ú:cm0É}ðÁî•W^qgu–iEø~¦M›f믿þzÈ<ÆI˜ë.ºè"‡ÿšk®1Ÿæ:4›àX!‚ †jA-Z´°:¸šÚŸDÜF@SLžo<;û—^zÉ=þøã¦5¡ÉàKñ&)ƘÜu×]nذa!òÁL…Çgâôeý­_yå•níÚµF0h7Ýt“C£Áƒ0ø“s‚çµjÕÊ4‚ kƒ6E8244$H…h4„öm4­ùòZ ![l·í v+K® Y¹"8пÿþ{3§…›È‚÷ˆVÁs[2Â`KÈ)0@=^sb'~/wÞy§ûúë¯-$9HFþ–ÞãÏÑRÜE@L>[:oÆ $Ÿd%š–I0:ÆŒfZÛgŸ},âì‡~p˜èÂÉ…ë‹X’} :Od/Ò`²÷Ùe´åàƒaL MùòåÍL†éL"„€i0ú$…f3ZJ„€ÑØ>ÚíB@! "˜TÐÓ¹B@!LTht@! RA@“zŒ÷! „@lD0±ñÑQ! „€HL’Àé4! 
„€ˆ€&6>:*„€I" ‚I8&„€±ÁÄÆGG…€B ID0I§Ó„€B 6š*&6>Åî(ÉÄÊÏÒ¬Y3›NÿÛo¿µió}ƒwß}wW¶lY¿™Ô’yÆÞ|óMGÂ2¦ðgªýtÈèÑ£Ù3/¸à‚ˆÕ‘jù™gž #÷L¥J•BÛZB ;ÁdÇs µòï¿ÿvü±;è ƒÜÁš-™ü/áãs˜I™0_|±#ûf"–½{÷v;í´“«U«VÄ’©/XvÑ¢E–Í ™0׬Ycíôå˜É™tŸ|ò‰-E0-…@ö ‚Éžg•§¥Ç¼#cdP 4 ä›o¾±4ÆäÁ!¥3Ÿ *‹Ç\'Ë%B°FÅ,›êA’–Í›7/D0´óÚk¯u+V¬0‚Iµ~/„@f&3¸ÊUI>V³fMûœrÊ)®S§NF8$ëÓ§Ožk’´lÙ²eî…^pÓ§Owärñ)Íš5Ë6W­Ze)˜ÿûßÿÚ6Z)˜ÉvÉrÓ¦Mþ4[’f™\1Aá:”$ï¼óN(Íòäɓݜ9s"Ó>! ²i0YøÐirݺuMùàƒÜÏ?ÿlÙ-!—ž={šITÍ>Ûo¿ÝÕ¨QÃ’ˆ-\¸Ð.`&;ñÄ *ü@ìcIºænݺ¹*UªXù1cƸãŽ;ÎU«V-ÔÌ÷ßßñiÞ¼yhŸ_™8qbˆ¤^~ùew衇º øÃZ !ÅHƒÉâ‡oÓ}gÉ !QþšvíÚYzãGyÄø pþù§ëر£»òÊ+­,äñä“OºÝvÛÍA6¤_îÛ·¯1b„»ÿþûd&Mšde“ùâšõë×·S¹i—%B@ä"˜ÜxŽ1ïÂû^ˆÎBp²xà¦QÞÒ¸ä’KL“Y½zuÔº®¾új‹`óõáx/S¦Œ[¿~}Ôst@’‹€Ld%àÙñÅv—•+Wv˜Ç<Ñ\zé¥ùî‚©^½z¾ýì \ß ‘]¤Lö~¿Œx’v !PbÁ”€GOX3BÈ2 BÄ‘Z^¶lÙâøT­ZÕïÊ·$í»ï¾s76B{4h+]ºtž²„R%|;xLëB@ä."˜Ü}¶vgDf}øá‡6P±T©R¶Gú×_m}ˆ!:Œ•ÑeãÆ®^½zîŠ+®°bøkvØÁ–|qðñ8k×® µB} À”!ˆ`rã9Ú]@&L°uÈÂ`<̾ûîëºvíºSÆÏ :Ôœô-Z´0S×+¯¼bQfC† • ®@|/^ìEfýõ×_™É|ÙC9Ä‚>ÜÕ©SÇÆ·x“œ/¾$q08ù‰V#ÔZ"„@ö# 'ö?C»L_øBÆoŸ÷Þ{Ï"¿Z·nmDB˜´P2ZŸh°‘#Gšéì¶ÛnsŒ Š7©±AœŒµÁ0›À2»Q†@€råʹ™3g:¢ÓÐ^Ž>úhG8´—`ìkذ¡Õ3wî\÷ì³ÏúbZ !ål×´iÓ­ñš0²ù^<˜.Á„?#‚–rÍ5׸óÏ??ßHþdÚÉ á~”XuaNô†Mh'&¯=÷Ü3Z‘|ûй C«~à\—.]\íÚµó•×! Š72‘ïçµuøU~ûí7Óü8—¨…cH„X|5áZŽß\z“Zp_AëÞĬo½õ–cO‰Ù‹ÀÿìÙ{%ªå˜š0MmÞ¼ÙÆ³àkÉ5!š±:øn¸WÆéH„€È>d"Kâ™eÒD–DsuŠB #HƒÉ캨B ÷Áäþ3Ö ! 2‚€&#°ë¢B@ÜG@“Ä3ÎTˆrMÕ)B@Œ! ‚Éôº°B ·ÁäöóÕÝ ! 2†€&cÐëÂB@ÜF@“ÛÏWw'„€È"˜ŒA¯ ! rLn?_ÝB cˆ`2}Þ ¯[·Îr·äÝël>®/¿ü2|·Û´i“ãœT„éýûõëçfÍš³òÊ'æþûïwLDïy1+ÕA! rͦ\LñC=d ¿È³âeëÖ­îî»ïvýõ—å_ Nß·o_Ëçòøãûâ /I‘L:åJ•*E=—éü{÷îívÚi'W«V-ËýÏyQ+Ô! J Ò`ŠÉ£&ß 9T‚SÔ3©&ä‚,Y²$ÔRò¦©òÈ# í+¬••+WZÕ$(ëÔ©S¾„d…u]Õ+„@ö# ‚IàF2UE:ý÷ßwL9Ÿˆe  9_¦è_°`­óõÉ'ŸØ:)‰½`ºš1c†e„¤,$”wß}×ê&¹Y/?øàƒàáÐ:ÉÄ&OžlÓ哤͛ÏV­Zå¦L™’/ErèÄm+ëׯw“&Mr/¾ø¢[¾|y¨,¤I´Ñ äɾ¿ÿþÛï²tÌ´S"„@n YœÏM¢sçÎî´ÓNsíÛ·zäBÆêÕ«ÛÔ‚aH‘`²jÖ¬™]¸p¡;ðÀÿ_{gwS¹ýñÇ· ©P!e¸H‰Hä*QD*M7•i¢)‰”yh.“©ŠŒ©¤P!íP¤¤ê6Üþþïw}Z§ýî÷œ}†÷uÎû¾g­Ïçœ}öÞÏø{ö³Ö³ÖzöY’ëJ*„E ­_¿Þ >\~cÊZ¸p¡+Z´¨˜¶48ØôéÓ%Zäþýû%­~ýúò[¿.j’»ÿþûݬY³m€–-[&f²FÉQóèqÆŒnÞ¼yrJ?æÎëªV­êúöí+צM›æ0ùÂ"=Bèøãw*( ß|ôÑG»sÏ=WÒØ—!`ämLƒ‰qü`܃v³gÏv“&M › qí Õ¥K—°i"]„)óg„ †( ©N:îÌ3Ït™ÏÐ`ˆ{O4J|$#GŽ”:ÇŒã`Ò´h—#FŒÈTÂ¥]»vnâĉY„W¿~ýþ•º²e˺®]»ºN:IбcǺ"EŠd*“4„ >ÒŒ?Þ]vÙeÒ—™3gJ[‹/ÒÎØ$ ¦7ÕÒèý52 ü€ ˜8ƱR¥J´à LR*\ Š+GñþA£%¢#T¯^=W·n]ùé Mj÷îÝÂ̹¨¦(|$¬þ¡ÓN;Í5lØÐíØ±#“ ¡Ñ´iSÑ”$áŸ_—ûî»O„˃>(Á{?Úom+‰°ÇË+®¸BÚ£÷hf1ˆ]i™ „ú©Z™š £Õi÷ C ÷#`&Î1BÈŒ5ÊÍ™3'¤Éx… «ÿÃKÌò¨«w˜2Z»ÆØáU¢D aú\Û¼y³´XÓâh——ôÜ»•Y7¿ñÇ`«\¹²+Y²¤ÿvÔsÚ€†‚pñRùòåEr ÁNheï¿ÿ¾ô§I“&Rï¾}ûD»Á¬‡IÐÈ0ò‰qÂüÑ÷„{Q¡B1KákÁIÍê³Xÿþý.4 &»qãFqäcrRB ,Z´H|"hºƒ 3„IÌ+@09Az_N¾dø[p²ÇëA¸ Uù 3Ÿn­Ö¾à/âCð;azD3C«©Q£†¿;7 <Œ€i0 B†—q¨#\ -á¢Í@pà—ÀT†ïE‰ß˜•Ø)†v‚ªT©"ÇU«VÉ‘/œé0m˜wéÒ¥C×#ýhÑ¢…8÷Pøp>ûì³HIÃ^Çw„éNý*$BÀ±cMß±A»9ñÄÝÊ•+ÝÞ½{¥o pÕ«WwË—/wh1ª•…­Ä.†@žCÀL6† Nl„‹2ül'YÙQ¥ï¾ðnŒLÁ=ÝuÅ=ÌZ´“ÝÔ©SÝŠ+ÄO´sçN×¼ysÍx,X° ˜·z÷î-ýàí~Õ€3þy“ev²±MyéÒ¥òÖ?me×áE?T0b:ÃWé®8MoGCÀÈÛ˜€Éeã§L¶bÅŠ™|¬öq”CÞ•>‚­Ål‹Æ„Æ? 
I\pÁ®uëÖqõ®L™2²s sÛ°aÃ2å¥þHTªT)wÏ=÷ˆ‰ŽíÐì²ûí·ß\çÎCm&¯:ðÑZ´<ÕÒN8á„9-R=vÝ0ò2­1eäwÂAžß Ó;ÂôÝ—TôG>~)|IF†€!Þ˜“??ZA*… Pb*óï&ËG[W C ÌDX–Ô0 C vLÀÄŽ•¥4 CÀˆ0q€eI CÀ0bGÀLìXYJCÀ0 80X–Ô0 C vLÀÄŽ•¥4 CÀˆ0q€¥Iõ_õÜŽ†€!`Y0“»b†€!˜€É­CÀ0 ¬˜€ÉЉ]1 CÀÈLÀäˆV„!`†@VLÀdÅÄ®†€!`ä&`rD+Â0 C +öoÊY1ÉÕWâ5jÔ(‰nICÏ?ÿ|‰ùõ×_»Ÿþ9Ôvþ.?–h–¡ a~Ƙˆ„èÖ­[¦Ìa’Ç|é¹çžs… vW_}uØ<_~ù¥Ä”Ñ›×^{m(2¦^³£!`ä~LÀäþ1ÊÔBb­lذÁ ‹Äú×øO=õ”ó¿ŸÃ="a¶iÓF¢^f*(Ê -~øa‰>Y³fÍP€°(Ùbº½nÝ:wøá‡‡Ì»ï¾+‘.i§a¾ÿþ{·iÓ&9jèe½oGCÀÈý˜€Éýc¶…ÿüç?ÝÅ_œéMúê«¯ÜÆÖé̇ˆ•±Q1¡:¸³Ï>;Öl ¥[³f[½zµB  7ß|³Ûºu«˜„ µL†€!rÌ“ò!ȹvØa¢˜ÐÊM›6uÝ»wC”ÉÁƒgªˆè—›7ov3fÌpK—.uß}÷]è>BéwÞ‘óO>ùÄ-X° d’C«X¼x±›~,Y²ÄmÙ²%Ó5ê!O8Z¶l™Ûµk—Üzíµ×ÜÊ•+Ã%³k†€!0 &Zùä,×¹Àve|/ììúïÿòËüßÿý_ØôvÑ0Ò0i0þlk†Ø²ŒÆ±ãŒZJüñ‡ãSµjU½”åÈN´}ûö¹sÎ9G„ÚËÈ‘#]±bÅ2¥e+µ—üçÞ{öÛ0ò/&`òïØJÏØ™µvíZyQñˆ#Žk8Ò÷ìÙ#}}†Ýa¼X‰ÐXöîÝëÎ8ã ×±cGI†¿*X° ù¢ÿû8Û·oÝúAy¼€idù0ùc¥W^yE~# ¼S²dI×§OŸPOyf̘1â¤oÞ¼¹˜º^~ùeÙe6zôèP:ïŸõë×ËVd~Ï;7d&Ó´+V”AŸ~úiW»vmy¿EMršÆdïÁàäg·Û¬ C ï#`Nþ¼?†ÒL_øB^xáù¼õÖ[²ó«eË–"Hئ„ ”¼­Ïn0v’‘¿W¯^Ž7轤&5®ñ'ïÚà‡A€ðoeË– ™ÝHÃF€c=Ö­X±Bv§¡½œ~úéŽíÐJÞ2¹Ö A)gÕªUîùçŸ×dv4 <Ž@&MšŒÕ„‘—ûÊ˃9E˜€ðg¤‚ÐRnºé&׺uë,oò'Ò„ ä÷£•…9 ÓZL$¢˜¼J”()I–ë¼й !±­úÁtwÜq‡«U«V–ôvÁ0r7f"ËÝã±uøU~úé'Ñô=—ˆ‰nÄ#X´¿–£×½G5©y¯Eû­þ þU`Ñ¢EŽ?ð42 ¼‹À_v‹¼Û‡´j9¦&LS¿üò‹¼Ï‚¯%¿Û¡yWß }ååN#CÀÈ{˜‰,1K¥‰,æZCÀ0R‚€i0)Ý*5 C ÿ#`&ÿ±õÐ0 ” `&%°[¥†€!`äLÀ$0Ʃڢœ@S-‹!`)CÀLÊ ·Š CÀÈߘ€Éßãk½3 C e˜€IôV±!`ù0ù{|­w†€!`¤ 0)ƒÞ*6 C #`&¯õÎ0 ”!`v™2è3WÌ?ïÞ½;t‘¿´ç¯ð *ºÏ;v¸iÓ¦¹¶mÛºråÊÅ“ÕÒ†€!#˜€É³_ÈÎ;!‰ýT¦LW¯^=wå•Wúožó'˜›6m’ b&`¡²›†€!pˆ0sˆ€M´ØóÏ?ßÕ©SGb©lٲŭY³ÆÍ™3Ç8pÀµoß>Ñb-Ÿ!`IGÀ|0q@¾k×®˜RÿüóÏŽ¿œO„ø{zBu²M›6nÈ!Nr¯¿þºD‰ô–Ià¯eË–¹3fHxd"Z†£>úH"E.Y²Ä‘ÇK\CyióæÍnñâÅÞK¢ ½ñÆnÊ”)î½÷Þsôñµ×^“̙ډ!`"`LŒÂï¿ÿîzôèá.ºè"wýõ×GÌã%ãÉ'Ÿìºwï1]¬7ˆîØ»wowË-·¸õë×»† JÖ 6¸¡C‡ÊobÄÌ›7ÏU®\Ùõë×/ScBc~×CwüÀ¸Š+J^Fýúõ7hÙòåËŸfÍšIšÏ?ÿÜÝwß}’™r†ÐÃÏS¸pawÜqÇI:û2 CÀ‹€i0^4~ÃXìfÏží&Mš6%a‚‰kO€¬.]º„M“ÈÅ¢E‹J™Ÿ|ò‰d'’å¨Q£\É’%%îý„ DÛáþ¬Y³2UñÅ_¸þýû»qãÆ‰a¤‚)S€“aÆÉ]„ åpnÑ&³[†€! 
[binary PNG image data omitted]
cccl-2.5.0/.devcontainer/img/cmaketools_targets.png
[binary PNG image data omitted]
ûH-÷ 啯.⥯êJùÚ„KNDB©KÔ­º†o— ÃÕM`GgºÊyˆI‡®¸œS.ÒçÖ¦\œðKXhvåKTÊPןÿA†JêÁ~ðL>‡ä» k÷!ÕX2[e²ÿþû7\+™˜h×Y„™ä¨^à0Wý›­õÕ?â1°ë+Õ Ahö…Òôå¡/ήóó¥êFÿ1i]GÌýKíN¦+ïÚ°I8¤­ÉC«e¬Z1AÕ×wÁµ+­ö9¨& »6½îë³Úq´ÿwå}6õ3ª?m§¹¼ÿ{›ØÕ'waùÓN­”0‰;;4âÏc"L®¼<›£ôÇ1¦J‹úéaœ1%‰@"$‰@"$‰ÀG4´ ò¾¤GÅDsûÞQ÷üöîn»jÜø®HR™UH/*D)*3…Ì™2d8ÄAæ‡c,Ç!òç¨L*D¤D2”¡ ™‡P!"¡$%‘þÏwÕõXï~÷¾ï}ßÏý¼½o]×çó<ûÞ{¯ñ·®µöÞ¿u­kEXX‘ap£ˆ•7䨾}uî«G¬Bh§?®L>GI»Þ}d8"° ¬Ó‚w~Èï!u¨ëе’cœ°‚G°“!åîÂZ¼SO=µ¬¶`©Ê=·Ûι%™”¸í+³véÊ¿/|ßõiõ¨¯ý¹n©÷¡èÊé8ލïëuz}e¨ÃŒû½Xº1NGûÊ¥¿û³WÅ8¥µKiŽ›8‚k_Ùêë&ûô¯o̪ãwýî*û4í3j<íÊ÷ʼf<—1£Ê¡ŸŽë3£âç½D ¸òH¢þÊoƒ,A"$‰@"$‰@"¬$pÃ…Î(Aº²z¯IàQáóÞŠâ.\©,í¾ûîůôÉ'Ÿ\ÜwpãÁ5Wu±±æÊ$Ë[7†bc5F½"£/ÞùçŸ_Ƥp—Õne½¾¢¶ÏÊŠg–;H®<ÒõÍ•‡}æœ$‰@"$‰@"¬0XZOW×7+ 8YD HD HD`Fô¹¾éßAfFg2‰@"$‰@"$‰@"$‰@"$‰@"$‰@"Ð@õýØäD HD HD HD HD HD`ÑH¢~Ñ!Î D HD HD HD HD H~r3Ù~l:ïœsÎ9ͬ6`¹ä’KšŸüä'Ík¬Ñ؉ž,ÖÎãÇ{ló­o}«yÙË^ÖY¯¼˜$‰@"$‰@"$‰@"$‰@"$‰@"på Dý¸Ÿyæ™Í.»ìÒ¼îu¯k¶Þzë bvýÍo~Ó¼ýío_ææÚk¯Ý<ûÙÏn6Ûl³eîM{á´ÓNkN:é¤i£g¼D HD HD HD HD HEB ]ßLìÍnv³f×]wmöØcæ;ßùÎ1GEú¿üå/o^úÒ—6Ûm·]ÃÒ~¯½öj~ûÛßŽŽ˜wD HD HD HD HD HVzÒ¢~Â&Ü~ûí›UW]µõ»ï¾{sç;ßy– ¾é¦›6·¿ýíËÍ7ß¼Ùd“Mš÷¾÷½Í‰'žØ<æ1)×Ï8ãŒæ?øAsþùç7K–,)ù†»œH‘Åÿ÷¿ÿý梋.j6Þxãf‹-¶hVYe•¸½ÌñÛßþvÕϽï}ïæ:×¹NsÞyç5®ýþ÷¿oÖ_ýfË-·\ÆÍ{ßýîw› /¼°Ùf›mšk^óšÅRÿþ÷¿ù-“Ë.»¬9õÔS›þð‡%þît§æú׿þ2ùç…D HD HD HD HD H¦I¢~ -¸ßýî7OÖ³®GhÏRÛ„krÔQG5ùÈGÊo“ÿú׿šÞð†Íÿþïÿ6k®¹f¹~È!‡4GqÄRa6Úh£æ¿ÿû¿ç ôróŠŠ£.HúŸÿüçÍž{îÙüãÿ(á>øàæU¯zUs«[ݪÄúêW¿Úì¿ÿþå7‚þÈ#lnq‹[”rnµÕVÍo|ãBÒÿÏÿüOó‹_ü¢`¤¬x`ó_ÿõ_Íîp‡ºù;HD HD HD HD HD`t}3¥pQ³Ûn»Ëz–ã³–ôÑÎ:ý+_ùJ±ÿûß_ˆòÇ=îqÍŸþô§bý.Ü~ô£BÒ³Ìg‰/ÜÃö°B–ÛD¶-Ÿýìg Iÿ =¨yêSŸZn#á×YgæÍo~só| Ô Yÿ¹Ï}®Ü—’ÞÁ>ûìSÂp“ ‘ÇÇ>ö±’ïüÇ”ðûî»o±¦ë[ßZ\úD¸<&‰@"$‰@"$£°BóŸøDy‡d`ròÉ' ¾BÜ‹2ÿóŸÿ\Ôòœ~úé+ig!Ë«ÜuY¿üå/7ÇsL}©üþÔ§>U¾oþøÇ?–:Z¼P9ì°ÃùßWCôÉJc†Im9ᄚ/}éKíË3?ïË\Fýë_›³Î:«wÕaÕ¦ßûÞ÷Š‘Ößÿþ÷úöØß‘®ÞCåÇ?þqóÉO~r>xû|þÆ"üPß³Ï>»ùË_þÒ™ºúXþ³Ÿý¬¹øâ‹;ÃLÒ/`n¼ }E]§ÕÎÊTC‡Õ)ó¸±?Áˆošq-òª²_ÐÏYຠLYcÉ#]¤“]Â[]¤C+«+éë¬&ŒUò“ŸÌ*ÉLgX¬±j’"×:¸²ëOZÔOÒò­°÷¹Ï}š?ÿùÏÍ«_ýêyköVA§_ÿú×›k]ëZÅšÝxg±~×»Þµ¸®aé^ +xÔ_ÿú×岟 ÎÃÎcûØâ"çF7ºQ¹ÿïâ>âhvÜqǸÜ<ëYÏšÿídžnØØÔ6üäÇÃûéOzsÝë^·„åöç.w¹KóÍo~s>®I ¾ü¹"Èÿ'=éI Âþ—¿üesÛÛÞv>lþHD HD Húð^‰Ü°¢ôW¿úUY½y»ÛÝ®/ø q=Ê|k,îg2Yϰf²¼Ê]—õ´ÓN+ß@õ5õBlÝúÖ·.n9Õ±D­ãú\”—ï$ßQ—^zé }’7·ŸmñíÃÍèbK_þ£òµ#/âûîz×»^ù ƒüã W©!\¤Æ7c\ë;"÷Ûo¿BÔZQ½ÓN;õ]ê:ü|†´Ïãú¬ÈùC=´LLÐ¥ú»ýèG›?üáóÙÂâáxs›ÛÜfþš“ô yÖú:‹ºN£KU ç$tQ/›Üä&ͨ±Õd¼þö·¿5k­µVOªÝ—#¯î»“_®“ç:y LÖ²dÎ…ñ£ýè¦~>~øáe²(ÂÁwçwž÷œ×ãˆÌ?餓 W4i[D]Gû$wÜq¥l°Á]AF^3©% û/ÎBðl¿ûÝïÊn¢g‘n¦q9³Ö£Å«&i¯ZWvýYÜ7ÈIP] Ãúp@z³L_ˆxõró›ß¼¼Xy`&tP!æ»,¼ü Õƒ¤ÇËòm1Nø¥¯…eć>ô¡æ§?ýiyöÐ&q Ëùö íE-ˆz3Äœä)OyJ9Öÿ’¨¯ÑÈ߉@"$‰@"$£`0.G…[‘î­Œe†ßŠRnû¡5+ëR¤Øóž÷¼fµÕV+i/䟉D¿ò­Hb2ãÓŸþtYí¼úê«/³’Ù $½}Å6Ûl³B`û.Df¿ä%/¹¯™z~ñ‹_,$ý¨ýÏV<˜YAn‚¯KLd ïq{4w¼ãËÞkpøÌg>Ó<ÿùÏ_ê›zEé]õ˜öÚ4:üÄ'>q*’~š¼¦­×ŠODÒÛ§ÏžƒÜ#ÁešLcIî&AÑ÷º×½ ßóùϾè// ]«b@Î’¨7ékµ Ò6çÓUŽÅ¾†Ûe—]ŠáçbçuuL±ôhEÁreן$ê§Ô$$ý‹^ô¢ænw»[yñ›2™í‘|dyirb¯gXÍÌ¿îu¯+~äY©{)DÈïµ×^óY²Z7Û8D”÷ßøFóÆ7¾±¸¹ñ"G^ÿú×w:}ÄÿÛÞö¶ùÁŸ•<ñ‚–ν¼„Ä‹›ÁýÙÏ~v\./³^hm’›’$‰@"$‰@"ŒC€5Ë×[Þò–KeÂ:Úû27‘íÕ£,ú¬:õŽjÿ$ñíÇÂmrÃë+²Å[”0Hb2õŽ] Ò‰•$ãyK¿–®2×åñmâ!V§F\éÊ—AŽw}ïÝñ^aÎ;ï¼y}+T•Ë»º2ʃÕwMäDÝ}o ‡H¢iËmi9Cå_o½õŠ||[(w”GݬÖë®»nT©ó¨ ÖXc‚}MÔGÛÀÐ7EݦÈ/máÃEø08b<¥›o¾yÜZê¨D;…ŒÂA[ÐãºÍHɇsÔchþÈxrÛKm¬¼ò9ãŒ3ŠwHHÁZÔ_7½éMËÊè¯}ík…´^ýõëàKýVçSN9¥Û°ž•h#„¹þWÿÊïÜsÏ-e¥›ú;=Uᑈt@añm«¸8‚ +úw½ë]KW?úÁ~PÚQOô+ ¨ò ã·®~чÿR™Ly2Tª0‚Ÿv¤Ót€¢UÝk¢Õdžkî‰Kjv®kgð­ÇVã.<¢_ÄXbœ1ŽŠc(°“éÊkýEÚðâÅ8¡ý­Æ™Q2j<]SGÏýɸb¬¯ëuw®Ò¤{úPŒÛt–0è4ΓdtϽ êéªñø!yH‰ CX çØî—ú¾~@Xê¼D¾ãžQ¸%c…º*¿q‰îˆÏCšúh?KÆsÿŒiÚVXõSî.ÑÖê«?[Ù¢”5Æ@ØÖÜ’rHרˆS2NÒÓZ…þïYIÿ£þÊÐ×?ºÊ×wM›zFÒ­x–v=·aI»Þº0ÒfðÒ£_ªŸñ\ÛÂÛä¡IåÐíº¢¼òò¬ïëŸ Õ£¡cU”·ï}©ýN"¼2ª£¼ô×Ò…oû~—þÔaVäßIÔOÑ:AÒsûò‚¼`Š–Ž¢õ ‚–ÄŒ*¿ø$ÜД“¹”Öl¬ÎË‚ ~üÕ[zñ„ÎsžSdþæ÷Þ{ïæ¥/}i± ÷ÒaÿÏÿüÏ’¬ƒ„åI,…;à€ ©ïa%?¤-` Ç\‰¢v™ë8ù;HD HD ¨2ÂGlˆþo}ë[å£ÜÇ2òñ|qÙ(Œ]{5yõËP„<á O˜'®XÓ"È%a¬µ×Ò%âüñe…kÝü,sWâþÝï~÷æž÷¼g\š'P¢Ì]åA(0Ò RÑ~R±:Õ=yX±ª^A>x>âˆ#J>® 
p«Üyø°ÆùÑG½>ê®Îáר´–\õr#S¬,&‘Ÿí'?ùÉ…x‘¿ÁŒ‰â¾ö²âwÛm·-ñºþ!¹à¬E¹&Ñ6êeŸ­À€µ¸ö«‰zßFþ‚˜–÷ npƒ^¢ž+Ò€(¯öîú>CàÅD‚°ãp@x!©j¢Þ·’:!΃¨’?w§ág?0e\Å5©´žñŒg(ö;S¾íÒ&1Ãm’úô 2„uºïUú>K¢þ‚ .(8èc5>òó­©í°BÈÑÀ1¨_è¿täÐ-ÆmtFm‰ÉÆk?ú)}îÁ~ðRÁÛcÐ(üã»w©&<¢Æ­÷½ï}ócœz#ZŸùÌg] á¾ùCð 0dÙMjÖ¾D]뱕þß÷¾÷m¶Új«rþð¢^z1Ž   Ž9æ˜B8ãBê¼\›U‘ÖX&ã´³¶1þp÷.7®…®ãô£oIWݵ7Ò›¨»1Y½î„þ!cë1$žGÑß<§ü!fk]5q…¨7N´‰zm„ºñMq8dÜ3J[Ę­Á̸hüW<‡ô ýĘR—¿d2÷Î|ðƒ,r«¿1DZùq·E„£_&–¸E6Á¯˜ §rza’Q_×1òFÖ{^†>Žêò'úÝ¢/ÆGû9âãâÙT—Czí÷e·ï£1.Ê©þtwG7¸‘R?{@Ú2ôÍsHm×LÀí<ç‰Àyȳ>ðˆ8uÿ\ˆIoÈX%Ü8Qÿ®w’pØIGŸòîD†è`—þ”È+É¿$ê'l¨ é ˆ|Â/¶,¹â…™oyƒº"ØÈÛËî¾ð…æÿþïÿŠïyá¼Àš…å­-ˆ{³K›5=êQ*/C>N¼è¸-ý ·7âsÃã#Çf±– y8Å,¦‚›Ø¾ãï(›Ñ>ð,iÈà¤|)‰@"$‰@"$‰À8:aIˆ:„£ý  ññìÙÇ7 H[L¸øðÞŽ÷i¤ L¤—!HÂÞy‘R\} …Ê‚@G&©€8wÍ»zÄu™Õ/Ê“ ŒW¸šô‘ëcŘò3táÛ‡µ4Y9#P쇅TG\"¡”qe®1Dø{'gµ¹å–[Cdƒ2¼øÅ/.i /=D&2ÈG0™´Ü›±*w—ÈDŸMC}xûþ@l |Ld Eµ‹odb*¬LK®øá`Z‹ï¯ æýV/Ð _øÂ:ØÔ¿}!GèC´ƒz G‚T‹ÄYÙ"u ÷ãpˆx£ŽCòGÎÐË ¥8HY䋸59Ú•W›pÓu°®¶ˆ4è!èKÜ[èѪ߸°¢A(¿°8Ž<{ô˜hýÕäÌCúÐbA‹Cnú.nû™4§aëIu3&°Î ©ûÅBñ4ûŽCt@\úf‚%ÆA}Ì„˜¾åš¾åŸÅt´+¬Œ-Q·Z‡£<&-b\пmâ©›"G:´Ã;ëgñµ \k]¬óšU‰üqáªHñ!6“Ämtɸñ8âÐäµ¾aâ) ÷zO?F‘ÆÙØówBMÀúó¬‰çM¤kŒ¦Gâä,©-Êëó˜˜*®øçùcüÑoc·6£ï|ç;KßCŽ÷‰±ÝXÍ•2ÃUc¶g,R:DÛx˜H4và—ô7\“ç›ç‚±Ì$’ ² dMÆ+ìâ¹iz^Æ3a«.êäOûÃu\ÿˆ´F¥E·c‚ßx¤? ·½kÆa‹ƒG<õYe£7mŽ7 c´wù{ïá½AZÜ® }ÖêŸ Ñ£¡cU»ž}çíwc—~‡o5Yh‚+0P'ï@Ct°/¿•åúªË» :”ˆÎ9êÏË£°+’xé0ÝvÛmÁER? –ây`¼ûÝï.âxøÄË’‡ç+_ùÊò0ñò``÷0‰%"6 mVØÀî¡Î2Þ‹®¸¸,ñÝóTÇ3ˆî±ÇåÖà-ÏX2éêPO{ÚÓÊàæ…‰å¾4^ö²—-óŠ8yLD HD Hj¬Îjñ^ëý”ø`ö1‹¸%>„½«{Oò‡ïð€BŽ ´B0HY¤žóØ H|0ËÛ‰[ïËþ„yR—y剉Ù&”ÉÇ'R‚ »‘Þ™½[#7š×+r‡ŒT¢Ì´Z¤¥^&¤ƒD÷‘+.˼a`T¿ãOZîºLAñáŽ(ú²Â–#ŸG<âå;¢ï(° B1ÊŒèëym$Mdbéq&9F¾HzĈò!àÚD[{"aCÊ1$"Y£Ò?çuX‰ÉKº‰Üe”E÷ú„þZÀº5ðï :Tÿ!_Æ œôê¤’Š´]iøöŒ¾}V_ƒvÒÔ‹¾’ ?‘†Ü’XðÔÿiˆ º_ÌxÖùcÌ€‘ñNqÐVÆ7udzx&&ð„Q‡¶—@sÿXOǸ€Tñ )Ù'ÆßXucj”Cœv^³ê/Qe4ΩEüšÀí’!ãqÄcÔ¨>ij.Q—Cç°t˜îD?Š1=ÂÅQ{˜£ïñì b[j‰q',öë{}¿‡>£ôy+èŠç«‰Žzb¥/ýúºg“±Ýdìcl¯ÃÐmÏ8},ÚˆÞÒ™x¶9§Óô‚ÀNœ×êôüŽ:"éad<À'z®èCúG;Í®ó0V¥ûôFùñj±/e”cÔû=ä²&žÇÊÖàí<én2ôY?®¶Ëá|H݆ŒU]i÷]k¿“à6é¥g0ýÒÎá5$&Z‡è`_~+ËõåjQo CäŽzÐp:?åôr%î]YGIŽ8Ÿ…¤ããÄÀÏRÝ‹…GY µøxÓ›ÞT¬ÜÍ`Fg0Ès!Ú‚ŸúZôB[èZ¼´˜0CËæÜ?`ÎJIZñ0sÍÃÍ_¼èÄG…{)‰@"$‰@"$‰À(°>—´,«Û–\™XAJ‚¬lg  )Òk¿›ú¬-§/µùbÿ¦¶ Þ.s_y¼3Ç{³²#âƒ<ÒFF™õk»@påBº„uæ[Þò–Hfþ¨Œ1yÑ®û4åŽIv™bbEya%ñ}Ѿ÷#°ðXK´a\‹|aãÛh¡Ÿv;Ð DKà* (b"aCË54:+‚ØšG„õ«v÷M«Ž}Âj×w§ ŸQÂâ¸-¾U‡€HPFc"ºÁÂV?õ=Z‹É«ßž¤£œC$\ ÉAF|G›¤a¬ß¹ÞîÂ-xÆx!=‚„ó?D".Â×_[誱iÉ}‹9áFW}I[‡#èSq“–‘g\¯]ã±ûtÌÚÎkVý%ÊМ›”À…Ôº!üñ¸/]ؘؑFŒqp6F„·Œ!‘O\w¤Û¬ýµKú®Ç8„½kÄÊÒÆ·\ìùÏ‚QÏ(üŒ :VôÊ®<\ÅÄWOÒK]6¶ëíq\]j<â¹eò«-â¼’8V"À•ÎêÇahÚŽ'Í®q:Ê„ÿ¸þÑN·}®¯è7a­Ï5 ¢Øêå‡5Œð`¡+‘¾±ÇJ¨¶Ôc[¼{Ôú×Ä›äYßÖïK$úg9iýW7Á‡ŒU­dGž¶ßI¦VŒá?MÞ„¨ÿPŒ8+ëqØmFµÓ½<¬,%%:(?DÊìóŠ àY‘ô“ÖÇ `':uݱDžoßFûºd™í1sË+-§Ó.S/áÛn»íRsÄíêtq/‰@"$‰@"$‰@AR‡åaW˜öµ Ú‘4µÄyÛ¥3î72¹«Z#¼Þx?o—9ÞƒY~÷‰2……XFœxŸGL×"MçAæø ’£&‚;ù¨EÈôÉ4å²Sê2EÊ£,C¬ª#Ž#+̘P¨¯·1Œº×mZÌ‹;‰¡Ýéj‡Ð( +Iõ d¯]–öùüaɵˆ°¬-©t„!ÕP¡ ‡vX!ZèIL8tÅG8"dä'Nì®?®R”ƒk‹vú¾)#8ÒM:ެ‡«z©afy ²*Ž‘vÓ¡í~1 üaÞÖåèßCt@¢Ýj«cã½ ÌÝ3)à;Ý÷ºë19ØÖᨻ\&JIÝÇ"ìÐc;¯Yõ—È_ÿTï¨C{âÓý§"LÄé:¶û}Ä üÅ |êø®µñBë?ðçã<ÆjñBcUE¤çÑfq}ÔqÈ3 Ç}” „¬Õ+H{«µb%ƨ<ÜS~ds`áõãKH`e’¯žL¥«‘¶³"ÇÄZK“d5F‘ž£¾ý³¾¿#Ïqý#Â÷•Ïê%c 2›¸3²Š‡µò+k¬Þ‰|Œ!íçTÜz l`7é³~Hãê& 8wµC»ß ɯ+ Ì_ôŸ{%ã—gNì¡0T»Ò^™®]>½œK¬ëÌ£þꎾœ‹—Ùõ °ûî»RÞ Ãÿœ‡‹%®²”D HD HD`´ ž!i"Ì}Àù诅_YÒE×áFýË8Dbø fT$¿øfi—YX÷åaäaµ0ra¤ôá–¦î#Ba-–®u½¤KÔÅ!Òòñ¾dÎ9Êu äËC.ýšrŒeâ_76;E ±nEN„ÈKÝ»¬N¹y@þ([꺻ç ÂZ^µÀp¨H§ÝÊS[ÓK‹Ëˆ¨»óø= ¤ brA¼º½Éß$a Å +Ò.ę̈çŸ}NŸ[µÀÚ=\–´ƒÒKS:ˆ$A@ÆŸvö/G¢­Õ¡þ 2®ó±Œçƒ_ú,jÇ$3èv´Y[?C$î·ûÅ,ð7&Ôù$¦ßãt0Æ8¸Gwt]Üèç¬=öX ³5©ßÖáM}csa×Ú},ÂMrlçØ.´¿D¢ŒÎé-žêNàHCÆãiî_<3â<ò©‰scCíšFŸV}!™i?yÛ? 
=`¼fMÍ-NMü³ä&ñ ¨ëi;Ödy`+¯Ðö3 >ÊdÜ$÷"$t¿œÌýë"\c\Æ*u¸¶.7D¤­«l×B覺ó»N‡k]0qŒç%] 1¶²Ú‡áÐþqûŽžOÆImɵŽýá“wC°¶€tX†øÙkf2í³¾/ïIõhÈXÕ—×ëaÕÏ‹‡Iiú\ëŒ4†è`W^µ»?î¼+åuíßSË+ÇÌg¥E fWÚ dÁD HD HVxA2-,r†‹MD¸=˜UÈ8˧5É24ÍÇ…k.ÊùpG®È9ÌÂÎ}W™}dÚ<õÐC-å±JØ9Iít‘×üm#rÜGXl³Í6%{åHð!ÒÔÃǸˆ2Z–¯ÞÊ(_•U°^xaÙ<0µӔ›õ1L•U>Î5&àCø‘G °¼æ^AÙ•Ç7B¥-ˆ²¤ƒ¨÷áŽäà‡Ù$$T_ˆ×Yá £,m’½$Þó§ºD; ‹µKMpeОH‚=V:@?•«í¶dHþA*—z#¨¬t*&P¸sÐOÄUß®_¤ÆfÌ6 n˾ûî[tp§vjßšú9gsF†`£,j§Î Š¨ÿÅp´‚ŸK ³&UBÿÚýb¡øWEèü9D´Ÿ×ôÇæ¥&kX_üñÅím¸#1¶øz.ˆºt8 C'¸0¢£ÆíTµê"âv»òšU‰üè´r›L2)á7¿ñ!ïyÏ{ÊBZ[#žñ•»}IoœÿÆØ`lã§ÝxmlÓ>a™ÃøÃe A4Òí*ÆDbÒÌÆØÆc¸É ã}ˆgV»ñl”§1IûyFÙÈ•¾ÀÑäzì¡bÅœ¬Â`I¿\k ÊÉ?ÆvÏúR‹1Y\u¹ï™gŒçBH½Rzf<2É"NŸD1–zÎh#ínLu>´ô¥בþÊieñAúÜCNC°¶JA[z&Ù˜žŽ¨ç¤“«Q¦ú8í³¾NÃïiõ(ÚaÔó²×$çÆ B_õ;ý¯~^¹7D…«EŸÜo¿ýŠQƒ½^¼ÙOÓ˜ÏkЉ«Ö´yß~uz‹ý{*¢^Ç«}%uÒ #…NID HD HD`„K|ÈŠãüËì}[ÒÏJÐý%sa>ºÚaëxã~#|Øyä‘ÍW¾ò•Ü7ÂÃG]_™Y?GyÂ¥Íïˆ=¥"]„1 `“Då&H+X‘ñ±Š¸Gô˜,Aq€¼Aiq·.'"l§-7,µ‡~x!‘V®!ùlüJ´ ‰xôÑG—kÊÃJ‘Hê6AâÀ4È©àŠ–¿«+Iió)"¤‚Íâ‘–y®ï¨}wÜqÇBHE;ˆoB!,¯MZ/•e0aª,¬nÕÝ5$jX@É_èËS$­¼Gy»ê&LHX1k«cZ? }Dt>â-#·$&~`Ýv{ÓU¦®k}å„q;|l ‰ŒóGô)›Ê"*»úÅ4ø÷•©ëúÏ‹&ÇŒqáÞd^›X‚#Ëpã ‚tép¹1÷ù†¨Š±C?äê ¤a¹Øúú,lW^®Ï¢¿D¶Æ_dÖ¯LnšÈ©Ëãšþ2j<Žx|¹›¥ÊlŒ‰0p…M=¶×é 1Þ‡µnŒÛWšAÔ#‘Ì'žxb™hfÉÜØ_O8´ë!1Ú_{ñŸ>î%]íiì€QcQ”Å5e1¦#Ö÷ÂÕ‚À7&+3ßûÄyMÖ‹cÂÏä±ÉòÀÉÍš¿“VÄ$G}¯þmœð¼´2è˜+Æ0íîy‰Ÿ$CûGnû·½ìPÚÛ^dÖV$Ù€Ö¸o5,MPì³Ï>íì:ϣͻnNó¬—N¤í9­ «ºÊ>äš±È3Ó;œIyý 1o’:Ê>DÛyEܘ,‰c\|âØŽ¿¼ÏW™{Ùø÷Ú“¹æâŒ é¢óÖ¾™Ì̱®z,!êKˆjiLë£>¼}Lûº @IDATyäõD HD HD ¸„qô.ïè54ÒÁ_X†Å5zˆŒY ¢ù°óÎ;KíiÓE¶!¨ã#lÚtÚñ,ÓW÷šüRæqå‘&Ì}à÷‰¼µAo}áÂÍÊ8÷#³(7ý`1> »Ê¤<>Šƒôë ÃR›%‰…>!«ç úÛá†âÓŽWŸ›$’~»¬È!¨»í¶[|þ÷8´ë×q:Ù—dùÐÅ èàb¤å_‘Žô’µ¬ ™º¯Œê‹?|Æé@`8®?D¸8ŽÓaá†ôÕHoÔq\^c߸1´¿ŒJÇ=Òõ è¹^býÌ­/@Öš\lÇß{ï½ËÄ’±ªëy0 —¾{ÊzöÙg— £öØÓW¹×6 W¦À¶{izÕåèºpl×Qz}X¶Ã=2Ö·û‡É#“Q£D»ã!Ér÷am ±RÇÄGŒ-&olXíNt2ôY?*¯iõHš]cî¶íš¯?5I4JBOÇ=3‡è`ä£MëþÜÖëöyÄ[Ì£÷ ÅÞ;àâ]Éï‰,êùcBÒSòörËváÍšÕ4Ëè…>%HD HD HQ„eu,vÔ½>bTœ!÷5!Î2+O›lé*KWÞ]á‚èºW_›E¹}ôŽ«Û¸ò ú}ð/™³$%ãòŠÏ¨<ú&AXø…ß宸ãpðá=®üÒíË?ò—O„›öxu!éáC/ÃýCר~±Øø+Ç8ˆ²ѧë8N‡…×W…"ãò‡ãÐþ2*÷údvâZñ4Nf1æÈC~á^©g_=`Ô%ãÊ4[ùI£–¾qaŽÒŠe߸ßãê(~»lVY•5JŒñAÔ)w_9L¸3fœŒ?5QÁ² »é¦›Ž*ÂD÷fÑ_§Õ#í«¬ŒâJl”ÀvQ?NO#ýv;Çõ®£|kiëuû¼»¼w÷îžRD‡µ …¿Qb6ÊÌ€Ê&Q? 
©¼—$‰@"$‰@"$`ñdéóÊ$+c™W$¬­Êà›˜›…QXö!¸mH¹ê#°²öçQ-³¦Xà[>dé I×7 o°L!HD HD X TÄÑ`GÞå‘/þÛõÍbÔ+ÓLD HD HV$fâúfVbeÎûûÒ\‘–ô•1¯'‰@"$‰@"$‰@"$‰@"$‰@"$ E`"×7 Íì¼óÎ+›*°Ì÷7NlöPoF;.|ÞOD HD HD HD HD HV6–+Qo ­MìŽÌØ(±{¯Ý’ÅIID HD HD HD HD H®ª,W¢ˆˆ÷“N:骊gÖ+HD HD HD HD HD ˜U' D HD HD HD HD HD`¦,w‹úºôÿºö ›KÖ½KsÉ 6kþ±öÍ¿V»N³ê¥6×¼àŒfõsOjV?ë›Íªÿ©Žr¥ÿ>çœsšÝèFWz9²‰@"$‰@"$‰@"$‰@"$‰@"$W ¦"êoq‹[Œ%«¯ýë„ns›Ûtú™?mû4¿]s›ePDÖÿýz·-Í­Û¬õ›#škÿâ°eÂ]Î<óÌf—]vi^÷º×5[o½õ‚‹pÆs«¯Þ¬»îºK¥õûßÿ¾`v³›Ýl©ë& .¾øâfƒ 6Xêú$'\½õ­omîyÏ{6w¿ûÝ{£ž|òÉÍ¿øÅæüóÏo^øÂ6×½îuÅëM0o$‰@"$‰@"$‰@"$‰@"$‰@"t"0Q¿Þzëu&ÖuqÉ’%K]¾è²5š¯\r׿—Þp©ë}'½ÅÍ?ÖÙ¸YëÇïjV½ä‚¾`Ëå:â|×]wmöØcf÷Ýwo¶Új«å»÷Þ{7ùË_š÷¾÷½óé\vÙeÍ«_ýêæÿøG¹~ík_{þÞ›ßüæÆ&»ï~÷»ç¯MúãÒK/-›ôn¸á†½Qåñ¦7½©¹æ5¯Ùl¾ùæÍ*«¬Ò ‰×›`ÞHD HD HD HD HD èE`*¢þG?úQú{”lºé¦ Bÿ¸ãŽkþùÏÎýãí_Ô\´ö0’>"±°oîðÜfï½9.]iÇí·ß¾YuÕUçÉú;ßùÎS—åŽw¼csÔQG5gŸ}vs“›Ü¤¤ó‹_ü¢ôNlº»Í6—¯:øûßÿÞœuÖY3±äWàŸÿüç%ÈÓžö´byïÄÄAJ"$‰@"$‰@"$‰@"$‰@"$‰Àì˜j3Ù¿ýíoÍE]4ò/ˆ]®Z"ìŸnú 9’~ã©j¬¿x£GOwÖ‘îw¿û5/yÉK YÿýïêäƒäGȇ|ï{ß+?M|÷»ßËÍOúÓò{Ë-·œ¿öç?ÿ¹9æ˜cšƒ:¨„Eæ×rì±Ç²ÿôÓOo9äæ„N¨oÏÿ>ï¼óšÏþóÍøÃF}¾þõ¯—{§vZó…/|¡ÓuQDþíoÛ|îsŸk>ùÉO6§žzê|X“ÒTÆ“®Õ7&}”3%HD HD HD HD H«+SYÔO–c¹±é“ÕV[¥9îµ[57^ëÍ^_<³yßQ¿Y&¨ø×:óØbƒÙí¶ÛnÞ²ž+Öñ“ ÿýùÿøÇ òŸüà?h¸ØáÞõ ó·Øb‹r ÁÍ×<ᢆ?ùë\ç:ÅeÍõ®w½rýàƒnV[mµâ^Çw¹Ë]ʽø‡¤W;|ïzè¡¥ îõ«_-ißûÞ÷.LjÇüãÍ‘GYNÕã3ŸùL³É&›4¯zիʵ}ìc W>yÈCʹðÈ|+-bÂáø@c?ƒûÜç>‘lD HD HD HD HD ¸Z!0•Eý4]²îÒq;=Ÿ´I³ÙM×lÖ¿îêÍ&ëýÛ/{;ܸtÚáó¹¼óÎ;¢{š|Ûm´Qó³Ÿý¬D·úÀ†µwºÓŠ‹› .¸ ¸Åq“E=÷8k­µVñSÿ¶·½­Ygušw¼ã ²ûe/{Ysá…6ÿïÿý¿¥ŠÂþSžò”æ€(þõë›6Š}ík_[üÏÿÏÿüOsÓ›Þ´yÁ ^Ð<ãÏ(Áîï{ßûš5×\³ŽV~³¼GÒóa/Ì?øÁæ‘|d©Ë'>ñ‰RÖµ×^»Xô‹`Ûp©«øÂgq¯¾)‰@"$‰@"$‰@"$‰@"$‰@"\]X~Dý 6ëÅøVë]§Ùå^ë5ï:ö÷½aâÆ%#Ò‰0ËëȽ RúAzÐÔY²GÈ#¬¹ž!üÒÇFµÜßp#ô»ßý®â>äY£“Í6Û¬¹Ç=îÑüú׿^ʵ òýþ÷¿±Ü/¯ø‡¤Ík^SHú7¼á ó>òë0£~GYû׺ֵJú~ô£Kyâž2qwCN>ùäBÖßínw›¯g¬@£òË{‰@"$‰@"$‰@"$‰@"$‰@"\UXnDý?ÖÞ ÃC^x‡æ¼‹/m^ùÑË-Ë{ÎÝ•Ψx³¾‡¤Ñ‹^Ô žŸ÷¼çM|X“‡øk_ûÚ͆nX\ß° g¹~Ê)§”ô#,ßñäV·ºU9Æ¿8?ãŒ3âÒ<‘?áŠüÕs{së[ߺ¹á 'ÛÜWÊÀbI_ËlP&\CÀó›o•Àw¾ó2Àe|ÿô§?k{îz¸úIID HD Hf…€=‘öÜsÏù½—f•îb¤£¬Þ÷íåĸ¥ÞÏ©ÎKI†.öw²weå×.VýÎJŽ8âˆæÿ÷{q›U>™ÎÂXŒöŸ´Tí2¤þL† oùÝwß½wÿ»ÉRûwh+ðßò–·ÌÙCó±_Þ{Þóž'4à—±Ô>~çœs΀Ð$Hå‹ÀòóQ¿Úu:köø{­ßlr“5š‡üßIs×A–ºø¯žt– ´È'AÒó÷΢|!ÂâYý“Ÿü¤¸·áJ&1ÔQGŸñÜälºé¦å÷7„«›°¨wΕ ‰ûådÄ?|â{ÙŸÔG<’ž•[|4˜l Qþôý©¿ü|ê[)àCäw¸C;‰ñ‰'ûÛßþ¶´KÔcâ:"0R«”ëo£Ž yi f©G‹Ñþ“V©]†•M̾ßþð‡_)ú~ÖYg5¿üå/Ë*ù{Þóž“ÂßþÒK/-{ôá(ÈÐ|ìë§ÿä<7Âô_ñÒ—¾tlT“ûÛß– ·Ê*«îã’K.)…].ƒu¸ȸtÚð–—%K–,ã!¡N·Žy(‹çà¨I]eòb—S_sOšî¹®ÎöAllF^k¬±Æ2e•FÜ÷»-ò”nJ"\ŽÀr#êW½ô¦‹díÃ7œë´M³Ï“7žo“§Þí&Íigÿ­yÏ~=-~HçÊ” éY‹ïºë®3) žßv.n¶Þzëù4ýþ¾ÐsÌ1ÅzÞO6Þør¬¾ùÍo67¿ùÍË5òÛàyã߸\õ»žwܱù¯ÿú¯ò¢‹[Üb¢¾õ=ùò˜(ðâçoéžxâ‰ÍÿøÇR7ðínw»æøã/Võ{ÜãF3ï%‰@"$‰@"$WYÞüæ77Hïä›l²I±&µ÷Ó{ßûÞÆ~Q„…ý>ûìSÈ¿øÅe•*¢þðÃo¶ÜrËÞwø/ùËeå쬉zßßúÖ·¦"ê£!áÔ6`ZŒ|®®i.–­(x®lúƒ¼Õ&Œ¯Œ‰)†wöÅ›fUþ$m¾ù|ík_+ûë!š'ã1·Kì×§=;ì°²!wÄ!\¿õ­omò‡4zԣʸ9*ˆ÷ÉO~²ùÊW¾Rø¡¸öЇ>´LÎÆù+_ùÊbôÈr-\ïµ×^óÏ«ú„»å]vÙ¥Üæ•áõ¯}³Þzë5ö/¬¥«þ¸Æž}ìcKP¥»í¶[£ŽÊ[XÖ¬7¸Á ÊûøqÇW,½ózw«@+O¹¶ÜvÛm‹¹•¤òU¶xß¶gcïαÊTxé3† a´Ã ÄSŸúÔùo‚¾ð…å»äÃþp±¾°Ž,(ÅqôýÂ¥Žï„%¦Õ®ðõ~n_©X[ÌýCÆ+Ãꫯ^îÇ{üW¿úÕâÖR8é" ïz×»F´eެb}ç ÒíÃÕ% ŽZ𲊖A@jS÷Û–»êGxrÉù‡?üa©•êȨîˆ'ßXµ®(½`,%}ßZ¾¯&•°,–Œk£+i Áú÷¿ÿ}Á;0òÝÄ=‹}ÆüVnC}7}ýë_/{›q¿jµˆ:¸Fïà®P£ô†+UÆf¬‡‘aaü:ê»N~ê'}ÌB†´”µïõW~ýNAªý‚áúë¯_Ú±^‰"Íqe¨õ‡Î#:a¹îºë–"iC¤)ýˆ~ÚWÖúú8Ý€¯Uõtߨ{+`Bbœ±’]ÕouÖ)}WíÎ’œ^Œ;·ëõ€£ovF¬®þ\îÖ2 Wq­B0n(w肚®>g¸Ç Pÿ4þkœÖXÑkê|ëßí|ÜÓWµ~+º;‰|ô£-n†Mxâu&•g?ûÙóã·¸ú'ÑÚè ƒ*:rÝë^·`lÂÕøcõS-}ésôÑG7_üâËx¹ýöÛ—z:?òÈ#KuZžU£„ÎÔ+p70‹±Øs,ĘJŒ¥t¥m1ïÞsŸûÜâ¢ùÜsÏ-Ï)ÞpQÒ‹qE¸.‰ûxÀæ=/D¸%s+RDàßLDÔ`½¤y@õùNŒ¤c6›f@ýÉj7§ÄÍêøšƒÿýÊ`üüýîÓ|ú‡ç6ŸùÖYU¨ÿ\ýÜ“þ}²œy0š ƒYedµ—­z@ôÀõô ò²b{õ«_ݼýío/®q Ðð6èMj¡nðæ¾çïxG™yÝc="›‘|/@fq=|>øàÇ õ³žõ¬RæH)¨gE/^`õÖ¡'>‰@"$‰@"$Wo¼Û²èC yïu~È!‡Bûñ|!-?ö±•ý’jò•¢÷õš¨çæÁépãñ„'<¡yðƒÜ ò¸¼Eòî‹àBžqä¨ ëD±2"ÓXÀ"tOp@ó¦7½©nÜ_²LUæZXØ×ߌqHmÐC¼×{SE,YwÊÏŸß,:…GÀ½üå//$¬2!€¼³ì’ynƒ”ËŠYïò¾ü©'òH9ÝGÔ#Ù}+ø‹kµm-¾µ5ò-ÚèÀ,+ |s"i¥ƒ\ œäoÕÁýîw¿ÒÖ&¤càóç?ÿ¹aŒ„ Ü¥í›Ç· 
¡+ŸùÌgæ‹â|‡v˜è;osz©­¤ÏW6Œ}§‘!X›ø&e Œ´9=¦ß¬u ê¤Ö¦ ¤ƒŽQGõaL†_°ã.‰þÚ‹[%ße¡£&›þÂH¥´ïÌ…êÑö/…ó/꯿(|#Õé*<}“…|LX )C­?úÀÇ?þñÆ5iÃé#ùH!%}û•qºÑ¥£Òf@ç»™¨‹ÉÅØÏîš´4‰FÔí׸FуraÌ?ؓģ_Ú;ÆVãVX‚ÃU¿’W0¾çC¾ô¥/B^ºtÓ¸Ç íÐúÙßøÆBöŽ)né¿u>Âï·ß~¥ü~ËÇ4‰e¿ÉN\DðÒ™DLtÅDkOzÏyÎsJßWÆ—½ìe…´×_·íüúÒÁ±}âŸ(ãÈ3ŸùÌù,žò”§” :sy¤îCÄØÔ¶T7Õ¾f6†"ñ&Ķ›4n .Pýõ/<ÕóŸÿü2vñßßunÂvþ]áòZ"puF`"¢ÞCB§5ƒßå« H3ÀdÓËÎnN³çÑÜ;n³ö3ŽíJfþÚêg}sþ÷òþÁ¢dÖ$½:ì>ô¡uV§o¦× µ—ƒ*kœx±®y÷»ß]Ÿ–ß^bÚyyè××´¯¿Zºâ™XðcÆÕCÅËS[̰Öi»?ª¾íøyž$‰@"$‰@"põB€Õ#’1lå&bqˆÄœÔ‚‰ä@ê"M¥ ™öŠj˸¼¹L÷½ï}›'?ùÉŇ0×40Æ5µU5kSÄjE‘´é+ïâ!ÄiË$„r;î¸sºc"Ù˜¦?YI€;Š•ãâ»o¬'1Ö•“+þi³‹.ºhÒ¿3ío+¬<~²¨GÖoÛAÔGút\ýHL,Ž<&‰ÀÂø÷ÝÀ´<ð¼L#kmtIó×[ì0MÔg­ßѬzñå–+S'r‹hv¶‹¤_žÕ4S[[ý,ϼ3¯D HD H«\  :à Ãx…{ŽšRkDjºÈ"«?ÇÈè.¢~hÞˆ4Ö™ÞÅ•‰µ6 H„GMÔ#jô©†pC„"邌‹z è÷Þ{ïBj#|B|){[ ÙÖ&üÛa³Ô—w áÊÆñ^÷ºW!#‘/þÈésî+‡ˆÞÚuB¹9ࢹŠX 76ˆæN8aÞu‡dà e"ˆ×'=éI…Ðã:f¾u¬Lò áü \KÄ+þ!›ÖH&Ä.¡KHP%ëk“!ñúÄ¢ØJ~Úäö°baJ_Cg‡`¸$Èv$=a@ʼn‰‘¶„Á˜ú0†2 d,èŸp§¢®üEÔ°rfÕ¬Ž5QÏMIè"7Lô“OjD}— ©ÛÐöïJ¿ïœêoN}¹˜èAvN[“Cð:ú€ ¹Id”nÐQmgHèš<òHzWÈhbŒŠñk’üG…µ:%ŒòôM:‚L‡"{®}i›ØSý?6ˆ•.B÷ï|gÜžøHo‰I˜Ð˜–—˜ÔŒ y?q!ÆwEHl}ñÝ%}éóÈ’W0ŒFý-†˜œ1®W=L²X¹Ôv#e2гˆëí«¯Ô.š‡”M¿2 b ®'“ãz«3õ ëÚ¿˜[‚¹ÎƾêÇ¥{­óNiÄOID HD H«&ˆb>¥k²[M‘A.qÇ2TõµÄy"õ½!yGxZíÎq„eÕ[ W#µDþÈÝÚ‘…þÞð†B@í>g]D”¸È’®ò†ëš«ójÿŽ4µþÚ‚pî“!u‹0£Ú¿/ý®ëê^÷ aL`Y¹mÒÍÄâÄq!e@³Ú澉X“³%“ÿÆéFèX›Œ w=5Q“\#²›øV»oD)B}®}†îÄ}«?¬&²Šæ=ïyO™ÌB\÷¹ÊŠx}Gå3Së=¬õëGúâÎêº1›û¡¶µ¼þÇê„FØúØ—Næ˜c–šÐsѳA§YÍàaÖP1Þ˜T ™¤oEœ<&Wu–+Q̵~<·Üñψ¬GÒ—xWõÖÈú%‰@"$‰@"$Wc9.PÆ ‹áZºÈ .@j ¥ËB}’¼‘mA^´Óü"l”§‡Cöraóú׿~Þ=â ¢X_K¿&+Mh I†‘'B™%d_ÓÈ'D Xš³ÞæjÂ2†…5·A$EܾcXû«oM6FýÅ‹´ÔÉæŠ!ÚÑ_Xt³REÔ³œ Üû\ …eú¨É("«wÝu×ȲádþR7zNX:óëÍz–;nküqo4ëÈË €z$,Â{²t9\Ø¥lˆ:Ò™…Ⱥ iÿ…”A\}Åd `:ò|ÛÛÞ6ýBÊ`"-ˆV} Mn+û(݈r….GZÑ7Ú>‘^ß1ò‰ûq:3׈7îhâ «füq[ƒ´—_=þŒK'î+ŸÉĶ´qlߟåù£ýèyýêJ—;,Ï%ã‚ú;»êÚ—NLÖðïoåÃ(áúGnK\sˆ˜”%¼fÔž3¸ÊªWÞc"2ú¿síklž„¨‡},SD Uûo-ÎU/™Ûæ{on¸±" /^J"$‰@"$‰@"\µ`QÌ"86QT[¬3êÒ5®B?¬_ÛÂÚÁ‚p&m´Q\*VÚq2.ïÇolªê_ïòizÜ´Ô~}mÂHLˆ1äŠM>ƒ¼­ãDY?ûÙÏÎ_Fæ²h\R¹H貂ÒFDi#õ\¬ÄŸ²ˆëÙŒ,äjF™å&NÚV«íüꉓ œjËpù"´kaÍË:ÜäA”'ú‡è‡+â ž¬Qk²(Â9Zé ò ër×¹£àÿÛdH¸a yrß ^G¼>1'GD1×GþC°ŽÌm0d#=â¬/ï!×õ‘úDMd ­cϤz4´ýë<&ù /ºªÝþó?ÿ³£Ñ®APN[isÑb,à†H¿CÜ•qº:j –›Úú_‡©·'ë{£~#ÍcB¸è£úâ\G¥]ß³rŪmÄ= ·^$\OùÝC\ëå3ÆÔã¾1¸=.õÅ_ìëÜŠgL"ª¯þmcØpG5$ÿX]Ã5L»Ÿ'^ò’—Ì_7ö{îÔm)㱬ëYÒU{0˜,å,þò¤Æº+®2Ê¿]†®°y-H†#°Ü-ê£hÜØ\ëÌc›KÖ½KsÉ 6kþ±öÍ¿V»N³ê¥6×¼àŒfõsOjl›>é±<&‰@"$‰@"$W}¸Yᲃ/yËñ‘G¬öHåpƒØBâ"7lÚ%fé<ð,„‰ C‘ÐáùË*ð‘|dÃÊq\Þ‘‡²Hw‡v(d“ò!G¶ó^‹ô•‰ˆ˜ãֱ˚i, „#÷2ˆ­pË" ._”•õáç?ÿù²Ù©´¸ºáçW~!Ò®‡kˆéÙäÔ&·Ü Ýmxjó[y"‘;ì°’&·ˆ5þª½¬$ÕÓä"'\¤Ø`ö6 DJÁRln‰L³9¥ÉD8\kH#„pµ<ìa+ùq/¢ä§¼&ilp¢Üü&>ÕG‰ DâfÆRTL (³sØkoö\Ô ½ííÉ–®¼”NÚ‡‹VÒH:í;k–î,Þm>É?>ŒMöÀŠ•ýBD;è+|p™dAZ#𸳠£ã¬uë¼§Ñ£¡í_ç3ÉoîOüÑoúì·1@»„L[›sž÷¼ç•‰ “K6Iµ©2ý'CtƒÅýQGUt\?4éœîÅ$^_>1!aS`ãÛ¤³êƒV}Ðz¦¯ÐÅèãpí+Wûú‡?üáâÆLÿ0öÄÄ„MkIטÕN£>WOýV;è·ÆB“(u›×áã7}ƒOˆ1ØæÍÊ`µ€‰0}šðõo£c×mr+ˆÜëKGzÜ€qédŒfoRÒÚð¢7‘Ëôïÿûͳ=‡ŒE&õñ-¶Ø¢<ä5JŒÿV)(síŽÆ3Õ3ǘ]O)·±Nÿ0 .?z£Ü!&µjŒ\ýBüîº/XÑ!LJ"puG`ü“fB¯qúçÊß"f“I'‰@"$‰@"$‰ÀJ‚ÉqÈ!‡4x`)5rÚ†²È²‘Væˆè° w?È ä ©LX³ò=DB„‹ã¼¥ƒ,ã.¡BL öÚ>‹ùRGV#O¬šY‘†›$VYåæÜ?äò…(³ú"è‰ë0ŠÍþ¢üqyƒ”C|#ô@þL°ÚŽüº|r8"ÁLh ¢ —8»í¶Û<¦}.Xâ"Á÷ÜsÏù{‘?Bó•¯|e!Õ‚À† ÕÉxØÌRƒˆ‡%Ò¿v#Ïý÷ß¿a-½ÕV[EôΣxÞˆ­Ø€ÒäB¸ºQ›""dÙ„^!‘¦Cy¦M”™ÿm‚h·¡p­ã°Ǥ] ë|>ÂMæ°Ä%s_ºöªW½ªè Â-ÈLä^LîÔäZ;:ýiôhhû·óäþ¬´£j;厲OSíàϦº11eÓhº¢MèÈ8¢ö7Ðé( smÀV§(_>ú‘ºê¿ôÏÄÓ¸8uZÈ^ýBˆ•ö£ˆIˆq¸ÖiùݧGÆkdóG?úÑE8ijÉWeŽc¹8âŸÉi~ðƒœï·éõÖ[¯L0ŽˆÚyËXÒWöÎsAmñl1Ùɺ߸8Òc€62Á›a‹ß—Ž{&r×& b²R9Åͪ…3A`Ÿƒ:h©pÆÔö†ÀÂwÉqÇW.·÷ ŸtLÈ>ãϘZ—±ã÷|€¹6¦õWË^{í5K×}–£SDàrV™›µ½ìª †Æ”D HD HD @,µåìŽþXD#•üñ¾æškÎ_C6×wãs™,_ã• ÖÛ±¹»AöÝð£ÒQ¯ W"¼c_>ÜY¿üå//„0ëšT‘m’å* ésÏ=·ù“CÒªE}•+Èàú^_=´Ÿúµ 1þ½YGÚ„´K´­¶†o— õM`‡ îª|”!&ºÒrM½‚kV¢¬\»Ê%u¨ÛÏy=‘ÐWŽí¶Ûn¾œCÊÝ…µxÈ_©È¹W¼â ×JH±v›E˜IŽÚ&@úp’Þ´zÔ×þ»À®/«ôµQ2d è+ètÛ÷K7´ñcÒ¶ŽñA\»“i—Û¹>lΧÉC«e¬Z1AÕ7vÁµ+¯ö5¨ˆç®M¯ûƬvíó®²OÓ>ítW´s­úI£Qúa|á ɳf!}|E«–'¸ª#À½”g³÷(ãq¼S]©õWuг~‰@"$‰@"$‰@"0=ãHÛ6AÞ—Ó¨t‚hnÇ'Â"Àúˆìƒ`é"©âþ$Gõí«s_=bB;Ÿqeòñ8JÚõî#ˆ}d`þ¼ëðC~©C]„®•ã„<‚ 
)wÖâzê©eµÅ 'œP\ðp'¸ÝvÎ-É(bn\ùêûÚ¥+ÿ:ÌßÓêQ_ûsÝRïCÑU$å8¢¾¯?Ôéõ•¡3î÷béÆ8í+—þîÏ^ãt– êk7q6×:½¾ß&ûô¯oÌêK+®w•}šö‰ôVÔ£Õ?C¾}‰ŸaD`ÅB ‰ú«=²4‰@"$‰@"$‰@"°#ÀU :£„ßsVï5 <*|Þ[1@N†+•åQ¢Ýwß½ø¨?ùä“‹ûn<¸©Ýd,r\yì»ï¾WF¶Sç¹¼uchA­Æ¨WdôÅ;ÿüó˘î²ú­¬×WÔöYYñÌr'‰À•‡@º¾¹ò°ÏœD HD HÑõÍ N$HD HD ˜}®oþ½=óŒ2ÊdD HD HD HD HD HD`8IÔÇ*C&‰@"$‰@"$‰@"$‰@"$‰@"$3G ‰ú™Cš &‰@"$‰@"$‰@"$‰@"$‰@"$ÃH¢~8V%ä9çœ3aŒþà—\rIsúé§Ïÿýú׿nìl?­ˆ¿çž{6gžyæ´Id¼D HD HD HD HD HåŒÀ5–s~+uvð]vÙ¥yÝë^×l½õÖ ®Ëo~ó›æõ¯ý2鬿þúÍ6ÛlÓì¸ãŽËÜuáücóÓŸþ´ùË_þÒÜìf74ï%‰@"$‰@"$‰@"$‰@"$‰@"¬ $Q?AC ¿wÝu×f=öhvß}÷f«­¶š vÐûÝï~Íît§†…ý©§žÚ|÷»ßm>ýéO7\pA³óÎ;÷GÌ;‰@"$‰@"$‰@"$‰@"$‰@"$+=IÔOØ„Ûo¿}³êª«Î“õw¾ó'LaÙà7¿ùÍ›Í6Û¬ÜÞã÷¸’þW¾ò•fã7nîq{ÌG:ï¼óšïÿûÍÙgŸÝl¾ùæÍmns›RžùWü8餓šýèGͺë®[&®w½ëÍùÒ—¾Ôl°Á%n\<å”Sš3Î8£Q¿–ùßûÞ÷šßþö·%ìíoûæØc-“ ÒMID HD HD HD HD X8IÔO! ø ëY×o¹å–S¤Òå׸FóŠW¼¢yÎsžSÈö êüã7oyË[JDùyä‘Í­o}ëæµ¯}m³Ê*«Ì'xÐA5Üê\óš×,>ï?ô¡âÿ–·¼e ó‘|¤¹Ë]î²QüñÇ7þ‚¨GÚ¿æ5¯iþõ¯•tŽ:ê¨fà 7løÁ_}õÕËÀ|†ù#HD HD HD HD HD`jr3Ù)¡Ûn»íšÝvÛ­à?üá§L¥?Úu®sfuÖiN;í´è /lÞþö·77¼á ›}÷Ý·Ùÿý›'>ñ‰åþ¡‡ºTB,à¹æùÀ>P|à#õƒà_*àˆ“½öÚ«ÜEÖKÇ9+þ”D HD HD`y À%ä'>ñ‰âòCiN>ùäå‘í‚òˆ2ÿóŸÿ\P:ã"Ÿ~ú雋.ºh\ÐA÷—W¹ëÂ|ùË_nŽ9æ˜úRùý©O}ª+ÙKûŸsÎ9Ë„™ôÂa‡ÖÈ\vÙeÍ}úö·¿] £Úyp Ê‹-}ùË÷¯ýksÖYgƒ«:¬zÃÔŠéŸÿüçÍßÿþ÷úöØß‘îùçŸ?6l`höÉO~2N›öùüEø¡¾¾_­ïõ±Rýg?ûYsñÅw)niéàþ óZ_gQ×iu ³2ÕÅÐaFyÊxU6ågäÕ¾>íù,p6ïIãécúׯt‘Nv ®‡.Ò¡•U‘Ò×Y L˜þä'?™U’™Î""°XcÕ$E®upeן´¨Ÿ¤å[aïsŸû4þóŸ›W¿úÕÍGѺ»ðÓßøÆÅ2^J^žÿñ4O{ÚÓ ïÚƒô æè£n¸¹yÌcãR‘‡>ô¡Í­nu«ò{É’%ÍÃþðæðÃ/‡›Üä&W„ê?x™ábç|`q½#$W7&ö)‰@"$‰@"$‰Àb#À¹ÁèäW¿úUãÝøv·»Ýbg» ô£ÌVÈ.¦ ¯‘õ¾f!Ë«ÜuY$]ëZת/R±eÕ°ïuì#Q—Š8âÉ*¯k_ûÚ•Ñ—^zé }’÷ïÿûeRþæ7¿9ÿ´ÌÍ^èËTßùÎwîSɳžõ¬&ÜŸÂàãÿxsæ™gÎG·"û±}lã{qœ ÷Ûo¿BÔÞâ·hvÚi§qQÊ}øýò—¿œÛ>Ÿ¿1ã¾g³™˜ K;î¸ã|°øèG?Úüᘿ ßÌÜÊÖ2I¿g­¯³¨ë4:P—¿ïwè0¢^8‚Qc«É xýíokÖZk­¾d;¯G^7§¸8 \§Èvâ(¿øÅ/„µèk~ô£›úù€§o|íS¸æškÆ¥¥ŽÈ|üÏ]ïz׉Ûb©„Z'öK<î¸ãJà&yRÁWIcë­·ž4jgx<Ûï~÷»2†qÁœ2[f­G‹5VMRëZWvýYÜ7ÈIP] Ãúp00_ ñ0 _ðaYÿÖ·¾u™¬¼€Ô$}\ —7ÒBÔs›CøÎ¯eÈK\>'‰@"$‰@"$Ó"`•hû½vÚ´–W¼•±Ì°YQÊícŸøî˜•u)RìyÏ{^³Új«•´òÏ ¢Eû.2™ñéOºyqSŠ0«Å $ýýïÿ²7šïGÒÈì—¼ä%K¹Q­ãÅï/~ñ‹…¤¯Ý­Æ½íÈ:ü³Ÿýlç>nÊj"Ãw6÷²w¼ã–—pøÌg>Ó<ÿùÏ/:Q§¥_DyfqœF‡ìMCÒO“×,êxe§¡?"é¯ýëƒJ¤;œÕ¼É4$;A¸#éÑ÷º×½Š›áÏþóEí[Ø%±*æ¶·½íL‰z“¾VÛ˜<†¨ï*ëB®á¢vÙe—y#Õ…¤•q—E`±ôhÙœ®œ++»þ$Q?¥Þ é_ô¢5w»ÛÝʋߔÉôF“¾—†˜‘ä ‡<á OXŠl÷Àä"§Ëøj‘Y{íµç/·—ïÕçAЛ­®Å‹JJ"$‰@"$‰@"°Ø°†ò~'‘Ÿ÷SÆ'È×6Ú¨¹Ñn·ÊAiO%ï­,ðÅ÷h¸M`±èõÙb‹-J$1Ë}÷XÖ¶ß±‘N¬$¹/·ôké*s]ÖÍ&®{ÝëÖÑŠ¹|Y-#€ov³›-Cœ².š.Q.d¬2Êcã7^ŠÈ‰º#¤…DBM[n߬¥á²ÞzëËe$qH”GÝì¥-Â)´Ú`5Ö(Ø×D}´ 7Ùd“¥Úù¥-âFšÚß÷Ôît§’…rÒƒ¶>EþÊJ·Xó«Ãæ›o·–:*©‰úQ8h z\·×1òaÁº94ä <}ïÁR+¯|ì1Æ€Œ;$¤`-ꯛÞô¦eϲ¯}ík…´^ýõëàKýVçSN9¥Û°ž•h#„¹þW[ËïÜsÏ-e¥›ú;=Uᑈt@añm ÂÅlXÑ¿ë]ïZª¸úÑ~ðƒÒ±œþneU~V^®~чÿR™Ly2Tª0‚Ÿv¤Ót€u<¢UÝk¢Õô£ý¨\sO\Rë°sýX;[ÁßzlÕ7àý"ÆãŒqTœÍ6Ûl;é‘®¼fÑ_¤ /nQŒÚߨjœ%£ÆãÐ5uô ÑŸŒ+ÆúÐ iGÝ݃«4éž>Yad©?g‰I2ºç^õtÕxü‡<¤Ä…!¬…sl÷K}_? 
?ýéOK¸óï<Ÿï¸g”UÆ uU~ãÝ/ž‡Œ6õ:Ð~–ŒçþÓ´­°ê§Ü]¢­ÕW¶²E;Á(Æ@ØÆªñ•CºÆFÆIzZë¢00Ðÿ=+éà.¾þáÞPѦž‘t+ž¥]ÏmXÒî÷.Œ´¼ôÏè—êg<×¶ð6yhR9t»n§(¿<‡<ëûúçBõhèXåí;F_j¿“¯Œê(/ýƵtá۾ߥ?u˜ùwõS´Nô6d}Á ^0E ££è¨oxÃJ m¶Ù¦ãŃ¢ŒC(·‡r-üCqËò­o}«ü48ƒJ-ă%Rÿ _øBy!3Ì-LID HD H #|Ćøà÷^ë£Ü;12òñ|³á†– Þ¡ßÿþ÷—`º,Ÿ C— ®XÓ"È%a¸fØtÓM éñŽ?þøâ:‰GøYþú׿^~ûçþÝï~÷æž÷¼çüµv™»Ê£ì|ä#çIÅc=¶ä/÷äÁµˆzùÀB8Ülº¦¼·Zàá]݇­0ιȬñQwu¿öˆ˜ @¥µä ¬‡”Û·ƒ•Å$òó±ýä'?¹/ò?è ƒŠë‚¸¯½|Ûl»í¶%^×?äc´gÜWn„I´z=õ©OÇ€µ¸ö«‰z–ªþ‚˜–÷ npƒ^¢ž«ï*¢¼Ú»‹¤BàÅD‚°ãp@x!©j¢Ù¬Nˆó ê‡äÏÏuøÙL}³=ýéO/i=ãÏ($âmÑ.m3Ü&©OŸ CX§#|èû,‰ú .¸ à ÕøÈÏ÷ª¶C !GÄ ~¡ÿÒ5ß¿¡[Ûo¿}!aÔ–˜üal‡ðãZCþú܃üू·ûó(üÛ.œ–JhàÉ0n½ï}ï›ãÔÑúÌg>³è’sòœçWä; ÃUn­ÃÚ—¨k=¶ÒÿûÞ÷¾ÍV[mUîÃÞAÔK/ÆQ´Á1sûM œïp‡;”8þÕy9ŸU‘ÖX&ã´³¶1þp÷.7®…®ãô£oIWݵ7Ò›¨»1Y½ÕŸ[=sê1$žGÑßµþ³µ®š¸BÔ'ÚD½6BÝø¦ŒÁ {Fi‹³õ/˜òR¢Oè'ƧºüåæÜ?:óÁ~°¸&Snõ7æ8Ö"?pôËÄÒ“žô¤2Á¯˜ §rza¬4¢¾®cä¬÷œ }Õ?ä1Nô1ºE_Œ Ž_ýêW+âÙT—Czí÷eç2ÚåT:ƒp¦ nÕÏ”úÓŸæŸožCâh“ÐAp;ϹC"pò¬<"NÝ?¢GÒ2V 7NÔ¿ë$ÜAvÒѧ¼;‘!:Ø¥?%òJò/‰ú *Hzâ®»î:aìîà^x¼0S&³>þ¼8=å)O™ÐÅ ¤%Š:¼cƒ³çe7šr0³ûîw¿» úÂè ø1|˹YGÀ{ßûÞfË-·,ƒ¦ÛZ^üâ—É‚W½êU%žå[Ay§$‰@"$‰@"$‹‰Bá‚ 2 Q‡pô±$r!>ž‘Þ§¹i@Úú(>à€Š‹ïíAàøðFZyŸFzpâÙ‡07‘H)®>ÈBeA #ƒT@œ»¶dŽÜ‚¸.3l¢T>r}¬³S~ïùücû°–&+g Ã߈K$”ò!¢Ì5þ$=«Mï÷ÞÝ‘ Êà½^ÚDzˆLß>‚ɤåF0ØŒU™|¯ s}6 õáÍ3bác")ª]l‹¬@L…•i)Àÿà#Lkñýļßê…zá _X›ú·o/ßFô!ÚA=#AªEâ¬l‘ºÆãpˆx£ŽCòGÎÐË ¥8HYß‚â×ähW^mÂM@Ô!ÀºÚ"Ò ‡tþ¡/qo¡G«V>÷¹Ï샨GÊ/,Ž#“ ô˜hýÕ䌽Ù|7#Å|óú.nû™4YÄaëIu3&øN©ûÅBñ4ûŽCt@\ú¦¾1êc&Äô-×ô-$"#»hWXéÿQ·Z‡£<&-b\пmâ©›"G:´Ã;ëgñµ \k]¬óšU‰üYT‡«"uddh3ÉG=êQd©ã¸ñ8Óäµ¾aâ) w“!,³±ç·IÆÆ•þ÷¹…ßÑ6ž}ÆnmF?ÞùÎw–¾‡ïc»±úÞ÷¾wY•cÌŽ*âhωÆ<”þfRÕóÍsÁXfÉY²&ã•vñÜ4=/㙋°Eðª“?í×qý#Òu”ÝŽ ~ã‘þ€Üö®a\‡µÉd¼Y<õYeëÚˆ[¸g?ûÙå9Š—¿÷Þ;à¤5ÌéÊÐgý¨þ¹=:V¸¾×~'1véwøV“…&¸uò4DëÍ1s/f–ìyi¯ÅK†Å{Þóž2px=ÄBëÇõÏv9†ÖmÈXÕ•vßµö;‰‰AzéL¿´³•$&Z‡è`_~+Ëõ´¨Ÿ ¥t 3´|ØÍB Ò¬j†Š¥‘ÈzƒGÌ’×/º–¦Ez^4…ñ¢O¹k1°"áÍj|t⶘qÖ1þŽ$ÒŽvœ†«!ùÆŠtß¶¾Yëw®·û…tŠ? –:­ÁÏC»mÁÓ?¯%‰@"$‰@"$Ó $uXI#ˆv$M-qÞ÷Î[‡íû F.´]*øàwòv™ãã“åwŸ(SXˆÕaĉuÄq-ÒtdŽßA 9j"ˆqZ„LŸLSî ;•¡.Sä¡<Ê2Ī:â8²ÂŒ …úzè{ݦõǼ¸aY§Ó÷›îtµCèNÄc%©^ñ5‡ˆ×.Kû|Hþ°äZDXÖ–¾Éè㪡B;ì°B´Ð“˜pèŠpDÈÈOœØ#\~\¥(×í: ñúŽt“Ž#ë᪞A*÷ÅYÈõ «âi1zÐî³Àæm]Žþ=D„!Ú­¶:6ÑËÀÜ=“&ËŸ®Çä`[‡£þír™(%u‹°Cí¼fÕ_"ýS½C¢í‰O÷cœŠ0§ëØî÷'ð'ð©ã»ÖÆ ¢ÿÀŸó«Å ÔNµÄy´Y}¯ï÷g”Ilî£L àv¬^AÚ[­+1úÒëÊlLâºþo| ¬LòÕ“©t50ÒvVdpåÆZK“d5F‘ž£¾ý³¾¿#Ïqý#Â÷•Ïê%c 2›¸3²Š‡µò+k¬Þ‰|Œ!íçTÜz l`7é³~Hãê& 8wµC»ß ɯ+ Ì_ôŸ{%ã—gNì¡0T»Ò^™®]î{ee*q–u¹ à…ËÃÄ@lÉ#±üäo|cïà¹\ –™$‰@"$‰@"\¥hˆ²È/ȉv™…u_þF>Þ«‘« $¥÷°4uŠkѰt­ë%½X~/‘–÷%sÖÈQƨsX _réÿÓ”;ð@`„(÷œ±Ù)Rˆu+r"D^êÞeuÊÍòGùÛR×ݽ8lÖòª†CE:ívPžÚš^Z\FDÝÇïQ8 5`“ âÕíí|Hþ&1ȶÛn[6WdEÚeYõü³¯‚U ¬ÝÃeI;(½$0¥ƒHdüigðr$ÚZê¿ ãJ€1ÿˈq>øÕ©Ï¢vL2ƒnG›µõ9Dâ~»_ÌcB‘ßAbú=NcŒƒ{ôqG×Å~ÎÚÙØc:[“úmÐÔ—û°v‹ë“Ûy¶ í/Q†(£sz‹€†§º8ÅÒñ¸DšûÏŒ8|jâÜØP»¦Ñ§•¡^Ñ…ÌüØÇ>V& ìО@0^³¦æ§&þYr“xÔõˆ29Ödy`;êe2n’-\#‡îGÚ]„kŒ ÂX…c¢g?—Z¸!"m]E`»B7Õßu:\ëj„‰c ‡öˆÛwô|2NjK®uvÙe—Ò†1y7kûH‡e8Ÿ½ff!Ó>ëûòžT†ŒU}y ¹Vý<{˜”6ÆÕ:#!:Ø•W­Çî;ïJcy]û÷äòÊ1óY)ð€³ Ï_J"$‰@"$‰@"°¼@É<4Oï®6C„ÛXY…Œ³|áQ“,CÓŒp\x°æ²¡œwäŠ|Ã,ì|Ðw•ÙG¦ÍS=ôÐR>ª#14üÑJyÍß6"Ç}„Å6ÛlS²÷QŽ|±·"M=|ü‡û€(£eùê­ŒòEPÙTò /llØ'Ó”›õ1L•U>Î5&àCø‘G °¼æ^AÙ•Ç7B¥-ˆ²¤ƒ¨÷áŽä°ªÙ$dn_ˆ×Yá £,m’½$Þó§ºD; ‹µKMpeОH‚=V:@?•«í¶dHþA*—z#¨Ž™Û¯l¨˜@áÎA?W}C¸~‘~ld›1Û(¸-ö7£ƒ;í´SûÖÔçÈ9›3Úˆt”EíÔT•ÝØ€,†ã&›lR\P±˜5©ú×î Å¿*BçÏ!: íø¼¦?6/5YÃúúøã/>®Ã‰±ÅïгpAÔ¥ÃQ:Á…5¶h¤ê¨U·ëؕ׬úKäG§•Ûd’I ¿ë=þì×g‚ !­­ÇÇ‘®ñ•»}IoœÿÆØ`lã§ÝxmlÓ>a™ÃøÃe A4Òí*ÆDbÒÌÆØÆc¸É ã}ˆgV»ñl”§1IûyFÙÈ•¾ÀÑäz¸5¶âNVa°ö§_®µÇåd‰c»g }©Å˜,®ºÜ÷Ì3Æs!¤¿‡^© =3™d§O¢K=g´‘v7¦:Ú?úÒëHå´²Çø }{NC°¶JA[z&Ù˜žŽ¨ç¤“«Q¦ú8í³¾NÃïiõ(ÚaÔó²×$çÆ B_õ;ý¯~^¹7D…«EŸÜo¿ýŠQƒ½^¼í¿ÿþeÌç5ÅÄŠUkÚ¼o?:½ÅþDýb#œé'‰@"$‰@"$‰À <—øɇy-–Ùû ¶¤Ÿ• ûKæˆ_]í°u¼q¿>ìŽ<òÈâRxÜu}efýå k<–v6¿#ö¿ŠtÄ$€M•› ulÒŠœˆUÄ=¢'V½ ‡ â>yƒ Ò²B6\N”‹Õ¿iË KDíá‡^È@¤•kH>¿퇄B"}ôÑåšò°’C$’ºM80 rª¸âŸåïêŠ@GÚ|ЇÈi„`ó‡¸ ×fÔQûî¸ãŽ…Švß„BX^› µ^*Ë8`ÂùÆêVÝ]C¢†ôüÕ‰>±DijÂÝä˜1.üÏ›ÌkKpdn¼A’..7æþ!ßU1vè‡\}Áƒ´1,[ÿBŸ…íÊËõYô—ÈÖø‹Ì£Óú•ÉM!uy\Ó_FÇ/w£tC™ñ¡3®°©Ç6ã:=!Æû°Öq;âJ3ˆzä/’ùÄO,­Â,™ûë 
‡v=äa"Fûk/þÓÇ=£¤«=0#Êa,в¸¦,ÆaäÃú^¸ZøÆdeæ{Ÿ8¯ÉzqLø™<6Y ¹Yó×brÃ*‚˜ä¨ïÕ¿ž—VsŦÝ=/M ¡ý£N·ýÛÞö ¨Çí¬C°¶"É´Æ}«q`i‚bŸ}öig×ymÞusšg½t"ÍhÏiõhèXÕUö!׌Ež™ÞáLÊëgˆy“ÔQö!:ØÎ+âÆdIãzàÇvüå}¾ÊÜËÆ¿×ž ÌÝKVTh`”‰‚±¸¨—N¹ ÞêRþLD HD HÄäèÏ»9ÒÁ_X†Å5zˆŒY ¢ù°óÎ;KíiÓE¶!¨gýÍb™¾º×ä÷2+4aî¿Oä­ ‚xë ßQãÜÌ¢ÜôƒÅdø€î*“òø(Ò¯+ Km–t&ú†¬žƒèo‡ŠO;^}n’Húí²"‡X î¶ÛnuðùßãpÐn¬_Çéd_þ‘QäCƒP ƒ‹‘n”E:ÒKÖ²&dê¾2ª_,6þð§á¸þáâ8N‡…ÒW#½QÇqyŽ}ãÆÐþ2*÷H×3 k<æz‰õ3ÿätYÛÅ{í½÷ÞebÉXÕõ<…Kß=e=ûì³Ë„Q{ì髌ܫ7•þ¸2¶}ØKÓó¨.G߸Ѕc»ŽÒëòvèù±¾Ý?L™Œ%ÚN†”»kcˆÍoM|ÄØbòƆծYq6 ú¬•×´z$Í®±Š¥zÛ5_;:k’h”„žŽ{fÑÁÈG›ÖãA[¯Ûço1Þ+L{w%¿'¶¨7c7Gî/fYËköØ‹ši&ž$‰@"$‰@"$W*aYËŸ§-L 1mzyQ®)ó¸ò´É–ȯ>vå]ßßA ÄyßqåöÑ;®nãʃè÷Á¿dÎ’t”ŒËg(>£òè›aá~—»âÃÁ‡÷¸òK·/ÿÈs\>nÚãÕ…¤‡½ ÷5^£úÅbã¯ãt Ê:DŸ"¬ã8f\_fˆŒËkŽCû˨tÜë“q؉kÅÓ8™Å˜#ù…{¥vž}õ€Q—Œ+Ó8lå'ZúÆ…q8Jc(–u~ã~«£øí²YUdUÖ(1ÆQ?¤Ü}å0áÎh˜ë$Ü)^“e=vÓM7U„‰îÍ¢¿N«G Ú5VYÅ•Ø(í8¢~œžFúívŽë]GùÖÒÖëöyvyÿîîÝ#Ja–‰P45³K)Ì(˜½L¢~Öèfz‰@"$‰@"$‰ÀŠ‹‹'KŸW&YË ß¥ÜVeðMÌÍŠ(,ûÜ6¤\õXQúÅ,‘^ž:¼<óš%Fø'ãиISdnpb³Ì?ÓZ\–çþ‹È~~ϹbÉO§¸káVßyUÏÉ|V.¼…W™›á™Èõ ÿSf:ꨥv§^xQ.OŸ.›Ø‚¿…Hº¾Yz7HD HDàê„‚Š8Zìè/ÜÜ `ÛõÍÕ ï¬k"$‰@"$‰ÀÕ>×7«^=áÈZ'‰@"$‰@"$‰@"$‰@"$‰@"$+IÔ¯í¥HD HD HD HD HD ¸š"DýÕ´á³Ú‰@"$‰@"$‰@"$‰@"$‰@"$+IÔ¯í¥HD HD HD HD HD ¸š"DýÕ´á³Ú‰@"$‰@"$‰@"$‰@"$‰@"$+IÔOØçœs΄12x"$‰@"$‰@"$‰@"$‰@"$‰@"ÐÀ GÔ_vÙeý¥½’ïœyæ™ÍÎ;ïÜ|ûÛßžII.¹ä’æôÓOo.¾øâ™¤·Ø‰(ëž{îÙÀ!%HD HD HD HD HDàÿ³w'ð×USÀOƒ©%¢A%QiQ’ˆ’B™3T(E¡”!óYHhPªW#J¥4GˆŠ”¨DQDÞç»ßÖÿÝÏyν÷ü§§gXëó¹÷ÜsÎ>{¯ýÛùç·Ö^gj˜ãˆú믿¾¹ì²Ëšÿüç?SSÃ)Ìe饗nvß}÷æ½ï}osÑEM:çë®»®y÷»ßÝ|ÿûߟt^³#« .¿üòæöÛoŸÅe‰@"$‰@"$‰@"$‰@"$‰@"Ì,|oÔr‘EiýèG7K,±D³è¢‹6wß}wó·¿ý­¹á†ʇçöœ*›m¶Y³à‚ ²~¿ýökÖ]wÝ9UÕÔ+HD HD HD HD HD`.@`¶õË,³L³ÖZkAó¯ý«¹ÿýïß<àh–\rÉfå•W.æsr8˜M7ÝtŒ¬ç]¿öÚkÕg:~üå/)ávn¼ñÆæ‘|d)ïa{X)Šwû\Ь·Þz¿(ß±ûÝï~åø]wÝÕœyæ™Ík¬Ñüõ¯m.½ôÒF;<ñ‰Ol^xæ. ¬K.¹¤ùÝï~׬°Â Í:ë¬SÚ&òµµÚÁŠ‚+¯¼²yÄ#Ѭ¿þúÍb‹-V'É߉@"$‰@"$‰@"$‰@"$‰@"$=˜™¥íyÑD“I/6ûUW]ե“ž‡úâ‹/Þ,µÔRÅÓ~à 7œãÉúM6Ùd&²~Í5ל(,C¯ƒ“¸ð ÷¹Ï}Êöè£nöÝwßfÅWln¹å–樣Žjô ÍDÔ{ì±ÍƒüàBÔ#ç¥9çœsÊŠx#Û¿üå/7ûØÇš‡?üáE‡ŸÿüçÍþûï_~Gù σ9âˆ#JœúHsÜqÇ5þð‡›‡<ä!‘$·‰@"$‰@"$‰@"$‰@"$‰@"$=˜m1êÅwçI¤“ý·¿ýmñîöòØÿûßÍÿøÇQ|ÅWnd=Oû9Yžþô§——˾óïœ65Ï:ë¬B¸ô£m¾úÕ¯–;Hûï|ç;ã.ÆõC9¤Ùu×] Yî¹ç–|L>ñ‰OÂÿSŸúTs衇6oyË[J}ýë_Ÿ©¬¿ÿýïÍÐ|ðÁÍV[mÕÜqÇÍùçŸ?SšÜID HD HD HD HD è‡Àl!êy^?á O(„<’)Ü%âÖÿùÏn®¾úêBÖ¯´ÒJ]Éæ˜c7ß|sóo|£yö³Ÿ=m:í´ÓN â\È#^îBÓŒWžò”§4 & ,°@³Á”÷üô§?-ÙcÃË~‡v(ïpPHŸw¼ãÕµ çyëk×-¶Ø¢œºöÚkë$ù;HD HD HD HD HD '³…¨6EØÄö ’~Ùe—mV]uÕ¢v¾Ž!ƒçDQ—7¿ùÍÍ“Ÿüäf—]v™6o»í¶æsŸû\ó†7¼¡èˆt¡l¯-T‹8÷âׯÒ6ެ²Ê*³Äà¯óñb`±ðyù§$‰@"$‰@"$swß}w ÉIhNºþò—¿lN9唲ºØ~—X‰l²p–sòû¼ºtcôÖÓ{¶¦J¾ýío7üà›A¸MU9™Ï䘎ö¯Vm²ÿŒAï´Ûo¿ý¦|E=§AaycÎî[ŽHxàø*1#µùôúë¯oN?ýôò>@QRD ˜Xxv(Áûš½B¼8ö±}lsÙe—/n1ÒÅx7)ßzë­%ð*H`iç´I3Hz/c}Ó›ÞTô®¯÷½ï}%ýFmÔ<þñ/ÞôŸüä'›>ð3Ù&Ê…DùÓüЇ>tè¥sªñd¨Òy2HD HD`¾A sùå—7+¬°Â]gÏïyÏ{fzÞñÞ'ïˆâTâ½UH¬p²së­·n¶Ùf›H2Ëö„N(ïj¯Ž%á8üà?(aL_úÒ—Žóʦ¬ Ö.QqgÐqÁE]4VuÔsLÇåyhSÙ¬ Ÿêö¡þ,§Û:Ìmýçšk®)Æ:«Üïþþ‡?ü¡ùÍo~Ó\zé¥ÍSŸúÔYðèü~Hҷ׫<Ž™Á0œ]‹ãØsÏ=›ÕW_}dVŒ wÞyç,éD.À_ÉŸáŸÕsŸ²âýƒmGú…Z¨8C¶¯åÐ §å—_~&‡V÷;Æ[eÓ¡ùÓ—se¤«Ï׿éä‚’g}Ì9ù9ç¸òºô²„³îââ|”Uo»êQŸÏ߉À¼ŽÀ„‰z“Ã(¯/‹5€M „U”¸n±ÅkÖYgFèánxˆ/·Ürå·A+mxkÏID}ôë®»n³ûî»—úL×—É–qCY¯}íkK1&}b2$Âà_ýêWc!jà¿ñܸ÷¸Ç•|üñŽ›£vðÂYí÷ªW½ªœÏ¯D HD HD ˜¼‡Êÿû·¿ýíÍÊ+¯\—¼êK_úR³ï¾û–B<;Ya‹ìð)NNˆúO<±¬|dŒøÞ÷¾WBZN5Qÿ“Ÿü¤ùÑ~ÔL„¨ŸÔfÎNžÆóì3s¹7 éêGÃÊœçæ¶þƒ¬Õ&œï>7ðn»%–XbZ›i:Ë9üðà Io3?Âtÿý÷o>ýéO7_ùÊWf!»Û5ÿ™ƒ»¢}Ž?þøAøáá†?þñ—ðÁ/xÁ š¯}íkÍ…^§Ç¶x±w½ë]cûÇ{lsæ™gÎÅà¹Ï}n1ÖJ„s{ë[ßÚì¸ãŽ%ÄñØ…3~œtÒIÍ©§žZî)êiõÑ Yo½õšwÞ¹œ¶Ú€Ó¨h ïÿûgº¤«þ8#ïp|Ñ‹^TÒŠÁðñêW¿ºñþɶÄùöqûêüT×ù<–Ì댛¨7 x_û#9J~ýë_—I#<&ó&V=’IqµÕV+K8íÀþh²Ðò$!AL*kvœgxîFìv“ÎTÉÅ_ÜXAPË£õ¨Bг&þìg?kÎ8ãŒbÍüÖ·¾U,˜‘–§IÑdéF-\4ã¸ûX:æaYÿãÿ¸¹ä’K7‘”D HD HD`v!Àc#Š0<ò<3¬¸âŠ¥xÏ\pAƒTXrÉ%ÇTrÌÿbÇkùýïßpFáè²þúë7Ë,³L}z–ßÃÊ–Xè™Å_¼üï>÷Üs‹'#$!$Ûñ†n(§o¼qñ ¾âŠ+J¹t‹•±•8å \¼Ï‹H/ô!gŸ}vqÂá8ã9„ì¶Ûnå¹áÄû¾Î>®±õü"¤Îcó˜±ç7Ïfž/àk•3Rϵ ãépßûÞ·œ—ŽœuÖY <‰|=<éIO*û]_¼b=S Ò½#«K8!°xèz¶eÚÔù¶ç®úÁžž9oºé¦1§%e¨#O[¸#š¬Þ®ûŠ4êÁiLþpõ5^ 
ÏbåÁXÿª¥Ö7ÞxcïÀˆ—ª~ÿÌg>³x¬ªÇ-·ÜÒ¬µÖZÅÃÉMøUÏnê`Uº~·#¡ç-ÏšœÍÖXcB~…wkôѧ=íi¥Í?Òšç¤5W´çšºÜúw»çŒUícÜÊG߈˜kðñ®AÜ–ßßüæ7Ë|Ò<¨Œ×¿þõcó¹4áy./möõ¯½ô<˜ú0À𬆠qÛu×]c·lÃÓŽÐ<§vZ™?7Ûl³Roû'Ÿ|rI[ç5*ª‚>„a4€aÌÍîk!æXbnÕwÜ[ÛòÆ7¾±8àŠŠ¡¿~÷»ß-¸Ê/æ™ö5±çŸõ¬g•¹)ŽÛr NIægÆMÔ›ÔLÆ}$&aä>ñçˆeŽ|&Z70bâr³1¹¸Ž7½ßm»$¾—¾Ü‘ÖñÇ`²jÄääf7ÜÈS˜žôþ„ö³Ÿ-ÖVçüùs#®oÆ{ì±G±þ²–,8"ÿ:mIpÏ—óuš½÷Þ»ùÌg>SŒPœÛtÓMÇn$ƒò‘ݰsu™ù;HD HD †€ÿ¡<øhþÚ?æ˜c ¡½ÝvÛÒò¨£Ž*+Kkò•סÿë5QÏé¡ùãñ’—¼¤yÎsžÓ©Â¨²]tôÑG—gä)q2Uሠ¦£ç^ŒÏ;‡zhóá¸n‹.ºhñL¥s-þÇפÈu×]WNמñ=äR<[Õ×{öâÍ©<¿·Øb‹’·×^{–NÏ^<ûƒ$Bxzy¿Ï}Ï@>ê‰|#òEN"ê‘ìˆ)×ò­Åó ¶F¾EvØae…RI+äbà¤|«<§hk„Ò1ðAd{®A„îò²ÔóÑWjç&û[n¹å¸žóÄ6×/µ•ü9X3|rÈ!%=]#m®nú7ï\¨ú#©µ)Yd mÔQ}8“!| ì„K"¿öbØñì}”± á/üxJêSŸ*ïE›L?êÓþE¹_Qã%œ=#ÕõUxÆ&ù0XôÑ¡î?ÆÀÿüÏÿ”>%o8qÄ…„ÜgŸ}Fhúÿ§Gõ®>êêw¼ãwÄua\d˜$ú.R—Q‡¨Û /ô¡rwˆªåT»:è ¢¿ßÊÑ?'âÙ/ÜX=ÇÊMô³¾Âë; ¯õ5úw š èü¶·½­öƯù·æNè1È{œå7¾ñ2¯ð–yå+_Y"QÀ\¤¾ÒUƒT»|ó²9‰o~` Ûx†¹- ¸êo¼1Ü08˜Ëƒøo§ïÚ_zé¥g)¿+]Kæ'ÆMÔ›ÌY¥ûˆk•Þ ”¥ÖÄ熳ü ¢ÞW“Uˆ4$þÀù㩼9Ex”LI¯N&4ˆ pýÑ3±†ñ£¾ÆÍ|X7¯®rÞûÞ÷ÖÙ”¼ý©…9Ÿðd‰DþtåcyWJ"$‰@"$‰@"0yä‘…¤G ?ÿùÏ/„+â‰9^J$R©ë¹Ãÿ_$2Ó Ö¶Œ*›G"AH=ãÏh^þò—ç"¡i`fj¯jNˆyd„Æ"©„àɈd« òå)êˆ7uˆ˜Úžà€ÊjÞaÏž%œG.#â=WXAý…/|!Ô,[mØyÅ+^QêŒL¤›r\Ú ®>a\9ï¼óʵ5Fu¦Èi$ŸpBÈþx€ÚÔÖNÈ4¢NÈUd‡©Q‚ÈbÀàÕŽôÓ6Œ@ˆ3åiƒQX3ô éµìyÝ"a9NµÅóƒˆPòW7¤úk^óšfãš~ŽäEjëÛúº1íÏC—QFÿ>î¸ãš¾ð…cE¨‹gI}ˆ0ïq«-&Úú¶ÿ˜#~¨¿¶Üa‡Æž‡9ô/X—¼Ëµƒ‰¾?ŒG}‘±È|ƒÀä)¬¯õéªÑ§o0Œé£Èùê H]Ç¿øÅ/ŽÕIoþ2×Äø—^Øäo_jx‘ôú½U Æ›vÆ;0TŒÂµÎ«þ­ÿDŸsœŽxý™‹Ð7ÎO…H¯ðP7£œå­¿êóã!וk\Ôù¸'Ä_ŸŸÈo}‰aÙg|YY€KŠ•}ò4÷“˜ûêk´!§Öšô¯ÏOæ7'R÷ Þî<ê‘õæ™A¢Ï« CÓ ´y<HF#ðÿÿèF§˜‚e7,÷u"ïþðYÖe¹žð6,rnþT¶…•ߟFZ^4)ÿ‡@x¸ ãOša×Ç9Ú$}œËm"$‰@"$‰@"0XÁ‹èD $Œð“ÂÔ„t©Aè"‹vÚi§B#£»ˆú¾e#Î…¢A’Љ·6GGMÔ#fô©†pC„"éÚ8z+iýGð„ »¼6#<’©MøÇµõ–§¾2„Oð¼El7Úh£BF"[|ˆç7ä!B«•PNöøBô"W«ñ\h>ÿüógZI oøÐ‰ ^_ö²—BÏÊk˜yÆáàdÂÙ¸Öê —ÖH¥S¢/!A”žMC¢+úb}¬/ùiõ7ìaÅK>ÛkÄ%A¶#é Ç(¡HFÚcê#l#zXèH|"”ºŠ SÂË™W³:ÖD½0%Ñ…aÒ?…UEÔwIŸºõmÿ®üƒSím,×c"È͉êÀ8ÿX¡c 0ÈG†õ }TÛY}My$=Ã2š˜£bþOùÃÒZ‚¤'Ʀ>‚L‡OèQ¸Ê›a¯&À80ŒÉûùÏ~Ðå#ë·„&úÌÌ “sñ>ð¢ë.»ì2®¬9ÀâBó©9/Ä|Ïp„Ä6Æßm1¿ÇûHâœ6×?"ú'×¶p"õ™ÁÕ™g̳îŒ.V2µÃJ1º7 }£½ÏÖW/ãŒQ,Äœ\—ãxnù )!êY3ý9d…ögÉ Ó$æÆ€¬7éÐ<Å Fñ°Lb¿!éM¸&Í”D HD HD ˜?@[[“ÝjŽ rI8–¾‚¨¯%öƒ©Ïõ);Ò#ÐjOFÄB”Wo-œj‰ò‘»µç!}d‘|ö›á]D”k‘#]úFèš«ËjÿŽ<µ>mA"„à̱ʋkÕI[XÙ0§ («]„v„ü„·ØâáÖy 2yߊÍ»Iì¤`—D¹s¤Ad±å#~ÛÒn¿öùØ×öÈ[aQ¢½„¡Pvè1 k†ÒŽouy›¨ÚÅ'Äósëq i†–px³2¡-úZ-µq<ÓÃyô©[¤Öþƒòï:®îõ˜Áiµ7£ÞQHb;Á¼¶qHÚŒ-… ùÕ7¢µÉWcƒÔD}¹†7îSí±mD/„ú(\}'Î[ýa5‘U4x`1f!ª…ÊŠëméÇS÷{X×1 ºvÐqŽ£V9 ’í±8èº8nç´Òö–7#”sôËH[}«íôc2î1 ³KŒ%N¶ {ŒJ«®ºj)úì³ÏžÉÀç {…6‘Îx³*Á=ʬ¾¢®Œl!ãkqMny )!êM>–Ò ’˜Xâϲ>¬Ç&¯˜€\$}LhƒòÌã‰@"$‰@"$‰@"0o!à¹!'Ê(‰°™‘.ž9bß¶³i?ž1º<ÔÇSv—C²¢o”§lúÔécÈ^¤ÑûÞ÷¾1ôÿ»âÿ¼¥y_Ë¿&+4"}‰(¡Ìó1Äó² Ñžæ¼·…šðA¾ð°¶¢~n‹ë»¶áí¯¾5ÙõwMä¥N^Ê¢}£›W*¢¾vä)<Ó‡s8ˆ!ævß}÷(²ládþL'ìðz×›·¬pÂÖø þú`eqj« á> Ø^‡#ˆ÷1ðŒ QG}f2Ò§n}Ú2:¸ÖXa¬âñ‹[Pæ'?ùÉ1ì'£CZ«Æ@›Ü¥û°¾zµç“mŸQºE9‘.ö£ÏŒÂ5®µexòŪaköÊ«çŸQùÄyú1&¶¥cûü }<”ðTbé[Mƒ£¯l³Í6cý­ëZá±Ü§Ìêo.m×]xñ‹_Üuy1n8AÇaúqœ%Æw[àçÛçºöi‰ÐZ>!ø¾z%Žã “1Ø×Þæêñõ°áÈ›’$ÿÀ‚ÿÿsâ¿Lr& AŸ¸É)YÏÇ"Ê+Â9:Ýd-gr®ý‡vâšå•‰@"$‰@"$‰@"07!À£˜Gp¼D‘î<üxg†W¥cB5„ x¿¶…·­g„3Yi¥•âPñÒŽQeG:q{ã¥ªŽ‰õ®œ6¡'LK-Ç×K 1†`ñ’Ï oëkBדN:iì02—ãòUH„./蚸‘7Rßó—+ñ¡‹kC`#›‘…BÍÐ QÎpÒöZm—WNÂ;¸ö W.B»Þ¼¼ÃBŸ è#¢®ˆ.xòb­É¡Hgk¥ƒz(×3jˆðâ3†„Ãï×(S¸×Õ×ĵ][Ï«p²E }äC<ÓöÁ:VTzè¡c+Éõó ʺÊí{,B©OÔÑ–!£oë²ÆÛú¶]Æx~7úªv{ík_[ˆÐh× %'ªƒ¼…h1CdÜ!îûʨ¾}” –˜›Úý¿NSÿž(g‚4¯ù™£Æb\k†ý¶r÷£„ƒÖ‹Dè)¿Ûsˆcƒ„~æ˜zÞ7·ç¥A×·3<2€FÿiŸŸì¾0cæFEõ7Þ½6ÂSõÉ?<ü…†i[óÆ{ìQŽ›2„j‹:ês}Å;O… ‹O¼°¶Æ¾+?:ê[uÿêJ—ÇD`8SâQ?¼ˆYÏòpð'$%HD HD HaV„ìðRËï‘G¼ô<ü#•# b ‰‹ÌðrÐ.A0ËgóÍ7/‰†"¡#4ò—àÖ[oÝðŽUv”Aùn¹å–…l¢¢wã7Ž$e+ú"sÂZ vy“#åp^±aY\,ä ]yžrÊ)åe§òêF\_å!€H»Ž!ñåç%§^¢*¼ÒÝ O½üV™H£ã?¾ä)ŒbM¼jD/¯Hõd\@ÜD8¡Maï”H(Xªƒ—["Ó¼œ’qäâ$]-Ï{ÞóJy‹h#åÑ—‘Æ nCè-N2S}˜00 ¶ô!a6fRÞ µá§E>æ:ûpwaѧ•½÷£¸Xµ@ç:{¬{9¼6$©¿¹Ïxag¼ÔèÂÈUcä8ŒCºÎ+#VxDºÜ&ó£ï4óY×D HD HD ¸W@""5Ž9æ˜æ°Ã+º §½P9Cx"rÃË^áÎyÜA‚!• ÏB±§ƒ8ˆt±íS¶|eÂÅ Pƒb¯£X,udµUÃÈ ^Í<8‰‘F˜¤QYåäŒ/d"˜ÐY}ôÄqÅËýBÿØJÈ€”C|#ô>> ¼¶£<¤®˜ÜŽH0Ä"š‰³çž{ŽaŠÐGñÄE‚ä#;å#4÷ÙgŸBª 
1ª‘G!ðð2K:K¤F™‡rH á°ÞzëÅå[/âE~!²â”Œ êF}¼!‹È&úBiÚGeÚ„ÎâoD» Ñ: k×0*èKá/F8cÏÛa8J£¯yA¥>ƒ` 2S˜‰0îÔdZ;Ÿ:ÿ‰ô£¾íß.w<ûðç¥cPÛÑ;tŸˆÚÁÇKuÃ0å¥ÑúŠ6ÑGFIŸ¾áýÆ¡>ÊÃ\[ °Õ)ôTŽq¤®Æ¯þÇð4êš:/ä®q¡NÄÊï£#Ä(\ë¼üÔÌ×Èå#<²\"¢™ñ•„α-‡|1&Èóàƒ·k­µV³ÔRKãKg9«VŒ ÷™Z´K_¢ÞœÔ÷ÆOÞþæ™ÀU¿0'h3‡x9¶Õ*ÈûZ´ ƒ'Aº›¿¤Âx K×ÇË«¥s¯¡;ÿÜsÏu¨Ìk ÐÏ0Êô‘¸®ý噫h_÷º×eU×±óùX‚?¼˜Ö§!‡—®óÞ`ÎNIæW˜1 ýÿšÀy …¶er«^V'HD HD ˜2bi},_·õáð;|‘E;†l®=ì¦L™{2²W(‹ ÖÛù wƒ t>ÒËG½‚4ˆô¶ƒ®×ñ^{íUaÞ„5©ìZD6#ÏUÒ·Þzk!ó‘Z®é#êK¯ ƒëkÕCû©_›z”7¤—v‰¶ÕÖðíd¸¶ ì^]õƒÂèЕ—cêEZS%êÀ;µK/e¨CÝ~ökC =6Ùd“1=ûèÝ…µë¿ùM´ j]`7¨|«ŒµaÒg¤Ã°|Û禫ohóÇxÛ:æc„pN¦­·}c˜Ž¡kµ OrªAsg\»Êjc@E4w½ôzМÕΣ½ß¥ûDڧÏðj2" ê/îÙ™—ë÷SÌéuKýùá¤Ü›Wóqü§Júù±7dD HD H¹Q¤m› T¥aùÑܾvØ5‘6ˆÈŽ4•.’*Îg«¾ƒê<¨± ¡]Î(<<“v½‘ÀˆÀAd`¼ëô}~÷©C]„nÛö«^ðvÒGï.¬]wå•W–ÕçŸ~ Á#|â–ì "®KŸaÇ´KWùîé:7Ñ~4¨ý…nA¸¤ä(¢~Ðx¨ó¤CfÔïéê£úè ½ŒwïªÕgõD}-£ g}p­óô›!pPÿ4g Ê+Žwé>‘ö‰üæ–­Õ@£Ä¼1ž˜ô£òËó‰@"0ûH¢~öcž%&‰@"$‰@"$‰À\Š€P1£ˆqÏy½×$ð\ZÝùJmäd„R™ßo¿ýJŒz/|ô~a<„‰°³C‡{« /óœ›dv÷¾ØXQ¯ÈtÝm·ÝV椗5(ÝÜz|NmŸ¹ÏÔ;Hî=2ôͽ‡}–œ$‰@"$‰@"Ì1X2Ol…˰õáÕ*DA„Z˜¡oæpR‘D HD HD`ŠúfÁ)Ê?³ID HD HD HD HD H Dý@ËKD HD HD HD HD H©B ‰ú©B2óID HD HD HD HD H&€@õ-/ID HD HD HD HD H¦ $ê§ ÉÌ'HD HD HD HD HD ˜IÔO´¼$HD HD HD HD HD ˜*’¨Ÿ*$3ŸD HD HD HD HD HD`$Q?Ðò’D HD HD ˜^®¼òÊæßøFóÏþ³9æ˜cš+®¸bz œ‚ÜCç»ï¾{ rœÅ5×\S°ùûßÿ>8Ñ8ÎÌ.½k•¾÷½ï5gŸ}v}¨ü>ᄚŸýìgÍÿøÇRÇ?ýéO³¤ïã?¾QùïÿÛ«?ýøÇ?nN>ùäYŠ:ÿüó›3Î8c–ãS}`Pù£ÊùÛßþÖüáhþóŸÿÌ”T½aúÓŸþ´¹êª«š»îºk¦ó£v"ßÛn»mTÒ±ó—]vYsì±ÇÜ;1 ?Ô÷æ›onn¿ýöÎÜÕçâ‹/n~õ«_5ÿøÇ?:ÓŒg\ÀÜ|ýµ]÷ÎFœh‘m}X¡ó¨¹õ /lŽ>úèf"óZ”5J§¾ç§×¾eM61f¬ýä'?)}QŸì’;ôE}hns¥þ:U“#Ž8¢ùÅ/~1UYf>ÓˆÀtÍUãQ¹îƒs{ÿYx<Ï´‰@"$‰@"$‰@"Ì.½ôÒBn,¸à‚ÍoûÛfÉ%—lVYe•ÙQô„Ë^xz³Èúýë_Öµ¾pvé]—ùë_ÿº¹ßýîW*$'bë1yLó—¿ü¥Ôq‰:Ó…Cv‹ÊzÀÐlºé¦Í¿ÿýï^ý ¾7Þxã,9ÿð‡?lûØÇÎr|ª *X9]tQsæ™g–$;í´Só‡<¤ü†ÁÿüÏÿ4¿ÿýïÇ._`š½èEÍòË/?vlЄãATˆÚå–[®Ù~ûí%é8ü~ó›ßŒkï˜âÈùãŽ;®&ô¥m·Ýv¬Xyä‘ÍM7Ý4v [mµUó¸Ç=nì˜ãÊÔfÑ_§¢®é3U`ÀNôaD½2þð‡[3àuçw6|àäÚ}8Êê>;þ£SëøKÿW_}uÃ@X‹±¶Í6Û4õýáÄO,Æ¢HßvØ¡Yd‘EâÐL[dþÏþóæIOzÒ¸Ûb¦ŒZ; âçž{nË.»lëìè]F-y¬¿þú£÷Hñç?ÿ¹¹á†Ê¶Új«õ¸"“Œ©îGÓ5W§NuœÛûÏôþƒª™6HD HD H{øÝï~׬¸âŠss£ÎžSôö°OZSå]ŠÛe—]š…Z¨ä=™/D?ýæ$Aó›ßl®»îºæ¾÷½o!Ìjý¬@Ò?ó™ÏlV_}õB`óFfï±Ç ¢z˜œvÚi…¤•nX³ëïð“N:©aàë† ¤óSžò”fÍ5×lx^Âá[ßúV³ë®»ƒN\7§Œ‹Ðg*¶éÃ/}éK'DÒO¤¬©¨ã½‡ñˆ¤èCÚ¼ð…/,¤;Ü Æ4$;A¸3‚ ¢7Úh£æÚk¯mN9å”Ò_üâwV#VÅ<þñŸR¢žÑ×jÆÓ‰õÊNâà2Ë,Óì¼óÎ̓üàIä’—B`ºúÑ òf÷ñ¹½ÿ$Q?»{L–—$‰@"$‰@"$Cà Åóu…V˜)oJÞÑÈוVZ©yØÃ6Óy}È|×/ºè¢ci„Mà±èÃûЬµÖZ% ’˜ç¾s÷)mñˆG<"NwnµÁýïÿ‚}MÔGÛÀpå•Wž©M‘_ÚƒyˆööduÖ)‡è©´ûS¤§«¾Å›_ÖXc85Ó–D;… ÃA[èÇu› £«C¢oö-¹Oa[`©é«œë¯¿¾yö³ŸÝ‡„¬EýµQàñ¨G=ªyâŸØ\pÁ…´~ä#Y'Ÿé·:ÿò—¿,Ä6¬§J´ÂÜø«=Œ•wë­·]õMã]?U鑈ú€ñÂãþÚ>V9q^ô_øÂfR×8ºä’KJ{ ê‰1beUyV^®q1ÿrÁ$¿úö„*Œà§õi}€w<¢UÝk¢Õ$””cι–Ô}ؾq¬8à[Ï­Æðþö·¿]ÊqŒ¾·Zà!œ‡[iìŸ~úé3á£îêqí1A€Êkù{°î£72ElkåyØ~ùË_^ˆåýë_/¡ â¼öÚ`ƒ š7Þ¸\×õ…|ŒöŒóôF˜Dۨ׫^õª1 x‹k¿š¨ç©êÄ´²_|ñD=/V¤¡¯öî"©xaHv/$UMÔ#›Õ qD}ŸòŹŽ8ûéƒô æ5¯yMÉëu¯{]!Äo‹vi“˜6I} 2„w:ÂGŸJ¢þ¯ýkÁÁ«ñQbTÛ!a…‡#bи0~õ5ñ‘£om¶Ùf…d„Q[Âøóä'?¹‚Bk(ߘ{Îsž3Sòöx†;„ÓLõÜéÓÌ[_þò—Çæ8õF´î¸ãŽ¥/EŒð7¼á c¥"ßaȳ›Ô}Xûu­çVýÿÏxF³Þzë•óð‡wõò‹yTmpöŒ÷M œŸð„'”k|ÕeÙŸªñ"¯Ã;¬ã´³¶1ÿ˜³áÑ%£æµèkA€Ç|#ÿ[òUwíï>Pws²z«?1·"cë9$îG1Þµ>ˆÙº¯2\!êÍm¢^¡n~£#¢žŒºGi‹˜³/˜™ÍÊRbL'æ§ZÿrrÆ—>sðÁ—ÐdôVsŽm-Ên‹H§1,½ìe/+sS3éÔA®ŸÁF+ ‚¨¯ëe#ëÝ/£?Ê%Ƙ¾¥¿˜lÏ:ë¬Æ ‡¸7Õzȯý€î_ýêWs\è©þú ÂYßFJý–Xb‰æ–[n»¿¹¹F›Dd€ÛaF8$ç>÷úÀ#®©Ççdú‘üúÌUÒõïúOá ;ùSþ;‘>}°«ÿ”‹ç’¯$êç’†J5D HD Hù„ÂAd¢áèa?HäB<<#!<œ Ó€´õP|衇–— î¾ûîcŽo¤L¤‡!HÂb5#¥„ú@ é‚@G&©€8wlùävĵÎÚ)ô coÞ¯}íkå!×Ã:1úó4ÛƒµúðRŽyÁøöOcÀÜùDÚØêC[n¹eñ~v½vkÝ벦j¼Dù<ª#T‘:žzê©ååÏ/xÁ "ÉLÛQóq$ÖGׯÃ!RîŒ!<ÂͳñÎ a“Ì!<ëc•KÛ``ŽÖ\G³¤ö(¯÷Ã0UÝóåþcþ1nÜWê{Ϩ{Ô~ðƒBоño,†0mãÞgîÖfúÇç?ÿù2öãƒÄÜn®~ÚÓžVV嘳…ÑBJÄôG@IDAT‡h÷†DsÀñƨêþæ¾`.cDb B–1ž°‹û~äé~÷\„-‚W|´?\GÈkØV^úvøÍGÆrÛ óÂ(¬“á÷cc–n]/â–îõ¯}¹"À•ïÏ›ßüæ‚CÖ0×WúÞë‡ÏÉô£¾sÕ0Œësíÿ$æ.ãŽб+0P'ÿúôÁºŒ¹ñwwà¶¹±&©s"$‰@"$‰@"Ì xÕ‚ÜFÒÌf·Äƒ0²ÈÃzÈÉg=ëY…Ah… `²$H=ûHz‚$‰åíÄÃ-ÂËAH!µÎÈ‹Ð' ²èäá)A]ˆHd…pÈðæõÀŠÜÙ|óÍ˪ôtF Õ"/õbXÝC®kyæ…H#iBÆ«w­SÀðCl0ú¨//leñ@}þóŸ_ˆpK—–A(FD¬ 
ÐFòD&v‘qÍx¶Q.’AD?\›hkúàÐG>åÃyÈUÿ³X‡—xŸ²¤Ñ7‘»B¥è{ƒDÿµ"€wkàß•úPýA¾Œ8#èÕ?H9$i‡Ò@ÆØ‹1k¬ÀA;꥿“ ?‘†Â’¼éMo*„§ñ‡HC…Ôãb*ð‡g‘ß1Çôé02ßã1ÚÂÊü¦îˆz"$ ž4êÐîÃ%ÑŒ/ÞÓ1/ Uc~CJóo¼Ü;æÔÐÃ5í²¦j¼„>t4Ï©—>ŠøeÀí’>óq\·É&›Œyp»—À%êiô9$¬>¬ïÄ8Š9=ÒÅV{0Œéïqï b[j‰y'<öësƒ~÷½GóVè+î¯ µaePþõq÷&s;cìcn¯ÓèÛîqÆX´‘~«ÏĽ;>­_ع&æµ:?¿£ŽHz™þî+ÆAŸñÑγkFDß×oèÏH£.$ôö@?²&îÇô oð’Iõ¥ïÆ}4æ=/ä~Ðþ?Ò÷^?j|V*ŒýìS·>sÕX†=~´ÿ“0 ê—îÁú—v¶ò„¡µOìQô$=êçèæIåD HD Hù ¬äå[žÕmÏN¡LÄñ%AV¶ EA E~<²jñ0X{N#^j ò哟üd}¸ü¼­ó }<”úº#⼜ñ…Œ2#ê×`?ôBº„wæÇ>ö±ÈflKÇ0^´ë>½ÃèÑÖ) +ôA@†—t(‚l‹s¶ˆX j‰6ŒcQ.l“ø´ÛAŸ@´®Ê‚" £pè«Wßòõ©X©Ävß2"ëx¿jwqÙÕqðÚE’0ø ÇmAbõ!‘ b1#ˆô ¶Æi;” ãUHzê4ôì#jH¹2ÂÆHÃظs¼=.¤›,þðŒùB~ ÷¼ç=¯ÕãZ„¯O[ôUs ÒRø+ràIÔ—´ûp98ã+ÆTì‡Ñ2ÊŒãõ¶k>v^3‡¶Ëšªñ:´çûŒ¼¹ë¾!}ŸùxP¾°aØ‘GÌqp6G„˜·´_”ÇmõmÞþÚ%}Çcž ÂÞ1båiã[øŠ{Á°{/zºÓGh£0| Èz¦Ãævc£=«KGÜ·¿Úâzbåk¬D€«>k·c‘Çõòì#¡þ£ÆGä7hk¬7á­/4 ¢Ø;=è; k1øE_‰rÌ=VBµ¥žÛâ¿GÝã˜ëÆs¯o÷ÿ—HŒÏ²ÓúU7ÉûÞ¯ZYÜmÿ'‘PŸ°bŒa•ñ&DýûöÁ¸fnÝö»£Í­µK½D HD HD`®B Hêð<ì£|íHšZbIÞÆ0ê^·iý0ïÚðЬóô[ßéj‡è;q/Iõ dq][—ö~Ÿòa)´ˆ´¼-©úˆp}E_8þøã Ñ¢Ÿ„Á¡ëz„#BFy® =Âá'T =„¶h×)HȸnÐVßÔÇ‘õpUÏ •]3™ãAVÅ6ò b:úA{\Lþ0o÷åß}ú€4D»Õ^Çæ!ý20wŽQ€± ùéxÛ}8êßÖ‹¡”Ôc,Òöݶ˚ªñåŸêuh>y*ÒÄ5]Ûö¸k×>õõ޵ñB?ðã<æj×EŒU‘WìG›ÅñaÛ>÷(Flá£P²V¯ í­ÖŠ•ÃÊpŽþÈæÀ$Òÿæ—ÀŠ‘¯6¦ê«‘¶³"C(7ÞêBª0’ÕE~¶ÆJŒÏúxüŽ2GH?hK?«—Ìt27 f>ä? kúÓ5VïD9æö}*ÎõÝ6°ï½¾O£ê&8wµC{Üô)¯+ ÌÍ_ú¿ðJæ/÷œx‡Bß>Ø•÷Üt¬{ÝáÜTƒÔ5HD HD HæÚOŸŠ!Ì=ÀÅ‹öâqeIiFmÃ3‘1ˆyý)/ȉ¶ÎÒ:¯üH£¡D« $¥÷ð4uŠoÑðt­ë%¿X¢î"/ïËÏðF£Îáü)gþžˆÞ#„NâëÆËN‘B¼[‘!ÊR÷.¯Sa?ôoK]wçb?°AX+«öù´Û>µ7½¼„ŒˆºÛßÃp@jÀ Œ ®«ÛÛ~Ÿò1ÈÆo\^®È‹´Ë;³$ðå½ V-ðv%í¤ú%©>ˆ$A@ÆG;ûÀË–hku¨?AÆ•#¾ˈq1øÕiGíˆlzŽ6k÷ä‰óíq1ø›jŒüÓïQ}0æ8¸Ç·uܵ1Îy;›{¼cAŸ­IývÐÔ7^.ìX{ŒEºñlÛe¶“/¡Cèh_¿E@ÃSÝ œÂ€Ôg>.ÍøŠ{FìG95qnn¨CÓÓt¨Wt!3:ê¨b€òþ€¶Á|Í›ZXœšøçÉMâP×#t²­ÉòÀvØ= >tbgd^„Dß/;3¾ºט¤± ‡¢Nç}.µCDÚ}íXˆ¾©îâ®ëÃu_4±û¥¾bnåµþã#®´u2OjK¡uvÞyçÒ†a¼ëƒµ÷ȇg8ŸwÍL…Lô^?¨ìñö£>sÕ ²ú¯~ï a”6ÇÕ}F}ú`WYu?v~Ô~W³ëØÿ› gW‰YN"$‰@"$‰@"$@É< É,‡‘3^>†÷bEd2Îòi„GM²ÌrñˆBxðæòB9îÈå ‡yØy ïÒÙC¦—§wÜqE1ªí#14âÑÊy-Þ6"Çy„ÅlP´òPŽ|ñ>Dšzxøð¡ºeùêMGå"¨¼TòŽ;îh¼}@Û‰y­ÿxy)c ïëóÎ;¯Ä¸Žp$æ¿£ŸE¢®>ÊèB飿íT¶ê"®íÚv•5Uã%ÊÓ§é͘Ä(á·¸ñ!x`1!¤µõ¨ù8®3¿ 7bì"éÍóã?Ò˜Ìmâ´›¯ÍmÚ'<Óaxøá‡—9Ѩo‡0¨˜ £™c›{ÌáŒæ;ý!îYízĽQ™æ$íÛçåE®ú ŒA®Ç;T¬x€“U¼ýõ/ÇÚó=yâÇÜÔbNV†P]λç™ã…2Þ£_©‹~f>bdqÍ ‰1b.uŸÑFÚÝœj¿ïø”GúÓÓÊóƒü½£ pꃵU ÚÒ=É‹éõõ¯q5tª·½××yø=Ñ~í0ì~Ù.k<ûæ ¢¿wÆ_}¿r®O”®cò ƒ*N ÞõâÿØ!‡Ræ|aàV¬ZÓæƒÞ'Pç7Ý¿“¨Ÿn„3ÿD HD HD è…‚áòÃ.ò`^‹eöˆ-éç%èüò3ˆ_]í´õu£~#<Ø|òÉÍ™gžY’{àFxx¨¤3ïçÐ'¼ñxÚyùñ¹È¡A¼$‘Þ©ã%­È‰xXEÜ#z BA oD^Â-DȉHÛ‰ê KDí‰'žXÈ@¤•cH>/~%Ú …D<ýôÓË1úð’C$’ºM80 rª$¸çËòwuE ¹FÞbЇ(i„`óAÜ!òÃ3/Ò Újßm·Ý¶RÑ®gPÏk†R÷KºŒÂ&<@éÂëVÝC¢†tŸòÕIâyФU¶ýз«nÒ„„³¶:»Eð{Q+"ºN×ÍŽ­°$ ?°n‡½é񩑯 =aÜN/…DÆùcj»í¶+De׸˜þƒtê:Þ§¸Î ÇÌqž1¯M,Á‘g¸ùAHºúp91ã ù†¨Š¹Ã8ê qKÚ–ƒ­¯èÏÒv•åøTŒ—(Öü‹ÌÓ§+ÆM†ŒZÇŒ—aóq\'–;è¾Ags|ô™HWØÔs›y]?!æûðÖy;®•gõÈ_$ó…^X ­Ò,?cî¯ íz(ƒ!Fûk/ñÓGÝ£ä«=Í0#ô0….ŽÑÅœ"rxßKW ßœLg±÷‰ýš¬w ƒã1cy`€äæÍ_ ã†Uaä¨ÏÕ¿Íî—VŦÝÝ/HßñQçÛþíÝÞPÏÚÛ»$H¬­HòZó¾Õ8°d øÜç>×.®s?Ú¼ëäDîõò‰<£='ÚúÎU]º÷9f.rÏôŽQÞ8CÌ3R‡î}ú`»¬¸6Œ%±ãOlÛ×ÏîýfüÙøÿµ'³»ôi./n¼Ó\LfŸ$‰@"$‰@"Ìõ <ˆ-2ÈÖ‡74ÒÁ'<Ãâ=DÆT ¢ù°Ã;Oí‰æ‹lCPÇCØDói_g™¾º×äwGé#O˜{À$ÊÖA¼ JaVF…™ ½õ“ºK'úx(Ò¯+ Omžt ƒ†¼žƒèo§ë‹OûºzŸ‘Hþm]‘Cc5ò¶UVà8hÞè;^†åãéºtÍÇB/ñ~Ÿ\Ÿ@Ö2.¶¯ÿìg?[ K檮ûÁ0\£ëÍ7ß\ Fí¹gP=`ä\ý²QùÒ)°„½<Ýj=Í ]8¶ë(¿AX¶ÓöÝï3×·ÇãcÔ0ÑîˆpÒGïAX›C¼ü–á#æÆ/¬vÌŠ³©¾÷úaeM´ɳk®â©ÞÍ×._Ÿe$&ÑOGÝ3ûôÁ(G›Öã¹Ý¯ÛûqÝtný¯`(ö¿.þ+ùõÓ‰zæ$‰@"$‰@"$½ÏêXþÜûÂVÂA$D+Ù¸w‘5!ƒ>:Ò§M¶t)ÖUvWº ºÎÕǦBo½£ê6JD¿þåïYEPëXÿUN_|ê<Û¿AxøEÜåö5öGáàÁ{”þòT¾sdT9ÿ—jâßó I!ý2Â?Ôˆ Ó?=FõеOŠ´¶£ú°4£Æª4}dTY£pì;^†åãÜ …k­x%S1ç(Cy^©]æ zÀ¨KFé4 [åÉ£–AóÂ(åÑ˺¼Q¿GÕÑõmݬ*²*k˜˜ãƒ¨ï£÷ =Ü9 ´ÂŒ` <ë‘°«®ºê0Æun*ÆëDûE»æ*+£„&°EÔê§‘»ãx×V¹µ´ûu{¿N;»wîÙ­E–—$‰@"$‰@"$ó=<ž,}ž›dnÔ¾sŠÞVeˆM,Ìœ(<ûÂ6¤ÌûÌ)ãb*‘ž}xv–5•ñž72š"sy^§Ì]]ák¦[sd¿¸çÂñä×§„kVGè˜yYÜ'ó^9ù^`†…'CßLÇÌ!HD HD ˜«@P[K€m}"Ì fºCßÌÕ¦ò‰@"$‰@"$‰@…¾Y°Çµ™$HD HD HD HD HD H¦ $ê§ ØÌ6HD HD HD HD HD èƒÀ”Ǩ_~ÆK€ú¾ á /ln½õÖ>zfšD HD HD HD HD HD`žD`ʉú;ï¼³ï^ÆámÇwÜqG‰m 
=o9öfbq.ïºë®ûržD5+•$‰@"$‰@"$‰@"$‰@"$‰@"ôD`ʉú›nº©ñYsÍ5›e—]¶9ÿüó 1OŸÅ_¼Ùpà ›k®¹¦¹òÊ+{ª˜ÉD HD HD HD HD HyŒQ?ï¶mÖ,HD HD HD HD HD`.@ ‰ú¹ ‘RÅD HD HD HD HD Hy$êçݶ͚%‰@"$‰@"$‰@"$‰@"$‰@"Ì$Q?4Rª˜$‰@"$‰@"$‰@"$‰@"$‰@"0ï"Dý¼Û¶Y³D HD HD HD HD H¹$êç‚FJD HD HD HD HD Hæ]¦…¨_pÁ›%—\²ù׿þÕüûßÿCÏ>Yb‰%ÆŽåD HD HD HD HD HD`~F`Zˆúå–[®¹ÿýïß\}õÕÍþóŸ1|ÿú׿67ß|s³øâ‹'Y?†JþHD HD HD HD HD ˜Ÿ˜r¢~á…nV^yåâMíµ×΂í•W^YŽ­²Ê*³œË‰@"$‰@"$‰@"$‰@"$‰@"$‰Àü†À”õ~ô£›ûÞ÷¾Å›þî»ïžÏÛn»­ùÃþÐ<ä!)áqfID HD HD ˜žC>ò‘4ßÿþ÷'‘Ëì¹”®¿üå/›SN9¥¹âŠ+š®g(šü÷¿ÿm~þóŸ7çœsNóücö(7Å¥Ð_»Ü~ûíS–ó·¿ýíæƒüà@ܦ¬ ÌhÒLGûW©9A‡¾:‹Nð±}llkï÷Í'Ó%‰@"Ì=,<•ªÞç>÷iûØÇô¦²xÕ?âhÿøÇ7üããpnD HD HD ˜4HíË/¿¼Ya…&×tf ,è{ÞóžæŽ;î+†CÓ»ßýîæa{ØØ±«®ºªvwÝuW9öÕ¯~µÙzë­›m¶Ùf,MûÇ 'œPœ£6Ùd“ö©Iíÿà?h~ûÛß6/}éKÇÏï~÷»Ò.QqgÐqÁE]Tôfõ¡}hGŠ<4¦²MGû·ns‚}uö¾¿Ë.»¬á IÚûÃòAêgxÀfIæøB -Ôˆ†à=‚ñ.ÁYÎ8 ¤±wºÆçc~e,ŒsíëÚç¥u¬KèÁÑSyûÔ¢wÞyg©Ç ,PŸÊ߉@"Ì“L)Q¿âŠ+–É~˜'yPÜtÓMÍRK-U{ö)‰@"$‰@"$‰@"ÌO|ô£mþùÏ6oûÛKøP+>øàæK_úR³ï¾û(xØîsŸ+$Ù[Þò–æáxƒ¨?ñÄ›µ×^{ 1â{ßû^y/ØTõ?ùÉOšýèG"ê§£máÄБ$ýt Û4ÓÕ¦GÛÌ5`P;ôÐC›Ï|æ3Í"‹,‡Ëvÿý÷/ä¸yçÔSOm¾õ­oÍt¾ÞÙa‡š§<å)e>zÜã×ì¸ãŽ £Øž{îÙ¼úÕ¯n6ÜpÃ:yùÝ>ÿæ7¿y¦wÖ<øÁñïz×»ŠNúЇšš?è ƒš‹/¾¸q|‰%–¨/Í߉@"Ì“LQ¿ûݯAÔ³ÆvŦo£Ç«Qo²çI2ÈÂÚ¾.÷D HD HD`ÞF€å¯~õ«ê…׿j«­Vž5ÔšÓÏ\Ь·Þz3…ÒtÌ3‰ãµüþ÷¿oV<7×_ýf™e–©OÏò{XÙ =³øâ‹7Ë.»lsî¹çOÐuÖY§Yi¥•Ʀn¸¡¹ôÒK›7Þ¸xsdR.ÝøÀ–2…µºø¹Ï}nó„'<¡“^þ<èCÎ>ûìæ/ùKóªW½ªóŽï¶Ûn òëðÃ/Þ÷‘Ö–gªklÿô§?•:yÌcŠ!Àyž±?ûÙÏ ¾VC¯¾úê³xÝ"ãéÀÓÕyéÈYgÕÀ“ÕƒÒ“žTö»¾8dýøÇ?.DúlЕ¤<z6„—U° Ò]›zN|êSŸ:ÓµêGxþú׿.N`µABy"ÃÝ3çšk®9S_‘™züô§?-ù3xhÏñÊo~󛢷ò`¬ÕÒëo¼±7cŒx ÏòÌg>³üV[n¹¥Yk­µJ”?ÿùÏÍ“Ÿüäb QÆÄ&Ü<×¢ß 9=o¯±ÆåÙ›W4‰>ú´§=­”§.p2ÖÈTô£>í_ òõ§¿q §½èEåŠ>øŽÒALCŒK+5³Â…Œš¤é£tµh#cD?xä#YÆx½š¦N;‘ßôî’à_6Úh£±¹ÁJ™ãŽ;®A·]óeô¡®|kŸÛÛÞVV8ǹ袋6Ûm·Ýâ•ϳÿ5¯yMóéOº9í´Óšg?ûÙåœ{€±±ÕV[%I_ɯD ˜˜2¢Þ~-Çjÿ9¤› ª08<ìSD HD HD`þFÀ3ÂûßÿþBÀ"|ìsÌ1…ÐFî -:ê¨æAzÐLäë±Ç[ž-êg‘K.¹¤9ùä“ q$a<^ò’—4ÏyÎs:AU¶‹Ž>úèâ¡Jâeð:ÝyçË11‘¨œ˜¯šx¸~øÃ.„²êŸøDѹ\tÏ{‡ë®»®ü¬‰häáÊ+¯Ü\ýõ‘llË›õøã iá÷[lQÒ#÷Úk¯BÂÒ )&4Ö $Å6ÈsÚßþö·â¹ÿ‚¼ ñQOä?‘¯gÀAD=’ÝÊ¢]»ÜrË•ýøBj뫯¾z¬;찲€ñI+$-}ˆò‘}›nºiyîDÚÿð‡?lDöÞ{ï=VG¸+ÿMozS³îºë–<ô•Ú‹Øþ–[n9F—D#¾ÄÆ×/µ•ü¿óïŒßùÎw–+û`ÍðqÈ!‡”ôÚ#0Òæúñ’K.Y<Ú‘íê¤V^ÐÈLÛè[êÃËÙN`'¾9 üµ—°Jˆý裌MiäÇkùSŸúTñšžL?êÓþE¹_<úÕßxѷဨïƒo´CFMÔ3þ¿ ÆEŸy¡>íª2†oÚ4ÚÑüb•HÛèÒ¾vªöÕ/Æ>=ˆazÔ£¦ªˆbÄŒÌ^µ!‡ÍZˆô]s4/}ó»•CŒvŒ™)‰@"Ì/üŸ9}’µõç.â?²†ò0éó‰bŪ¯—7ÅñÜ&‰@"$‰@"$‰Àü…À‘GYHzÄ0Bö‹_üb!@‘˜¼NÇ#ÈW¾ò•… ý¾P„p¼‰»¤oÙAªóBvL#Hm¢_ ÛåíèhÓµ¨ ýµ×3žñŒbãÉ<™~Ô·ýk=†ýV†zª…ïTêÐg^¥OWýjŒS†2íøÞ÷¾·öŒó£ðäçüùµ¯}­n½õÖf§všeî›±É:'‰ÀüƒÀÌÿö&XïXæÉÒí¦2žåU–-Z’˜’$‰@"$‰@"$ó7B  ŸIž´Â¼¼á o(äöxÐA¤"t9yæ@úã]Ò·lįP4ÈvD;om"¤L-<—^zérˆs"‘‹Hn ‚þ³Ÿýl!ª#,„4Ç—S_Ãã”ð ï#ˆoe ¿#”­Ðž l}‘ ··¾õ­Åë·O‘Ñ‹\E¼ED3Ò¿xÃg³Í6+e!-_ö²—•kéä9“·¯• !^xa1¸®qÜ–Ñq†% éK¼Ülx 3†èOÚ0VB ñûЕ䰇Ñ_?ÿùÏ—¾ÛkíÙ¾Øb‹•ß<þŸøÄ'–ßí¯÷¢>^.ªß©'£O „RW1ÄaªyàKÓ®ã¶Ûn[ <ºÃsÙ;IŸºõmÿAet‡“qGf·£æ…>útÕÉ|Ä "ä Ñ®Æ5ãÜü(æ¹¾ð…%œ”Õæ s@J"$ó“}ã³%Œ¬öƒeÇ5ÂBÔ+‚‘¦QNHµ5RK”Ü]e•UÆNñÐÿÀ>PÈ×ýöÛo&/{±¢»ôÐ;}_°yœ~úéO[<Ë-¿üòg±À½¸V´ÅóŸÿü±ðíëºö£¬vý…óc¼c…½-òðŒøô§?½ùæ7¿YÞM€uÍöÛoßN^ö£ÜÀ9!²ƒØò¿mi·_û|ìkû—¿üåÍGÑD{1 ˆ¥­ìÐcÖ±º ß³u¬`ˆòè>!Œ AZÇ1ïˆgiÏØÄj‚¶èkµIì"ŸÀyô©[¤Öþƒòï:®îõʓȾ‘f²:ô™b%Æ0}¢ÿÕõc„ã=ÎpÈxǸEb[§_~3fZi`~¦,%Hù Iõb#ú£Âj ÆâwÞÙø£êO›?¼< RD HD HD`þCÀsBN,êQñ”#ç¶p&ªåïÿ{ÙíòPOÙ]ñ¼¹ÛùFy¡CèS§CÖ!{…°yßûÞ7æ× øx_Ë¿&+4éí8q]{e"”#¤Œ4žÇÄFÒ€§9ïma||„Sáa-äˆó}$¼ýÕ7<ê]õ÷;òR§×¿þõÑŽ>ž3 âQÏ«>p÷Ô. 
ÏôaÆ!Z‘ÕÂÌÔ‡.2µNSÿ¶RCˆ"/õ./½ôùЇ>4Ö†aeyþáòŸ ojÎtÑÉ?Dõ™ÉHŸ~Ô§ýç$ÚsG½ßg^èƒIW}w„´U-ŒRpûä'?9Öº®é{,æmúŵ 1õ\Çç”­±AÇEYdNQ)õHD`¶!0©Ð7n$&Q„Âb"š³ø³{¡HÜP&’O^“$‰@"$‰@"$s7<Š=Ü~ûícùÅ/~QâpóxPx°—9¶…§*‚;$BzÔájïáQeG>VÇKUóRHå´=¹…i©©K–Yf™²E<#ëi^òäm9yÏWèzÒI'Fæò^~†|H]8&ßy{ÖòÜ%ÄJ|èâZçØÈfd®îÒ QÎpÒ~Þk—WNâE˜µg¸rÚµðvæŽÐ}‚ tˆ~¸ŠSO+ÚÄc¤åø¥Ê­È„fœ1!JÄø2×^{ír]}MäÙµe€“-’Uè£$´L¬cEÅ¡3â÷‡B?·ša²¡Ô'êhËÑ·ŽµãíG}Û¿.c<¿ûàÛWó‰þ]w¼7¡–QóB}äûx§„67Þô½×¾öµÅè}SœöÉJôXAù鳯rÌ+q<·‰@"$s“ò¨ÉßR¯‰ÜðjýÑt³°ô.½ê™Ü&‰@"$‰@"$ó¬Ùñþ÷¿¿ÄÕFhñ¨Fô"•# Ò‰+tˆ—ƒv N>›o¾y‰ßí…¡HèÕW_½$GþzAåÖ[oÝl³Í6%Ä˰²£ ºÈwË-·,$7ýÄß3<ÒÉŸ¾8Þé@Ä./]„kcåè£.$4Òšáì³Ï.}´ý®€®:Ʊ‰ô£¾íeŒwÛß¾:0 1ºxɳUHìx)oè5jN꣼öØcb°:Eè"cÔ˜ôÛ<¦oM…™eÌê_V¬ºêªÅ¸øo|£Ìíðb)óš‘Ú!˜ÝRD H&ŽÀ¤ˆzLôþ ×Ëõ&¢Ž˜zD^“ýc2‘òóšD HD HD ¸÷@"zÉâ1ÇÓvØaE!D²Ê"ª  ‘^æë¬³Î˜W¸óˆ3‚ìæ…T&<®wÛm·1r)ÒŶOÙòA¼ #¾4ñ ³Ë.»rº¸çK,ud5‚‘Ï«™÷,ñ"Òsafl^ž·ÕAO‡2Ž„þ±uŒŽøf°Þ‡€×¶2 Rw×]w-¿áˆ”dÐ@D!qöÜsϱ2úˆL^ëHð|ä#cç¢|„ø>ûìSÈÏ °÷ÏzÖ³šï~÷»%__ððÂS:K¤ý|©ÌC9¤ái¼Þzë]ßõËxÞŒñâTÆ…u£>ïxÇ;šƒ:¨ÙòЯî}ÉKýÚ„Îx`QѾãŽ;Ž­–…µ‹ô¥ðÎß`ƒ Š1™:LçAiôµ}÷Ý·ôFž …ê ãŽ4ƒ¤Î"ý¨oû*¿ÏñQøöÕÁKv­d@˜[É£…†ÒLJ>óÂ(}Ô©¹>äÒ1èÊŒrûà0,Môº?1*™?cUϰëÛçÚú‹%ïSËþûï?öÒï®zt«¯ßí²âxnD ˜×X`†ÇÀ'ZIKÀÆcUŽ?þ4úÓ:rñÅOE6™G"$‰@"$‰@"0Ï#+dmý/·õáÜõá™)np󿽪d*Ž9ÛöÚŒ2„»A¬ :é†å£^H½¶ ºF w^Æ{íµW!„yüפ²||Œ â•ó¾õÖ[ ™?YòI}éÕE² ª‡¶R¿6A&66Ò.¦Úõ×¶Ú¾]‚ ×6>ÓU?øÐ!Œ]y9¦^dPX›rrœ_ê ÜK—^²R‡ºýì׆„AÅm²É&czöÑ» k×!P­^°Ê"dï½÷n„Vb˜h·Y¤ÏV»Àd}ò›h?ÔþÞÀnPùV'k£¤ ßúšA:ÔiÔÏJ+,†áNças’<ésl;ÿ¾óX­ïx~«gH†½©_ãÑ!Ó&‰@"ÌŒ€ŽîÍþGqˆÿT³þ#ùº¡{–FZ6èÏÛЋ;NºÖ/néH’‡D HD HD`>A`©Ô&ÈÁ2,Ÿ šÛ×»&Òzõ,„”³"`*D}ÕyP=bB»üQ:yx&íz"QFR”ÓïaútëS‡ºžG­ä%¼à# l½»°vÝ•W^YV[œþù… öN¤úÆ3B(µÉÜQ: :¯]ºÊ”~Ðñ‰ö£AíÜqÇÍôŠ®rÌ}ˆúQõ¤C]¦úÊGú‰¶·kµé 1íš©uëƒãT”•y$‰@"LIõŠvóôIID HD HD`^G@¨!t† ¯\^ï5 <,}ž›3@˜F’Ù¡Ñ~ûíWbˆ_qÅåýK-µTy/ƒ8óºˆÓž’$‰@"$3#0©Ð73g5çíeè›9¯MR£D HD H9Ë`;§„¾™3‘J­D HD H‰#0(ôÍà7ÈL¼¬¼2HD HD HD HD HD Hz"D}O 2Y"$‰@"$‰@"$‰@"$‰@"$‰@"0$Q?¨fž‰@"$‰@"$‰@"$‰@"$‰@"$‰@O’¨ï T&KD HD HD HD HD H¦$ê§ÕÌ3HD HD HD HD HD è‰@õ=Êd‰@"$‰@"$‰@"$‰@"$‰@"$‰Àt Dýt šy&‰@"$‰@"$‰@"$‰@"$‰@"$=H¢¾'P™,HD HD HD HD HD ˜’¨ŸT3ÏD HD HD HD HD HD 'IÔ÷*“%‰@"$‰@"$‰@"$‰@"$‰@"$Ó@õÓjæ™$‰@"$‰@"$÷wß}wó‘|¤ùþ÷¿¯éз`ºþò—¿lN9唿Š+®hìwÉÿûßæç?ÿysÎ9ç4ÿøÇ?º’ÌñÇè¯]n¿ýö)ÓõÛßþvóÁ~p nSVPf4i¦£ýǫԜ Cèl,ï¿ÿþ±ÛôéË·Ýv[óÑ~´¹ì²ËÆ®ëûã/ùKsÆg4¿ýíoû^2O¤»þúë›ÓO?½¹å–[æ‰úd%D`ÞF`áy»zY»D HD HD ˜ß@j_~ùåÍ +¬0GWýæ›onÞóž÷4wÜqǘžyÈCšw¿ûÝÍÃö°±cW]uUó±}¬¹ë®»Ê±¯~õ«ÍÖ[oÝl³Í6ciÚ?N8á„F^›l²IûÔ¤öðƒ¢ï¥/}é¸óùÝï~WÚ%ê1î :.¸è¢‹Š>ýë_›‡>ô¡)òÐd˜Ê~4í?ÞºÍ :„οþõ¯‹ñ-öûôå¿ýíoÍ/~ñ‹f½õÖkžð„'Ä¥#·ŸùÌgšŸüä'céÌ ïÿû›Å[lìXýãšk®)FÁ­¶Új®W šï|ç;›?þñ¥j‡~x³êª«6{íµW³À ÔÕíüý¯ý«ñiËB -ÔÜï~÷+Ó\°ü®Ó¸ÿ(û¾÷½o³ð —<†åS_ËP«_<èAjõ¨GÕ§šÿüç?åð€<`¦ãvÌ©ôV^\tÿûß¿¡7‘§>¥¼6&tvN]IÔ§ìÜóåú;ï¼³¡SûzIâ|}ßÒvÕ£.÷ù$êçÇVÏ:'‰@"$‰@"$‰À½ŽÏØþóŸÍÛßþöfå•W.+>øàæK_úR³ï¾ûý7ŸûÜç ±ò–·¼¥yøÃÞ êO<ñÄfíµ×hŒøÞ÷¾×,±ÄSNÔ#û~ô£5!ê§p81t$I?è6Ítõ£éÑvîÎuºúòÙgŸ]Hú 7ܰyÑ‹^TŒæ™/|á Í;ÞñŽNЮ½öÚÒöO|âçÚ±õ•¯|¥ô¯xÅ+Šaã[ßúV©ÓÉ'ŸÜ<ïyÏë¬w}ðk_ûZsá…Ö‡Êïå–[®y×»ÞUð³êø@óˆGÕ<ðœéúúü,'føò—¿Üu8%ó=IÔÏ÷] HD HD Hæ,xáýêW¿*„ï¿ÕV[­YqÅ‹’¦\pÁ…xYrÉ%ÇwŒç_qâ÷¿ÿ}!8x®¿þúÍ2Ë,§:·ÃÊvp‹/¾x³ì²Ë6çž{nñœ\guš•VZiÌ£ð†nh.½ôÒfã7.äBÚ(—nAfaÁÓó¹Ï}î˜g¬ôòçA‚d²âU¯zU!æßm·Ýš7¿ùÍ QÞ÷µðätíŸþô§Rç1yL1HÇóòg?ûYÁ÷±}l³úê«ÏâÙˆŒ§âÇyéÈYgÕÀ“ÕƒÒ“žTö»¾þð‡?4?þñ ‘¾Át%ixŸ"§àe,ƒtצÎ?õ©OéZõƒ#øŽÒALCŒKÞíŽñ<'£æiúè#Ý éêË<šéb%Àãÿøæ‘|ä Ë?ꨣŠÇôN;íTÚß<£½£«¯¾ºÌ]õÅæ¢ •£? 
›ƒ¤Ž>â¼:Ÿyæ™eŽ0ö™]GFAó—ºk󧹦¯¶>oîYwÝu›Í6Û¬”g5¶Ž=öØ^D½‹ÜvÝu×r}|ñ>';ï¼s!Æ<öÛo¿2µ’Þ¸2ÞB†å#Íç?ÿù2_½à/(óÚßÿþ÷æˆ#Ž(FÚ ü#/}±KÌ•d£6›ïµÝqÇ× çc•V}<ÿüóË5îem¢¾œ˜ñƜ뮻®Ìû ÉúbÜË"ݰ-ƒCÞ<—’$Ý$QßKMD HD H{$„ H+¤¢ýcŽ9¦ÚÛm·]!-ƒxªÉWä o¿š¨¿ä’KÞ“‘0<ŸóœçtÖlTÙ.:ú裋7cÄ;v2x("n‚˜ŽH.ž©H a=ôÐæÃþp!Û]tÑæŸøDѹ\tÏû5àr„Ô!l‡<ðÅ^n‹0<$•çã÷[lQÒ#n…}@ÂÒ‰7%â‹g’bÈ K$!ïPä‘z"ψ|‘Óƒˆz$;B‡ÀßµH¾ZKÚYmtØa‡òKX$­|´ô!Êç ¼é¦›–¶F6þð‡?ѽ÷Þ{Õò~Ó›ÞTH;yè+µÇ©ý-·ÜrŒ–f”ˆ'®_j+ù#ç´‰P¤Ö ‡rHI¯=#m®ëß<Ú‘íê¤V³Ï~ö³Ë6ú–úì¹çž…l—!ì„K"¿öbØAìGEÐ!L¥‘¯U¼c'Ûú´QnÄWÔßx¡õ}ðí£ƒvCêÖD½¹Çøeø0.úÌ }ôQÕBœ×}™Iª|íüÝï~·yô£=*›™Î#Ò÷g=ëY¥ã$RÖØ1?!Øka‰090N%ú›1Ëó;„G5¬IŸùQºQcPxŸˆßàcÞ5úЇ†ë‘oˆz‘6ùüô§?½Œ[ä}ÿˆëÚ[ã»&ÜëóôB€ó 7v6ß|óæ‹_üb14n¿ýöuÒ2O ÊG]W¡Ìê{“yÌ**FÑA×ÎTÈ=;úlÌçæ " \;”Žq¯ï‡¦ ¡‘ÂhˈzÀCOxúßSìÐ ýk¢~hâ<™Ìç,8Ÿ×?«Ÿ$‰@"$‰@"$sGyd!éÃYĉ‰¸ "^ùÊW2”×£ðH%ÞÄ]Ò·l„ÆÆ3¼R…V@Ò ,Z<8kQb^h€}öÙ§nÈy‚üBÕ¤<ï^ä{„%އ&2‘Z‹º nÛ/Ÿ‡NMäŒß°$ˆ/İ t²µÿéOºœ¿æšk IØGÆ ‘Õ yH>þñBä‹Hê«Baa xãß8ftˆk´’^h e)¤ dd¾S©CŸya”>Ãê7蜹BÝ‘õÚ^;[=2 cž•%µD¨óK[_ýêW—Ão{ÛÛ îŒ;Hn‰úÙŒÆPȨù±Ïüæ7¿Yb®koŸðêf|í#Qï¨g\ûuâÜD¶VÁ˜#Íô4þwÙe—‚Ißü¬n áù×¹G0œçÓ! …<ý¾´!ÃÍ0Aú[á`þY~ùå‡%Ís‰@"0 fþ·7‰ŒòÒD HD HD H&‹€(HBÄ'B€'­0/oxà ¹=žü©]$o>¡ñ.é[6âWD ¢·6‰p‘7d饗.»B B9ˆä¶ –xª"ª­A8vy"Føä}ñ­ äw„²±&Q„°õ!{„-Ü„]@G½FÞ¦ÆÑÜöÀ„7|TÊb\xÙË^V®¥O_ž¡V&„ˆl \ã¸-£ãÌÆ3Œ(ñ¢M}‰—;ƒ ¢‰1„DÒ†±ØWþýï7°‡Ñ_…¯P^¬y´$\¼ÐS˜1Á»$½ȟWµ~§žŒ>a$JH]‘­0…-hiÚuÜvÛm‹!Q' á%=HúÔ­oû*£ë8œŒ;2»u5/ôѧ«NÃŽ™#´§6^e•UJRíüâ¿xØe³œ‹ùA?¨%°Œóõ¹A¿Ã;]xb¼éÇbß׆ÄQócŸ1hþ0Æxšû ㋱ 0([qMàpë­·Æ¡¡[ó¯wÔw-¯}ík‹ŽÆø®UÃòž†žæ¢¶¸ÿ…Îís“Ù7×›K´¹ÇJŠ¶Ð›‘„ᓱÕ5;î¸ã¸uúà?8†Œ·)‰@"ÐÀ¬3Awº<š$‰@"$‰@"$‰À´"€<SºëQ¡)„cé+ˆúZb?<ës}ÊŽôB ?C¦È„K-BÔå#wƒ|sžªÊg¿ýö›ÉË^\á.}#ô¯é>y!áÓÞÿËÏð’„3ÂFÈuÒÏþóÇB)´¯ëÚ²Úõ:!ŒðŽHô¶Èƒq1ǻֻ ¢®i‡•ˆk£ÜÀ9Ž#8ƒØò»¼TÛí×··Úþå/y‰!íÅ0°ÕV[/ûÐcÖÒ¨_x XÁå"Ñ}BzA´Æ1ï€)á9O¬&h‹¾VKóÑÿ¢£¶æÒîËíý’hÈÉù"’Æ<çãø°-£<õOµXA~\;j~ì3­±‚ÀJ¨<°„_¾gP¨­(;¶Q/¡²jì‡ð¬ôƒ¶Hlïn¨¥3Ž3X„´W8Åñaù8c7ÒO÷–QÞˆz[ó<ž‘¯f‡!†=%LO¬–²Ò­¯¸ßñÞ鋤Ïm"0?!DýüÔÚY×D HD HD`FŠCŒ’ˆ½éx_¶EZ¼ ty¨§ì.xdG;ß(/t}êt<‘½ž÷½ï}cèq ’‰×ªük²’A‘Žäé#Q&B9BʸîÎ;ï, ’<Íyo#á|„Sáa-¬ƒó}$¼ýÕ7<ê]õ÷;òR'/ ÑŽ>A!õ¼êw/jì’ðž ²+#«…™©5¡WŸëúm¥†EBAÇ!¦·Ú}°Ž²¬ˆxÒÊ™Š¼p‰÷1ai_'ëÛ§n}ÚŸ>•©Ö¡=wÔû}æ…>úŒ·®aôù¸¾=§ÄñAÛèga‹tþ+ÎÇñQ[ãQ«Ÿ {c|·måYÏ}Æ £žPGV3ø…´7‡Ôó× }£^ê¿¥ êªÊÃqí0jƒðlŒI ‹Œ5Œmܰ|à§ŽîiµA®­WÌõÆqô¹HøVßâø ­¸÷¤mÌk¿&êÍ©Þß  ’P?¼ìûŠøûmû^›éù ~ÿêæ7T²¾‰@"$‰@"$‰@"p¯ À£˜G0ê^|¼+y<%¼Fò&BÄ1[!nD!Ò£~ybí=<ªìÈGœèx©ªcW]uU)§MXñP¬©K‚$Bh!ç/^òYJq]èzÒI'Å¡B’ñ^~†|H]8&ßy#z„‘æ >tq­sld32W:!æN‚àŠüÚåÕ†“xù`í®Üv|iÞ¹¼ÃBŸ 補 uðä™9ˆðáɫʭ=T½0Ô sCi„Gg”¹öÚk—ëêk¢ü®-²N¶B }áxªöÁ:VT:#D›úyWø‰.†‹ÐFêu´Eºõ­cÿxûQßö¯ËÏï>øöÕÁ|¢ׯïM¨eÔ¼ÐGùÁ~Çu]žßò$áµ^vf|µç”8>hk|CÆOô3iãeÊ1¿ Ò­màÑnŒ!¤Í¯]¡hFÍ}Æ ¯{«NŒM!´â¶6gP}ãxÌÅÞm{†G«QjãXœŸÈV~îK¯{ÝëJ-F2FMó]_‰6¨uu­6rÇ;,HŒëX‘PÎøÒFæçÈ'ŽÚº'ê=4¬w@IDATã«­¶Z &4˜ëµi=—wå¡nõœÐ•&%‰ÀÄXhƤ½ßÄ/Ÿ³¯¬=3ælMS»D HD HD`ÎA &ó<”ÇÇq±ïa½o蕾µC*ñÔD„.´ÐB…88ì°ÃJèákˆW„>Od›k7Ùd“¢" ™ÌS‘ÎHòc=¶\/l‰cÈ«÷¼ç=…AœŽ*›>Èù"ýå!ì…—‚ÂDì^$2âF¹b‹ ;þøãKØÄ.ÒiôÞ÷¾·¼Ò¾º0>ÄGx’<3‘S;KJÏ£q‘E™¥°F¸DiS°BbÃN[Ê—7è53Âl æÕÅË_yð«oïð¼„=ïb+E<ú^o?¢I¸SBQÒGqDÁƒn›o¾yy÷€¶ôrØ‹/¾¸öp£ïy/´P ê-.42UØ› 1å¥]é¹õÖ[Û-å9¦îÚDóþ¤“¾aå€ÛÊÕžBa:ƒ,?í´ÓšÕW_}¦¼K†_êĈT—/#‘—ÑŠ{ '„ø(¬zê'F}š^ê¦/FÚ ÖÂ…œ{î¹…œÓf!ñ"Yžþú1 ਯêê÷õ¯½"ù}”wríe '4d"ý¨oû‡îö]õ×GáÛW}St}ßx6®y+7„žê3/ŒÒG~{ì±G_m‹@×ï½xÙsÝ—ÕQ’æÚk¯-«LÎ8ãŒFÛ“µÖZ«„ƒ);#¾\ãZ>ú®±D{[7cÿ¿ì¸$EÕþ‹œs°d$ç,,‚ä$(‚ñ¯ˆ‰OåSQEAø H$gœsÎ9çœþ÷WòŽuk»§kæÎݽ»ûžç™ééî §Þ:U=ýÖéÓŒ-töé¢ c†yq…·\ã”ý’ù±d âé͸@ˆctaq’…CÖÔW'è„íc?àÈ|wÜqÇʼnåsH]`Oû™‹Y|Їñ„M0ïA¢3V{NÌ×ÌU¤Ñ{2šÊÁÆxÂñÏÜÂÜÀ˜gžkæ<žŒaÇFè;p¡ŒcBQOФc™vñž0`îÓ"7×Wæjž*Æ bëóBYê¢èÍ<Ê¢å`?\ÇЕòXTåZB:I¹6 #®´«î<åå/8®ë7c*üÿ`,󇹕-‡¾S{Üí2FÀ#`Œ€0£!¼p”—¾ž|òÉ‚a1€ÊB¤!J|p$rØ_rÉ%#)à É…@BAd@ä# {î¹g¼1b_é´-©›|x„B’{ì±ìFÒf·Ýv‹¤u<ðé a[ ?Ð Bž"„ˆÂÌÄO¿ yDº 3íe¡á8A„ Ò_[Ž®" bâ—8ð|X €è‘§.^í»ï¾;Y"ŽÂÎö¤Ú>ûìÓªƒ°Â?H¼hVõj aµß~û…ßþö·‘€¦}bL_pÁìF^xŠŽR¡ Ž$ÔÉb7²K/½´Wny/dD"dB(…º¡=¼ñðÃ'žxb<]A°çïFˆ'+¾XÄ OÐ’ aQ……=-Ñ„5y 
Ú±%yç/OדN:‰Óµ"œë`kxâb3,´°`ªgû>ïdDc%îd_iùÝØQiÿgÕv´Û„o©¼d—'X|‚”¤Y¼ÀÆ…CɼФË1Wùu glèé Õ t cO8ᄺ,•Çñœf! [gú‚ .¶ÝvÛVú\7Æ+ãy;gÑBúB–s7§*¬i~,ƒÌ÷àÇ|,ýèú«TáÑ΢$8ãœüŒ³Ra1š;©°ðñ£ý(.*²ð¥1E°æ©쇭8Ú•C>æsæ$H~Í‘Ø"‹ "üI§qÎð\5÷®°hO§9¯E)Å¥ç¼l†ßœç¥Ó믿>»-a~N:è Ö¡ü<'t]n%ò#`"ãôM®ÿyãÌ+{#`Œ€0FÀ#`š=[ˆ=¶|ðf„ÜåCìf¼út ²9UÒ\Sy <¨!g!¨ªOfÈŒºóÊÓ®Ú©—K]b¸C¾|ï{ß‹„0žŒ9YIÃ"ñÊñXÄ£‚#'Ãò:›öi/zU2uí ¯hŸH6Õ'd DS•зô5øV d8}#ì°™ªö:hÑ¡ª,ŽÑ.¤.¬M<ÙámÀ»½J/Š¢ iÿ±/’¬]Ux“JϽ«°&dÞÁ¼â‰ŸÞwú­ÝœD™uúhŽmW~ª“~«sÝX\ÀÛ¾°8É‚Bý,\â!žÏ™uºiÞOçúxòryå•WîW}Éü˜fÈÇ`zŽßxųИ>¹Õ©íP8±(Y7äõŽª}ú\æÞ´Í¹>Ø+‹,Öj.ÊÓxß¡OPrmæ ‰úO5â?Ò¡ÝkgŒ€0FÀ#`ŒÀX‚@‘äu°´+GDsž·]¥…¼J ,O·rí—4mÓoÚ[׿ºvè)„¼ì&¸yl'y»ë0Áœ¬*·ïª|펕´!m„.Or4 à ¿Q¢wÖä»ï¾ûâÓ„¢€„%Ì ¤úðáÃÛ’ÅMú¥çé—ªúÓ4%¿»µ£ºþ?õÔS#1Þ®nÈÈ¢¾©}u:¤uÓ¾¦rHßm“·S‚ž±FFÀ#`Œ€0FÀ QðÊÄKµ@ºâõž’ÀíÒûÜÐ@€E…3í¿ÿþ1Œ ±Ÿyq&1› ͑ƞzŒŠ:xá¦e`n‹ÏÈâŠB"™0OUä~Éü8P}m;EÐù€Ê8ôÍPîëfŒ€0FÀ#`F }Àv¨„¾IMw5FÀ#`Œ€0F`¤!Púæ?o[ij¸"#`Œ€0FÀ#`Œ€0FÀ#`Œ€0F EÀD}І#`Œ€0FÀ#`Œ€0FÀ#`Œ€É˜¨É€»:#`Œ€0FÀ#`Œ€0FÀ#`Œ€0)ƒNÔãßHëóo#`Œ€0FÀ#`Œ€0FÀ#`Œ€0F A`P‰zHúüàÁd}‚¸#`Œ€0FÀ#`Œ€0FÀ#`Œ€HT¢~òÉ'UAÖ[Œ€0FÀ#`Œ€0FÀ#`Œ€0FÀ4¢/úo}ë[­íUß‚Â?Œ€0FÀ#`Œ€0FÀ#`Œ€0FÀ´4¢¾Uç?ìUŸ#â}#`Œ€0FÀ#`Œ€0FÀ#`Œ€0! Q¯Øô9ÀöªÏñ¾0FÀ#`Œ€0UÜwß}ᤓN ï¿ÿ~8ùä“Ã=÷ÜS•lH“Î~øá êõè£FlÞ~ûížÔ3²ôN•½øâ‹Ãe—]–Š¿O?ýôpûí·‡^x!¶ñÅ_!M§N;í´@}È'Ÿ|RdO7ÜpC8ûì³G¨êßÿþw¸è¢‹F8Þëuõ7Õóæ›o†çž{.|üñÇý’Òn0½ùæ›Ã<Þ{ï½~ç›vTîk¯½Ö”´uþÎ;ï §œrJí~ëÄ ü ½Ï?ÿ|xýõ×+K§=·ÜrK¸ÿþûÃ;ï¼S™¦“qæÌW²×¼í•4ìÖŠ ²al›æÖk¯½6œx≡›yMu5éTz¾¸–Ö5ÐtŒ1ÆÚM7Ým›¬’·Þz+Ú"64º s%öÚ+“¿ýíoá®»îêU‘.g¬¹ª•{mƒÔÝë´ã÷ºÀ´¼8 ,µÔRaíµ×އñª?òÈ#Ó$þmŒ€0FÀ#`Œ€Ûn»-’ãŽ;nxä‘G 3Ì>÷¹Ïn(Îã?¨·Y‘ „¬ÿàƒzÒü‘¥wªìC=&šh¢ôPlÄÖÜsÏ^}õÕ@ëHÔ~Ûì@.R×$“LV_}õðÑGÙu?óÌ3#”|Ýu×…yçw„ã½>PW»zn¼ñÆpÉ%—Ä$;î¸c˜zê©ão0øÇ?þžzê©VöqÆ'l±Åaذa­cu? ?üðHÔÎ>ûìa«­¶ªKÚï8ø=üðícù~ëD@Οzê©qa[Úl³ÍZ5€ÅñÇž}öÙÖ1°Øpà ÃüóÏß:ÆNÆu¦öÚ‹¶vcýP³#†¨§Žgœ±íÜÊbx½ûî»Aï!¬)z„êk„]è®]VÝQ¶|0°@˜ cmÓM7 éõáŒ3Έ‹EJ¾Ûo¿}˜tÒIu¨ß2ÿŽ;îË/¿|Ç}ѯ l‡ñ+®¸"ŽÙf›-;ۼˢe,³Ì2͉ R¼òÊ+áé§ŸŽsØB -TÃI:A ×v4XsU'mêµ vRw¯ÓöÜ£>õ¦‡”תॗ^u·W}¯»Ðå#`Œ€0FÀ1'Ÿ|2Ì:묣UÃFGx¨èÍÍ>RBÇ„_b»í¶[Øa‡ R·O‚·4D/õk_cÙY3ðx†¤Ÿp GÈÄôk¬±FØ{ï½ÃW¾ò•0ÞxãE2»ÎË7-äüóÏ$=„öP¼Ãÿò—¿„—_~¹RU0‚t^i¥•®»î¶Ûn»0Ùd“…3Ï%‰¥m›`‚ b_|æ3Ÿ‘º•[ú`â‰'ŽØËá‹„ê0œo¾ùúõ)áKè‹tQ‚•°'K.¹d¬=±ƒÜž¤ºb[xóÓ†E]T§úmÑ¡Ÿ$íp /°ã´ÏC=<"Û,­Ÿ{kð„ÌKú}©ç‰'žˆO² ÏØTh?}$HvtÅ6ïØ)m =Þ¾Øãwð§ïõ”!ŽÀ/úC=´ŸºŒ£[o½5öD=ÂáÉTêãÉ ¤j\Ôá3 ð«Ô TÁüèGlÀ;¢•¶§ÑÌA„’âçÈ‹¤6Ì>ã˜~æ &ðMçVÆxh\h.aža%Ï"‹,ÒÂŽòªºz1^(¼‹ÂpcyÖYgÅz8†¾n©€á<¸ç" û^xa?|h;mV\{ˆ ”5ìS¬Kô†L!¶5¢ú¸ÙÆSâ…úÿþ÷¿ÇÐ:O-»ì²aøðá1_Õä£úSçÑÂD}C»ð‚x‹Ó)Q§*ÓÔ=í´ÓÖõçž{n$à¨}éï*’ O ¤mÂò’*%ê!›iĹˆú’ú‰s­8ûÂtŠ)¦_ÿú×cYßüæ7# D<ñ\è—œÄTØ$ÚS'!x÷Bt`ï½$êßxãˆc,Ňú Fé;ˆ@°‚—‡<8@ 2.¿Øñ‘e[_üâ#Ñ F¹hñg…VˆD¡5¨Ÿ1‡7s*ùxn‡Â)-§ôw‰ 0oqÄ­9ŽvC´ò´¶¤á»ì²K«ZÈw0Ü|óÍã±Ô†é_„¶¦s+öÿ…/|!,½ôÒñ<øƒ·ˆzÊÓ íÌÉ´›ö#Ì­±é¢ë‘ÆD-ˆÙÔVY¸‚¨gžÈ‰zú@„:ó:BÔ#M×(úBs6ã ̘™?¨‹6 Œ Æ óSª<Ù÷…Íà„Ë¢zÓ~æ¶©Pá¶Òa_,,}ùË_Žs8 3ÒÑrì lHóùϾEÔ§mTÝõ\/eíÆu4 c ÛÂ^˜ØÂc~éK_j]›R=(/ÿ?€îðÌqÒ“öc3úØa¤hßtÓM^zé¥Öõ òš<ô‰l¸íûÂ!!à\r­Ê“ŽÏØå•ÌU¤kÚ_õŸ¤ ßÔõ[O‰zp`®«ˆzu„¼éÙg%Á>øàJ¯z.´\X¹øë°‰ü›¿Œ€0FÀ#`ŒÀX‡„„ 7X"“ ê ¹Ù ¹ b‚dÂ4@ÚrS|ôÑGÇ „Ã7¤˜ÿüç?#©Ë0±š!¥õaYˆ.Ü£@&ŠT€8çØ°>r[qª3&}´˜±qì±ÇÆ›\nÖñjC< ‰Í5eâå ²êª«FRâ ý ¤sjþôxmbÐ è@¨ÊF é!2!ƒ Nõ†`àå¾èôÕ¯~5’9}¼4”ob0Cl@ø°)J¿ð’XÈ ˆ)y™F>ýÒi*9"æùM» €öÜsÏ4Y׿ñÚ„ÁÔ´rD¤š ÇËRã&”¯Ý¶¤~ÈìR¤¤,¤ùSr´ª®œpc @Ôá©ZÕ*;ÄfÀ_ö¢sÝòÔÊ9眱QAH}ò8V,2`ÇýIÉâ a1Xp€;€Üd(3¯2ØêiÒ¦‹´9aÎ>ï\I:.Š¿Ê¬Û–Øy±7Ú«y1Æ‚c‹cŒ-H.<¦Õ¯`ÅøWÛR–>p6š߼ē1Àܤr”V[lhƒ 6ˆÞÏä§_À5µÅ´®^ÕGõî»ïuhãyç_þ¼É&›(I¿mÓ|¬ÄØä5cƒ…CHQpg@‚G8ó¬ÞyAØ$æÈ?>\kt½Qæhìˆ|ä,’z”§ûZ˜Š‰>ýâúÃüøẒ^{š®Q×\sM$E ÷yIßpícî¦Ï°?ýéOqìAŽ× s;sõ*«¬ŸÊaÎ&ìĨ„¾ázÀB"sõ1ÞXTåúÆu¹ŒE$È`Ãbô?¸6•ÕnKYضø™Ûü×`^hšÅdðÐõ˜1‹nU/â&a˜£!À©Ÿÿ={íµWÄAï0sl¥ôZßn|ÄŽJçªv§çòÿ$%ø–Ø`ZÇèø»g1꫼éDD=F¨ˆbÕ³e‚ $ŽBãð[+Ø1±¿Œ€0FÀ#`Œ€k€DÀë,ÈmHz„fnfõô.7ÂÜwp³.ò‚ß5×\3ZîK e‘zìCÒ#€dÂãí7·^|7ò@’ê y!}´ÀM6‹è„w%¤Ù YA8È Ý AA`¬µÖZ-ï~t†@K…²h ”‰ŽG,yñÌ“ŒH#éTïT'Àà±Á‚>´G,êÂu£6ŠD8K•KŠJÑ/ïyúˆ2!«Håéd«z!é!ˆÐ.'Úò…„Jô(© !ñFÅþØÖò/©‹4Ø&ä.¡R°½:Á~y"ïVá_•°¡ôùÒ$à 
AOûEÊÁ y( ˆC=YÆ 8ÐOŒÚ…½·‘Ÿ†„%Ùc="áÉøƒHƒÜ’¤ã¢øƒgŠ¿5Ç”Ø1ß1Æ5²+æ7ÚQ’ðHCrމú¾ðžÖ¼©ªù Ò¬N˜õroͩ҃Ô1Õ 6ðÉû›€h®Ô!‚B M8”êUZ?6Å}3õŠØ.­Céȇ÷+ýN\vÚX'xíB†±àÓNð8Ϋ„„%3ä ¶‡-ã4%Ãâ•D =iô,…¢^2„…4iðfÜq<¤(þà©ù‚òH¸õ×_?o²Aå…ðå“ ¶ÊÜiIøžÈO„ö"¹ ǃ}_SÚ×¢¥êÔñt[5scÍëêÕx‘ù¼À>‹xs§¶Aú’ù¸®\°aa‡24Ç3s„„y‹þS=:ÎÛÆÛŸ~¡ÏqÍs"ì9†ðä’ãÖ|éZÐîÅ tðmèŽ>„6ÒÂWMÑý3·36òyœ¶¤xèºÅâW.äGxò€<<‰®Ø,ã˜ù*¡Ìª1"ýEø7ª²ÓcŒƼõá1 Ç;=з k0bÁO¶¢²™{x*—tnÓÔ~uŒ|\ësûáÿ¢ñw²¯¦¶‘¼ôz•]»›ÿ'iÒ¡Ôk+MN”]ÑSçMO6ý‰`…‘W½¼ç9F~VR’žP8"ñ9o1FÀ#`Œ€0F`ÌG@$µ<KZ,¢’&íCw+Á yHîs䌔묛O<¿ëŠ#MCݨCL S™ì‹Ìá·HŽ”Â3›Z™:éFo‘èê¤:Ð]J¼ª•‡-^˜ZPHçªíiŸ*þ³òÉ+Rûí¶ØNU?Èv”/IÚ%¤ åËuÉ÷KêKB‹oKˆTl„p¥‚-œvÚi‘hÁN´àP•ÂB†úÈ#{W?B¥ ¡-ò6‰„T¾º-¶‰CÖƒ+í©\—g ÇEVi«²DLËòqÑ üÁ<·eï B¿¥^ÇÌCØ¥0ç‹,–A~r\‹ƒ¹ «ý¹^,”"éSÚÒm^W¯Æ‹êg|Òn‰Ú/|r^ó”Ò(OÕ6÷Ê#üÉ#|ÒüËñ‚|x|¹"^¤UÞ™1QÍïU੼ݲ$OŠ]"`Š BòB@êC?ó/¶}MÒȸ˜ á bbœü´©Î£¶¡˜¢Óê³Ü> äÏÇE/ðgNH1â·HL~7Ù æ8p×gËqòjœãíÌÜÃ;°Ù”ÔÏmX Ñ^‡Iò1¦ãlóº„í@Ç‹tŽìc·ÐàIÛpÒRÉ|3õ}éš¡}Õ“çÌ ihÆ4:¤œdæ 'œ x@¾€À|75aqRâOnD×€´Ò‰mJ– Ûv×(ðA'ÀYd#ì "Û;}_U„«æÒð i:Þç’Šœts[…Àæ˜Û¤íÄ]džS[Umu½ÄV%Ì­xíƒaéøPÞº-×'æIú’Ð:;í´SìC-Þ•`Í{(‡§ðã]3½n¯õuuwjG%sU]]%ÇKð-±A]ƒô¤uc³ix3Žåã‹ýÔFó}òŒ ùïd—µÕyÓó¨/ÕàŠįҊ!Õ°zÇãC¬–k‚SìúÔ“ž—4àUo1FÀ#`Œ€0F`ì@€+‘Ì¥-†œYj©¥"΋!« ãxá&„GJ²”–©tÜ×ðÒWîk¸q‡\p‡ÆÃŽú*ñ|&Œç©§žõ!F5û4_H(Á;FlûD¯.Š¢BåÏ ïˆyýÀ³°Xƒ÷õ•W^c\+ s ¿eg ATeÃR› „6ÊÜB@ª¶{êBy«¶Uuõj¼¨>l½YLbQ‚ßÄ—vØaq Bš¾nš•ù•p#Œ]HzæyÅøWææ6x.ækæ6úGžé`xÜqÇÅ9„Ål[‚ s"¢/Æfîagqƒù{Ð5+o‡®ÔÉœDÿ–\£x‘+öF"×õžx'žÂ€»Ã¾8–Ï è‰“­æv®1ØK*ÌÉÔÇÇy®yÌñp€ŒwÙmÁΘXd!OhŒ0—r¡èwæTöKÇG]ù:é/®’ùòyGp*Áš§èK®I¼˜¡.®J§tÛíµ>-ƒßÝÚ‘ú¡Ýõ2¯«“ý|Kl¸ÿÌ…,¼ñŸ ÛÊí½Òñ…MñBeV˜7«\óØî¶Ûn­EÀNÚÓmÚõªXžñìó²Íûýï?Ç{lkEPé÷ßÿ8À¿ÿýïÇ?qxÙ3à ‰Ã€f€··[Œ€0FÀ#`Œ€;€àpÑ|»Vscž ÙsCŒS^‚œÖGüB¦çiÓ|M¿!x‰çÙgŸ.¹ä’˜œnnäëtÆûYúÈO;ÞÅ…p#©rå´Ä %/IDo„D^Ò 9!r⢇Šdá o Ê"Ü‚BN(­¶Ýê –µgœqF$!­8ÉÇ‹_ú ñ /ŒÇÐgóÍ7o…«HûLENÅ Ÿ~­²Ê*±­Hä¡lbŠK¨Ò‚ÄD¾bÝ*]Ý–þÝl³Í"!¥~ ? ò¼f!Ií]šp<@ѯ[ˆŽA¢Ò·HIý´ {Âó’–ºÙ—¾± ì‹4y1ÓW—e?÷îÑizå[Â’°ðÖyØ›*ªŽÕé Æyz½2Ž˜ÚrË-#QY5.ºÁ¿N§ªã%6@>^rÌâsœâϳ˜—¿¨!¨˜o ‘*Ž'ú¾ ßðÖÜÁ8$ÔÄ-’cf_²gÒVÕÅñ^ŒUËü ùŒM3®XÜd!C’êÃ1ÆK»ùXùà£XÅ6Й9^6£4à 6éÜÆ¼Ž Ì÷òB×¼­¼”)¢ò’™(,´"ÃúæþtÁ!ou°CÿÓ_ÄOoºFQ.ýÉÜfz0IŽ¡ s i¨ï{ҥϜŒÎÄÞGØOIPò°àG›X,°xó§ÂâOh‘#=—þfžàzÉ“AšÃèw®—, ¥ã#-7ÿÍ»xo@:?Ðß¼K)Áš'’x-ó>Oã€% ‡rH^]å¾ú¼êd7×zÊQ™êÏní¨t®ªÒ½äX ¾%6H]¼‡…ÿNzJìp¤Ñoá¾>:Fº‘-ãôýÙøï³'ÖžzÓ+†$š ÝøÃÆUYŠüõ¯ÿü( Ç”—AF ( «•\x&8º%ìuáUÙÞ#`Œ€0FÀ#P„Â2ˆ-nl øÈ3LÇðЃÈè•@´B>l¿ýöÑS»Ûr!Û ¨{}£Åcú´=%¿KtnÒ‡2Áœü:¡nú@Ä[]:…YÑ£ßuéz¡7öǤb@WÕ…>ð"ýªÒെÇ- u†x=CÎTI)>UyuŒE"ÊÏu…ÂuŸ}öQÒ~Û&è7¼_›l²®~U¦z° :׋-68åöB·^—]â-Ë‚L:VÚ‹ÁÆŸ66Ù€phJ§m“ “®d¬ª¼vÛ¦º„cݼQ:^Ú•Ã9¤êP5z ïg¸)l²–ÅÅ<ÿþð‡¸°Ä\Uu=h‡KÝ9t}þùçã‚Q>÷ÔµŒ8—¾l”ò›t¶uØS&×£Tºy¡ Ǽ”W‡ež¶t¿d®ÏÇ‹GUa×Ò:éwÈ\¤Dï:¬™Cxù- š[à6ya5Çxâ¬Rz­oWW·vD™UsOæ¡ùòú±Y‰š¤ß4_‰ ò…:S›Vô3¢qÞ´¯|½Úò¿‚…bþw #ÿ•ø]ý/§ÃZåMÿµ¯}-üèG?йþóŸG£„d×€à'qDÒC¤3²ÒG i!é!ë÷Úk¯XN·$}‡Mpr#`Œ€0FÀ#`F1ò¬†´ˆÔ‘)“¼Üèå7{%:7é““-UzVÕ]•NÄ@Õ¹ôX/ôææ¶©mMúpÍ ÿ°OŸ"HuL7ÕSŠOZfþ»n¯xÅ]Îó°ß„7ÞMúSN]ýœCšêùOªî¿Ç’„°K…Hk7.ôh²éZbOJ˶ɆIÓ4VIS"Mu5áX:^ڕù:iÂŽ¼ðSMÒ‹9‡:¨Oá•ò:ëÚFUÒ¤S¶ÔG©ÔÍ M8RF)–i}M¿›ÚHþ\7ž*⩬vÂ/¢¾Dï:=à=á: 4gß`,TàY »à‚ ¶S¡£s½¯ÝÚŠVÍU<E(±v¶%D}¾iÙy?§çô;u¬Ð1mÑ%•¦ý4í`þ®Ý5Šl')dúw¾óQÏ#.<~Ã*ñåùè¥B¬ðh Çx¤†2 éõ«_ÅÓxôœ×c¡ê8‰0FÀ#`Œ€0£9x3†at’ÑQgð*zóTñcÓ§¯‡Rÿãq@Ø˘ÀP½DzdÚðȬ«—á=Ï<Ô´h ™‹çµeôB€Ð5 _3ØšCöÃsO~lŠp-„Õ!´Ë˜,\'}­xÓ·ÂÓUèõxÓÃiì¼óÎqÕHjᯗÃê[‘ôœ'.½DžùİRøëfëÐ7Ý æì°Ãbè›:€6ÝtÓHä+ÜM“þ¼ó΋/Í€è—P¾_ +4¼5FÀ#`Œ€0FÀ#`Œ€0FÀ1މúµÖZ+ði’4ÜMêI¿öÚk¯zõ#ÀâFÀ#`Œ€0FÀ#`Œ€0FÀ#0†"Ð1Q¿âŠ+Æ6Mxô½¤6 6,ƤÏÃÝ“ò¾NÓ‹—ÉÖ•ïãFÀ#`Œ€0FÀ#`Œ€0FÀ#`† õ„¦)¼â!êEÒëűm´‘IøÆ#`Œ€0FÀ#`Œ€0FÀ#`Æ ü2Ù&” öMÒ7¡äóFÀ#`Œ€0FÀ#`Œ€0FÀ#0¶"0hDýÁ1%æüðÃðË_þ2\}õÕ£¢úŽêD×{ï½7œ{î¹áž{î ìWÉ'Ÿ|î¸ãŽpùå—‡wÞy§*É?†þôË믿Þ3]Ï:ë¬pÀÔâÖ³Š\ЀŒþïT©¡ ƒtf,ÿú׿Ön(±å×^{-üêW¿ wÞyg+_éW_}5\tÑEá‘G)Í2Ú§{æ™g“O>9Ú·Ã 0F`ìA`ü±§©n©0FÀ#`Œ€0cÚwß}w˜sÎ9‡tsŸþùð“Ÿü$¼õÖ[-=§žzêðãÿ8L?ýô­c<ð@8ðÀÃ{ï½yä‘aã7›nºi+MþãôÓO”µÚj«å§´Í5×D¢o›m¶é¸3úEíè¸€Š 7ÞxcÔç7ÞÓL3ME ½´£ÁèÿNÛ6tÎ=ôP\|Ó~‰-¿ùæ›á®»î K/½tXxá…•µqûûßÿ>ÜtÓM­tÌ 
?ûÙÏÂTSMÕ:–þxôÑGã¢à†n8ZŽ«÷ß?0O2_M2É$á/ùKÚ¼Êßüqx÷ÝwG87Î8ãÄ2>øàƒÀgâ‰'ãŽÛßç•ãäŸh¢‰â¶]9y/¾øbxá…Â\sÍó§ç™+©k‚ &H·ê˜tÒIãqo¹îUÉxãׯ\ÊDWpI…üÌ£ÈøãÔî4 ¿ëtR:ÎôÑGÚmmiCÞŽÖIÿ0F …€‰úþaŒ€0FÀ#`Œ€yà ¡ôÝï~7Ì7ß|ñ €£Ž:*’J?øÁ¢"xØrÈ!‘¬Ù{ï½ÃŒ3Î ¨3Î8#,±Äµ‹_|q˜nºézNÔCö]ýõ¡¢~0':LÒº! – ޶£w©ƒeË—]vY$éW\qŰÅ[ÄÅæ™C=4|ÿû߯í±Ç‹}¿ÜrËvc‹E†ƒ:(ðA'Ä0óä~•qÄá©§žŠOï°H²ÓN;µ’±x²ï¾û†Yf™%ì·ß~q~lWŽ2²0s 'ô{ˆŗ]vÙ%L8á„1åqìk_ûš²Åíƒ>ŸÆÀf† öÚk¯H¾÷KôéΔSN~ûÛ߯=ú=÷Ü3¦=üðÃ#¯<ôûì³vã–E‚Ï}îsQ'!êœþùÃ;ì÷ó/ÎóäG.k¯½vØl³ÍòÃÞ7F CÀD}ˆw€0FÀ#`Œ€µ@&Üÿý‘PÂ{q¡…ŠÞ†hEØ”«®º*z”Î0à -E9‘‰’ ä ^•x.³Ì2aÖYgMOð»]Ý$&\Å´ÓNf›m¶pÅWÄ04K.¹d˜gžyZ¤ÇÓO?n»í¶0|øðèANHêE·É'Ÿ<Ö ‘åzë­×òŒ%=åãA/dƒpÚn»í"1Ïqˆˆ™ãŽ;.zß+-[<+ÉÃOMBêÌ=÷Üq!€óx;Þ~ûíßyç7,²È"#xVBÆ£dçI‡\z饑¬â7åBŽ/¿üòìVÊsÏ=n¸á†H¤/»ì²•iðâ¼ï¾û"^pO¼’Ÿ}öÙ~ ´‘Ð à>ÓL3…Å[,¤¶BaØÅÍ7ßËgÁƒþìT~øá¨7õ1ö•J Ö„æoÀb‘ð,k¬±FüM;^z饰øâ‹Ç…œW^y%¬°Â q†6Þ ÏWpÃ#7ìæ–[n <¹±è¢‹FrMžÀ²ÑUVY%ÖG[À‰±†ôÂŽJú?Õ·ê·ÚþŒcp‚lFJðmÒ 0•0.!Q9†ç9Ò4/¦DÒÕI•-C£ O,°Àaæ™g®Ë^{2xŠ)¦;î¸c\ðcž¡¿G¾Ì]©0?ðA°/Bå`„Í™}öÙ[6ÂyÚ|É%—Ä9‚rJæGò!Mcù‹¶ÓÌŸÌ5¹øJêÿM!<ºô£…“N:)<þøãý4ìí¼óέyš¤"û!Ä×\sÍpÁDÛÀ&=ð¨‡`O¥®Ò°˜€—?ós;[l}!ÕÓl¯Iþßÿû-/vô™l²É–[n³q •Üzë­-BŸ§›4Þ9¯¹9a¥•VŠs×1æýã?~„Å•Yµåš±ÑFõ;õ™Ï|¦ß¾wŒ€¨FÀD}5.>jŒ€0FÀ#`ŒÀ(@R‚ VìŸ|òɑІx€´ñ”’¯§œrJÀs0%ê!%Î>ûìV9„ñØzë­Ã:ë¬SÙ²¦ºÉtâ‰'  Bê€Ø—%1:Brá™ Ñ‘sôÑG‡_üâ‘lƒHAçTð°—ç"ÇE2¥!l ñÀâ‰'Ò¬ñ7^‘§vZ¬:ù½îºëÆô·ßûÞ÷"ƒNçŸ~$¾ðì!I yô‚$Äs“M6‰Ú y†P.ätQÉ®pô#y!ùR„§¯! Õ×ÇsL|Â/RHZÊNÔµúê«Ç¾†l¼îºëZD=D6Þ­Ͳ÷Øc°ÔRKÅ걕3Ï<³¥ ûl°A‹nhó2»¤¯(ÿœsΉÿð‡?Œ¹J°†û¿ÿû¿˜]…}Žcßx´C¶Ó~Hjúô¼óÎ x§²UiÞ°íØ. þôa• öe£,6A˜’†òx ãw¿û] 1;*éÿ¨\×ÚÏxÁ¶Á¢¾ßè72R¢ž¹‡ñËÂã¢d^(ѧ¡©‘8Om™E[ê§Ÿ!ˆç˜cަbú‡Hg,@0ÓÇ’/|á ±>æ§œ¨gADarÀ0(²7Æìþð½ÇÁŠÅC¤d~$]Ó$¼â÷³(wá…Æ<ÿû¿ÿ[®‡r‘UW]5|ó›ß,"õÿ“£ÿ7ÞâZPí&Dpu<ìó›ßÄ Ö¶ß~ûÖ£ò´+‡k6ýï|'^OÈÜF_CÖ3&?ûÙϪ¨ÆmÚ‡óŒêÏ…9‡EææÎ”¨WZæ -úqž¾`a§a±µªþNÊpZ#0¶"ðß™zlEÀí6FÀ#`Œ€0F`È €çDä0„ìŸÿüçH€BbB\u"_ýêW#J˜<ú •ð&®’ÒºñðÞç•ú׿þ56ÉZx§B=ó„A „$ŒB@~AĦ¤<ä;¤¿OZ¼HS’s´â6ù,¡q ‘ jˆÑÏo°D ¾ îð4E'¶ì|ðÁñ<^žôûüã#n´ ò˜ÒBåBˆW O HbcCJíºë®­Eå¡/ é·ÝvÛXuBðP WØJ®¼òÊøß*œ†˜$œm¤,𣯠!Õ µñ~ǾøàñÎK}K„rXÀ_#ø€1ö é‹4aÍB$=Ä8¡Ðõ[ßúÖQvˆÚ#Rýë_ÿz<†!ÚvÙNÙêGB1®N=õÔ˜F_´ýé/È[ôâi–ØQiÿK‡¦-ígá=iÒ„o/u(™šôijcÕyæ ÚYOßÓÏ<=Ò‰h1'KR‘w3óK.,8*Ô žÚàÎâ8 ižÿûßq1$išKÆà?ÿùÏøýÍGæ,¾6 sV‰ç}S9Uç{»í¶[Œe^Ì+Ùéœ]•/?Æ\Á˜VŒyg^Ö½–·ß~;Îq<¡D$žš`žm'<‘Ââ( Ã#`F&êGήÅ#`Œ€0FÀxÔ’âR¯C¼Vr»H…ø€dÂC’Ðdt•”Ö D¸ôhÇ[Q¸•篼" [ ‘ ‘œ „žªÕ Y@Ç*ïNBY x—„ u@~+” ÛÏþóÑ[–aa nx|âe߉@ôB0¦^¦Rh©€7ø|ñ‹_Œu±¸ðå/9æÅƒ/Q<šy2Aríµ×ÆE áªãlYt`q†Eˆ][Â˼HY AdOô¡ž„€@,Bk€=X!ØëŸþô§X_ Öx´#íz¡'ÿį…{¡=xUcw´ }´H@(!Ú Ù ¦ô#ѤÉÛHÌhìmÂ0!Äꮓ’¶•ö]UÇÁIäëÈÖ¡i^(ѧªMíŽ1GПô11ÂúùK_úR»l#œÓü€¤",u>=W÷›ù !¼ ÂxÃŽ!}ӅĦù±d 20Æx2„ß`Àøb±`°å€ˆq؉µÎ‡…ÆTÃÃSY,ì¡ÞûURWýJ¾ü #Ê`¬2ç1{-Zðd.¦ÏЧ'ra‘…®kÌØO¢u"<$ü´åé ‹0ÍtöO·¹<§0FÀ#`Œ€0FÀt…D1|y¬oˆj…¦ K©@Ô§¢ýªÇøKêVY¼4BEi F,çT!Šê‡ÜùÆyˆê‡øÍ%ï¿ü¼öéû¯|å+áoû[P±0°á†F/{éÑk…4ÊcãCÞé Õi—w,.ˆhUÞ' …<çž&È[K%y‘€s”´MiÚõ]ùUÇi{úä‰Êo‡¯Ò T‡’yAOb´ÓGöWÕ¾ªcZÊm9߯ʛ“g¶æ Ó<ªó:ÞnËžØ' jz‚H¾ò6Í%c'@x‚€'¡;ì°Š…ð=̃-ÌÍi\w=}Ö«`é±üw]9ºvh¼æùkŸP7Ì/¼¯…cŸE—|ñƒãÌû<­Åõ˜0H<õío»ßu¯ž,$+ —ÒU-8뜷FÀüõÿÅ¿Œ€0FÀ#`Œ€…@`@È‹ºIðNïË\’ þ#U„A'uWyÄã]š—«ú¤ƒôIÓá1Ù )òÓŸþt„8Ç|x­R~JVB @¤§ž¬ª§j«:!”R†tï¾ûn$¥ iÁOs¼·!áøNkÂ<ˆ`ª*?=&oÚ«Ãr^íç·Ê¢M¼tQB?òQ¨H@ˆz¼ê…;/;¬y¦‹„¬JË€!« 3“ 8tB¦ò¤á.xÙ%á8 ³øC»kÕÅé"HV$Õ¯“ß<…‚à›±´1÷¬î¤\Ò–´­¤ÿ;­7Mßkò¹#Ý/™JôIõ/ù- eóÊ“Ï):^·•iQLéþKçu¼iËx$LvJØÆw¾xëL™éüX2YÔ#äO3ðÁÒž9$¿šôíæü¦›nÚ²óªü<±ÂÜH¸Æ>á³£?M_Ws-ó6æM!s Ô·¹èçK„ë…ž¸Ù}÷Ýûeᩊtžæé<î%`tßS,.–¾#§¡:}úCõykÆvÆÛpû€0FÀ#`Œ€:àñ‡G0Ô^f‡w%Ï"°¯!¼!vr.„¸ ’(¤Gúâ½Ô{¸©n•CœèÔ£òˆõä„aZRÔE€@hAÎCºð’Ï*ÒLºþë_ÿŠyø‚$ÃcxXŸ¼$m‡Ž‰ÌaŸ²!‡w@ˆ}Ð…¼œƒÀ†l†Ì%´:MŒe+@IDATA̱p’}y}é‰^D˜z†So_ï\¼ÃY<>"èÕEà‰—ªˆQ¥ÑO^ÚA½©·*/ å…¹,†à¥à%«:—Xb‰˜/Í£2«¶¥àÄ–ÐA„ˆPø#BË”`­'* ÀDlbçU¡(ªthwL¡hÚÈ–…ŒÒ6¦åwjG¥ýŸÖÑÉï|Ku`>Á¾ÓÅÞ›JÓ¼P¢å}þN‰´žô7e"òZ×¹|NÑñº-ã‹1Äø‘‘V/SÖüR§[¾0€G;cŒ§˜_solÊnšKÆ ^÷ÃŒÍí·ß¾õ[…Í¡žQ!Ì{, °Æ#!¯óö ó.c=µ;ÚÃ1BqiÁŽ9žë óg*ÔI?ÈNÒsU¿y‰,ÂËÄ gǰfˆBÅŠ/Ùl:T$ó!#`z„€=ê{¤‹1FÀ#`Œ€0F`àf…?ûÙÏb\mÈ%<ª!* 
7ÒŠÐ!"òÚ!à(g­µÖŠÞ„¼0z‘E‰I!¯ ]6Þxã€÷cSÝ*](wƒ 6ˆ$7úAš ï‹' å£/Ä„ Ä.^º”9N˜ˆ)…„  0tųñÜsÏ/;¥,BÝœrÊ)±>,$oÇ x(ïôÓOžŸxBBº_tÑEñå¥Ô‰‡åi§Ë$´Ä! zy¡,ídq¯w…H™þù£‡=^µx¶ƒ%m॓ï¼ ò"\ ð!“ ƒH—Êúë¯ëûŸÿùŸØGÔ‡¾,Òè·¤Go^¦‰S½°ÀÀ‹‡±!HEâòŸwÞy±M–èÌ>ØÓßx÷B:B†A¶ç‹-Uu¡'8Ñ?„è€ìe1ú· kˆ><Þy¡.ij¨VxÙDèÆÊ‰'žIhHkˆ=B_`£ù»ÚÕÕ•ö»zÛ+Á·T†XtáÅ­<%Á‚”^Ê+šæ…}(‹Ð!³<’>£zÒ-e20n ŒW<¹ñbïTx7 U¼h™÷@°37P¾ˆÞ\7‘é¼|˜yT„"~±ó\šæÇ’1xÜqÇŰ+Œ/æ.-XðâÖÁ°ICK1·+Œ z$7}„÷9×¢c=6."ê©tlWï`¢_˜ki# uÌí̵Œ_{â½Ów\o˜s°WÆòâ‹/¯1aà ÌMZ˜Qrì›â(á½Ìi\£˜y7ý®@ÒJ ½RaîÔÓAUçYH­ #”–áßFÀ„`¢ÞV`Œ€0FÀ#`ŒÀAbéä“OÇsLÔ ršÊB JWò2'¼Â9‚@.á…©Œ@¢@$ŒÒi[R7å@bÃraÁ`·Ýv„ –:d5¤ d×7¾ñ˜‡pòš„„Oö}áe)ƒ 3í…ÄA8FįG¤¿¶ƒÔ‡ø†X"<ȼ6U¤®B!€#¤$ ÑaöÙgŸVú™x­C‚ó¢YÕ«-„ø~ûíÉOØ4Ę¾à‚ b¹|„:ŠˆKH>¤£N^pH˜‡¥—^šCµÂ‹x!¼Y¼€œFX\P¨ÚË?üðHds»‚„ -<úét&~6ñºÃ;´¶&¬Éâ¶$ïüe—]6.æœtÒIœ®á\—[ãŽØ ‹<§¤ŸwHS'iùÝØQiÿ×Õ_r¼ ßR Z!H!Uy’‡~$´ 6.Jæ…&}hSŽ¹Ê¯k/cCOoàIμ¥ÅºÓsüf–…Æô¥Ù½´¼¾Q±Ma3ô•£«ô 7\S2–«Êö1#`F>„päÚÌÿ(ƾþSøtäëæ€0FÀ#`Œ€0# ÐDÚæù|z ]9"šó¼íò(-äUJ`éxº…”KI¦ô\§¿io]›ëÚQGü4éÄÍc;ÉÛ]GAæ¤`U¹%xWåkw¬¤ i; ty’£IðVˆ½«°&ž®ùä“"{ºá†ÂÙgŸ=BUÿþ÷¿ÃE]4Âñ^¨«¿©ž7ß|3<÷Üsáã?î—”vƒéÍ7ßxàðÞ{ïõ;ß´£r_{íµ¦¤­ówÞyg8å”Sj÷['áí}þùçÃ믿^Y:í¹å–[Âý÷ßÞyçÊ4Œ 0g¾’½æm¯¬ á`·6ÐPl c#èÜ4·^{íµáÄO ÝÌkª«I§Òó½Àµ´®¦cŒ1Önºé¦h‹Ød•¼õÖ[ѱ¡ÑU˜+±×^ ˜üío wÝuW¯Št9ƒˆÀ`ÍU¨Ükì¤î^§¿›§Ÿ~ú°Ì2Ë„qƧ›ìµy˜¸øã0:OPµó #`Œ€0FÀ#`Џí¶Û"¹1î¸ã†Gy$Ì0à ásŸû\qþQ‘P:?~W·YÅ*CBÖðÁÅyÚ%Yz§:<ôÐCa¢‰&JE’bkî¹ç¯¾újlc‰Ú/c›ÈEêšd’IÂꫯ>úè£"{ßgžyf„’¯»îº0ï¼óŽp¼×êêoWÏ7Þ.¹ä’˜dÇw SO=uü ÿøÇ?ÂSO=ÕÊνü[l† Ö:V÷ÂñðÃDíì³Ï¶Új«º¤ýŽƒßÃ?Ü:–ï·Nôøäü©§ž&°¥Í6Û¬UXüñáÙgŸm‹ 7Ü0Ì?ÿü­cüèd\P'}&{íE[»±~ ¨Ù‘ CÔSÇŒ3ÎØvne1¼Þ}÷Ý0ùä“×”Z}XuUŸíüh/pí¼ÖÎs<øàƒÂTk›nºiH¯gœqF\,R:ðÝ~ûíä“NªCý¶pewÜqGX~ùå;î‹~e;,ˆ_qÅq Ì6ÛlÙÙæ]µ(ž°òÊ+¯„§Ÿ~:Îa -´P/Št ½¶£Áš«•öÚ+Ä]ýƒdàŽ7ÞxÁÃ^Ç馛.0y™¨ï¢.Ã#`Œ€0FÀŒ¾<ùä“a®¹æ­0:ê ÀCEonö‘^ÞBŠí¶Ûnñþ5>€/H¸ÿE¿¡$Ãÿüç?Ãã?&œpÂH˜¥úñ$ýk¬Yd‘H`ã! ™ýío»ÑïüóÏ$}¯õR{õïðýë_¾*a!Òy¥•V ‹-¶XÀsÎ<ó̰ûî»Çå*ãBúôbÛ o³Í6]‘ôÝÔÕ‹6Žê2ôÓL3MØ|óÍ#é Î,¦A²#î,‚@DþóŸ=öX8÷Üs£ý~éK_ªl†žŠY`zJÔ³è‹Ó,‹§Ýõ•ÊàଳÎvÚi§0å”S g­C`°ì¨®>ï ®ˆzUÁ@ÖŠ±Žu»`‚ ÂZk­Õmvç3FÀ#`Œ€0F` A‡ <_çœsÎ~-›ïhÈ×yæ™'ð¤o*xôAv@°áOþÉ&›¬•„° x,òÁû Y|ñÅcHb<÷9‡gmN@:á%ÉSÀÔMù©Téœêƒw3 SM5Uš-z‘S/^ËÀŸýìgG Nñ.Wˆš*A/ÈXt¤¼¾i‹DmÇS“² ‘ ˆºÕ‚oip™i¦™¢<$±DúÐ6îõè‹Ï|æ3:]¹¥&žxâˆ}ê¼¥¾Ãù曯_ŸB~Ñ;úŸ°'K.¹d<„žØAnOJ®ØÞü´aÑEÕ©~[ô@R¢¾ôvœö¡c¨‡§Cd›¥õC®€'a[À’>F_êyâ‰'ÂÚk¯‡)˜ í§„Ç,³Ì–[n¹pÕUWEÒzæ™gN“÷ûM›ï½÷ÞHlƒu¯„>‚0gü¥ÆÔ÷òË/G]±MÆ;vJH‰ˆ 0^ðxú^O9âlð¢?ôÐCû©Ë8ºõÖ[c@Ô#Œž,€@¥>ž¼@ªÆEþ1ÿJmBŒÀ~Ħ±¼ãágh{J´2JŠcœ#/’Ú0ûŒcú™ðMçVÆxh\h.aža% @ÂŽòªºz1^(¼‹Âæsú¼Y<Ä)Z¶ö“ô§Î’k}Ýø¨•ÎUÒ·n«±”ÿ'!}“-Wá›ÖÃu¼›iß±ËõF׋4ÏPú= ¢~(5ĺ#`Œ€0FÀ#0f 2‚›X 7ü×_}¼)çF2rË-· sÌ1GLÂÍî_ÿú×xÌ®žüÝzë­[Ä޴ܤA @.‘†Ð .¸`$]”ïÊ+¯ „ÑMœå«¯¾Zªί¸âŠaå•WnËu®ÒBaã7nÝ$^~ùå±~ áuZ„v‰|àÆò¬³ÎŠõp }!ÜRÂypóNö/¼ðÂ~øÐvÚ¬¸ö1"@)kاX—è ™BlkDõq³ý•¯|%/Ôÿ÷¿ÿ=†.ÐyúkÙe— Çùª¾ ÕŸ:Þ&êÚµÝvÛµ0À[œþK‰zˆQú"¬ äÁˆAÆã[#>²lë‹_üb$Á(-þ¬°Â ‘"´õ3æÖYg~ÉóñÜÿ<„S¿‚ wJl€yëˆ#ŽhÍq´¢u‡vˆ¶¤á»ì²K«VÈw0ijIm˜þEhk:·bÿ_øÂÂÒK/σ?x‹¨§<Í£$ .ë{ß„ó /óð•ÖÅ~¯Æ esÌ1q1Ž~¦o˜˜³Á£Jšæ5ÙšpÍ7”¯±E¹´þÖ»h;s2í¦ýs+dl:‡èz¤ñQËb6µU® ê™'r¢ž>¡Îü†ŽõHÓ5Š¾ÐœÍø3æEæê¢ c‚qÂü”êOö}a3GuT M†Þ´Ÿ9‡m*ÔG¸-„tØ K_þò—ã\Ž,¦ 3ÒÑrì lHÓ"êÓ6ªnÈz®—²Çvãƒ:š„1†ma/Ì l/½ôÒÀº6¥zP^þÝ<òÈÀ'=i?6©mFŠöU䥗^j]߸‘‡>‘ ²·}_8$œK®õÂCyÒñ9;¢¼’¹ŠtMBû«þ“4á[bƒº¾`ë)Q\wMÔ7õŽÏ#`Œ€0FÀ#` t \¸Á™QáÈ;HÈÝúèøÄo}ë[-‡oH+<0!=éÁ0±š!¥õaYˆ.è‰" Î96¬ÜAœêLS¤ðæ=öØcãM.7ëxµ¡?ž†ÄÇæÆš2ñr†@YuÕU#©q …~Ò9+@ø+´ÉK,Ÿz†l@‡½÷Þ;–MzHzˆLÈ È¤S½!x¹/:}õ«_dD/ 寛Ì>,d@ŠÒ/¼$²bJ^¦QO¿À‡t`š dŽˆy~Ó. 
=÷Ü3MÖõo¼6!G°õ퀩¦Âñ²…ÔEx²¼ åk·-©r»)) é@þ”­ª+'Üu`U}¡2°Clüe/:7Ð-O­œsÎ9{õ„Ô'cÕÁ"vŒÐÿŒWgÖ[o½èA )¹É"Pg^e°Å#!mºè@Û˜æìóΕ¤ãb ø«Ìºm‰ {cEó cŒ1ÆÇ[\xL«_ÁйEmKmXúཪyñÍK<ÌM*GiµÅ†6Ø`ƒèýL~ú\S[LëêÕxQýxT+Tm<ï¼óâËŸ7Ùd%é·mš•¼fl°p) î,HðgžÕ;/›ÄùLJkW¾`À‘œERòt_ S1ѧ_\˜7\WÒkOÓ5êšk®‰¤è®»îÂè®}ÌÝôöñ§?ý)Ž=Èñ:ang®^e•UâS9ÌÙ„Ñ‚•Ð7\XHdî€,e¼±¨Êõës‹H,1þÀ†Åxô;]÷U&×K]sYì†à¥M|èpm*«Ý–²°m-ð31 ·ù¯Á¼Ð„5‹ÉàÁSèŘE·ªq“nçwŽ×QpêçÏ^{íqÐ; À[)½Ö·Ÿ±£Ò¹ªÆé¹ü?I ¾%6˜Ö1:þ®Ü6:¶Ä:#`Œ€0FÀ#0F ‰€×Y*Ûô7ÌÜÌBÜ"ÜCqS,òrrÍ5׌„–R©Ç>$=ˆ@v!<ÞŽps áÅ‚<¤:C^H-$p“Í":á] )@vADBVŽrCÞ¼E„…¤GÐ-Ê¢],,P$:±äÅ3OB0"¤S½SDƒÄ úÐ^¼°© Ô6Ú(á,U",E(* D¿ž  (2±ŠôPžN¶ª’‚ý àr¢-_H(Á¡D’úÁòoTì}a-/ñ’ºHƒmBî*Û«ì—'ðnþUiÁJ?/MÎô´_¤$’‡Ò€8ÔØÓ˜e¬€ýÄx ]Ø{;ù iHX’=öØ#žŒ?ˆ4È-I:.z?x¦ñ[sL‰ €óc\ó [°b~£íõ!Y0a4´!·á˜¨ï ïiÍ ªšß Íê„ùW/÷Öœ*=È“×Õ«ñ"}БyŽva£¿,àVIÉ|¬|«­¶Z˃›k ¸¨-JƒÍAÂbÃØŽÆ‘æt¥Ó–þ`a {×µGÄ6mHEóŽ<öÓsu¿K¯QŒyžÀV¸¾²Ð‘.¬Ô•ŸçÚÄÜÎbØknOÓ`Û\ãcê#ì›Ñµ}l»@ÀŽ<š×Òòø­6BÒƒó„?ׯAÉøÈˬÚ#ÛÇnПEÚ‚Hvÿ°CBÖèzŒ¾è]%Ø®®£š÷x!¯ì ÿ?Rz­oŸUº”´­d®ª*»îXþŸ¤D‡¬«ot9îÐ7£KOYO#`Œ€0FÀ#0 Ë ò°Ì³:÷ì$” q|‘•9¡…@ ©<ÞñDßdCê9 ñ’ŠÈ—ƒ:(=‹Ïu®Ó‡›R>ºCè†<ìû‚Œ™¡öå!Ø—^.òÎ<ðÀULk‹ŽZ¼ÈÛÞÞZôÈuÒ ú@@ÊKZŠ@æÇtŽ-ÄX@¥¢>Ô1Õ 6ðÉû›€h®Ô!‚B M8”êUZ?6¥'Dl—Ö¡täÃû•~'.;m¬¼v!ÃXði'xç‰UBB‚‹rÛÀÖqš‡’añJ¢…ž4 z–ˆB Q/ÂB‹4x3î8ž Ò ðÔ|Ay$Üúë¯7Ù òBøòÉ[en´$| Oä€'B{‘܆ãÁ¾/)íkÑRuêxº­š91‡æuõj¼H‡|^`ŸE ¼¹SÛ }É|\W.ذ°CšãÀ™9B¼Eÿ©g‹mãíO¿ˆÐç¸æ9öCôÈßÿœ­þÖµ Ý5Šèð¢Gwô!´‘¾ªK±‘Ïã´%ÅC×-¿r!?“äáIpÅfÇ,ÈW eVé/¿i|T•c¬0nä­OhBÄñNômÂŒXð“­¨læž„Ê%Ûôß#µ_#_'×úÜ~ø¿„h|Æì«©m$/½^eE×îæÿIšt(µÁÚ G“eW´Ñ¤1VÓ#`Œ€0FÀÑ‘Ôò<,iˆvHšT´IÜ­@C.ä!¸áå&ÉuÖÍ'žßu‚N Å‘¦!nÔ!&Ž©LöEæð[ $GJáÈM-„Lt£·ÈNtHuRèƒ.%^ÕÊÃ/L-(¤Çs Õö´OÿYùä©ýv[l§ªd;Ê‹—$íÒ„ƒòåºäû%õƒ%¡EH‹·%D*6B8RÁN;í´H´`'Zp¨Êá!C}ä‘€=„+„¡RЃÐy›DB*_ÝÛÄÆ!ëÁ•vŠT®Ë3ã"«´UY"¦eù¸èþ`žÛ²Æw‰ ¡ßR¯cæ!ìR˜sŽEË ?9®ÅÁ܆Õþ\/J‘tŒ)mé6¯«WãEõ3>i·DmÈ>9¯yJi”§j›{åþä>i~ŽåxA3~ÀŸçš«É'ÔS*Kûê3o·-¹F±ˆMø(P CyzÒž§µô$F»:8‡þÍÂDéÿÌ/aÅ"_º˜Š­ #úŽ'2冷:!UX$K1Ryl+ŸéqýVMãCéë¶èÇÓKÌèÄÜD0æC¼â›°FtÕÓ;ª‡9$¿Né\éVØ€]§×ú’:šÚFà\Õù¸)©¯*M“¥6¨²sÌS;U𡏭~îpiÊÀÍEª¸Z#`Œ€0FÀ#`F9ÁS¢„97pzÑžòW©"€•¦i+Ï8ˆDÅ ÆëútÓ—ëLZÎS¿ÒP¡D W¹ï¤äÞGž¦œ‡…·¨<]ÓvQž'BYܼëóF–Žj³<ÿ“²ÿw7z  :_W/;…»rBB]´½Êë”0?èŸKÚvÎi_Ø@XSW*`X*”“÷ú¤Þô”Eȵ}ýn‡¤hq|i³_R?‹ÈðáÃãËñ"­òÎŒ‰j¾x¯O-àí®%yRìSl’Rú™x±EèkÚ~DÆÅ _ËãÄà§Muµ ÅVŸåö!‡è|>.z?sBŠ¿Ebò»É5Ç»Æ8[Ž“WãogæÞ±€Í¦¤~nÃöêåÂËǘÒu²Íë¶/ÒA:²ÝB@ƒ'mGÀI H%óqÌÔ÷¥k†öUOJœ37¤¡iÓè>Ñ™y 'Ä(Þ/ 0_ãMMXœ”øÇ“Ñ5 m‡tb›’å¶Ý5 |ЉpÙ{ƒÈöãNßWáªy4<…ÃDšŽ÷¹¤B"$·UlŽI°MÚNÜul8µU¥ÑV×KlUÂÜŠ×>–Žå­Ûr}bž¤/ ­³ÓN;Å>Ôâ] Ö¼G€rxZ?Þ5Ó éöZ_Ww§vT2WÕÕUr¼ßÔ5HOZP76›†7ãX>¾ØOm4ß'ÏÈÿ.AŽŒÚÚÔÁ…G‚,FÀ#`Œ€0FÀŒ½pc%’¹È™¥–Z*á¼X² 2ŽnBx¤$Ki™JG^úÊ å¸q‡\p‡ÆÃŽú*ñ|æå©§žzjÔ‡ÕìCb@Ò–r!¯‰· ‘Ãy‹e—]6VÏM9ä /àã~‰vpó¯ðÒ‘Çòi7:R//•|ë­·/¬“nôÆûLÑ•z؇¨añ|âÈC àyMxtGnz!TrA†UõxBr‡²‰z sE|Ap/bÒ KN²ÇÂk¾ðâ¤-êÈbú%½Yçæ>_H(Á;FlûD¯l½YLbQ‚ßÄ—vØaq Bš¾nš•ù•p#Œ]HzæyÅøWææ6â´3_3·Ñ?òLÃãŽ;.Î!,¦`ÛT˜Íx16ss8‹Ìw؃®Yy;tm¤Næ$ú·äÅ‹\±0‚è¹®w¨ðÄ8ñÞþØÇòy=ñÄ×ÜÎ5{I…9™:ÕÅy®yÌñ„b¼Ë®h vÆ|Ä" yêDc„¹”ë }D¿3§²_:>êÊ×qHôäÉæÊç©kžR /¹&ñbzl„vvº¸*Òm·×ú´ ~wkGê‡v×˼®NöKð-±Aâþ3²ðÆ6l+·SôJÇ6Å •YXaÞ`¬rÍc»Ûn»µ;iO·i•¨­ìv¢ ä˜h-FÀ#`Œ€0FÀŒ=@ðp ùv-çÆ<³ç†˜Gúñäü°>â2=O›ækú ÀK<Ï>ûìpÉ%—ÄäÜpCxp#_§3ÞÏÒGÞxxÚñò;„I• ¡pÿÄKÑá^Š—´BNˆ\…¸‡èa±@DøÈÈ„²· J«m·zƒ%DígœÉ@H+ŽAòñâW„þƒ„‚D¼ð ã1ôÙ|óÍ[á*Ò>ÄS‘S1ç_«¬²Jl+y(›˜âê4‚`ãq‘¯X·JW·¥7Ûl³HH©ÈÏ‚‚<¯Y@R»D—&ÀPtÁëâ€c¨ºO.©Ÿ6aOxžBÒR7ûÒ·ªm¤‘È‹™¾º,#øyQ+Dtš^ùFÆ–°$,ü€uö¦J§ªcuz‚qž^/…„Œãƒ0¦¶ÜrËHTV‹nð¯Ó©êx‰ —³8Ƨøó,æå/ªG*æB¤Ê†ã‰¾/È7<€5w0 õq‹äƃٗ왴Uuq¼ãEÕ2ÿB>cÓŒ+7YȤúpŒñÒn>V>b¹³0Šm 3s¼lFiÀlÒ¹y;A˜ïå…®y[y)SD=ä/$óµ×^ZI3¬oîOòvP 1ô?ýEüô¦kåÒŸÌ`† s‘táº0§†zð¾']*øÌÉè,G[öS”<,ø±xÌb¹0€„Å›?7xŠ@‹é¹ô7ó×Kž ÒF¿s½da)i¹ùoÞíÀ{Òùþæ]H Ö<‘Ä h™÷y,Y 8äCòê*÷ÕçU'»¹ÖSŽÊTvkG¥sU•î%ÇJð-±Aêâ=,üwÒS*`×,,H£ßÂ…}}tŒt#[Æéû³ñßgO kgâgUŒUÌô1¾4;+4í]IÓæ¿™lèŠÏ•Ÿ/Ý×…·4½Ó#`Œ€0FÀ±þƒ#l!ƒØòáÆÒ<ÃtŒÿ뽈Vȇí·ß>zjw[.du¯o´xLŸ¶§äw‰ÎMúP&˜sƒ_'ÔMˆx«K§û3=ú]—®zcxL*tU]è /Ò¯* ^kxܲ°P'`ˆ×3äL•”âS•WÇX$¢ü\WÈ!96YËâbžÿøC\Xb®ªº´Ã¥îº>ÿüóqÁ(Ÿ{êÚFœK_6JùM: 
Û:ì)“ëQªGݼP…cÞFÊ«Ã2O[º_2×çãƒÅ£ª°kiô;d.R¢wÖÌ!¼ü–…Í-,ÞðÂjŽñÄY/¤ôZß®®níˆ2«æ*žÌCóåõc³,5I¾i¾ä? u¦6­2ègDã¼i_ùzµå Åüï@Gþ+ñ»ú_NjÅÈ<ÔØ’bùsÊŠ+<%êKês#`Œ€0FÀ#`†ò¬†´ˆÔ‘)“¼Üèå7{%:7é““-UzVÕ]•NÄ@Õ¹ôX/ôææ¶©mMúpÍ ÿ°OŸ"HuL7ÕSŠOZfþ»n„{ZÅ]Îó°ß„7ÞMúSN]ýœCšêùOªî¿Ç’„°K…Hk7.ôh²éZbOJ˶ɆIÓ4VIS"Mu5áX:^ڕù:iÂŽ¼<ñÔ$½˜s¨ƒú^)¯³®`T%M:5aK}”‘JݼЄ#e”b™Ö×ô»©äÏuã©"žÊj'Ìñ"êKô®ÓN§aB'ÍÙ÷ xÖCÂ.¸à‚íTèè\/Æk·v„¢UsOFJ¬€m Q_‡oZvÞÏé9ýN+tL[tI¥i?M;˜¿«Gwkä‘VäJ…Õ=ºRšÇ錀0FÀ#`Œ€ýÀÁ‡0 £“ŒŽ:ƒïPÑç,âÇfa( ‡„m°Œù •qÑK¤G¦ ̺z‰ÞóÌCM‹¦¹rJíeý.kp tÂ× nM!’ý„]!JžüØáZ«Ch—1Y¸NúZ9ð§o…§ëÐ7¼ ‰˜Gaˆ¿”òĘâ½<>”oR¢ž—)±â¥ØfMyêÎ;ôM2>nŒ€0FÀ#`ú#A…°åÿ=[>ü—ç??ŸÁ}Ó_#ï#`Œ€0FÀ1ž†¾!–+‰¼TAB¼ÀômÖ:î­0FÀ#`Œ€0FÀ#`Œ€0FÀ#P@W¡oð¤á…b¼)z^†@¼%„øUxàXŒ€0FÀ#`Œ€0FÀ#`Œ€0FÀfÆmNR‚7 óÒ>’i§6¾,AûÞ#`Œ€0FÀ#`Œ€0FÀ#`Œ€0í蚨ϋå¥Ë/¿|ô¨ÏÏyß#`Œ€0FÀ#`Œ€0FÀ#`Œ€¨F «Ð7UEM:é¤ñeS¯¼òJ¿Ó¼=»êe²{÷ÝwcZÂåL6Ùdñ·¶ý ñŽ0FÀ#`Œ€0FÀ#`Œ€0FÀ1õ}ôQxï½÷ÂLn¸á†ðâ‹/F¨8Ž¿¾Nn¿ýö@¼ûÅ_<Ì2Ë,ý’)¿ƒÞ1FÀ#`Œ€0FÀ#`Œ€0FÀ#0†!0`¢Ïø‹/¾8L2É$á­·ÞjÁsß}÷ÅßãŒ3NëXúc¦™f ‹.ºh •Cl{^@ûÄOÄ$ï¿ÿ~ëwšÇ¿€0FÀ#`Œ€0FÀ#`Œ€0FÀŒi ˜¨Hö”¤çþwÞÉÏJyøá‡Ã +¬^xáxrþŽ;î¨LëƒFÀ#`Œ€0FÀ#`Œ€0FÀ#`ÆTzö2ÙN‚Ø¿úê«Ã;ï¼ÓiV§7FÀ#`Œ€0FÀ#`Œ€0FÀ#0Æ 0ʈz|ûí·#Yÿ /„ûï¿ŒÕ 1FÀ#`Œ€0FÀ#`Œ€0FÀ#PŠ@OBß”VV•úë®»®ê”#`Œ€0FÀ#`Œ€0FÀ#`Œ€ã¥õc<ºn 0FÀ#`Œ€0FÀ#`Œ€0FÀLÔ7äÓFÀ#`Œ€0FÀ#`Œ€0FÀ#`õƒ‰®Ë6FÀ#`Œ€0FÀ#`Œ€0FÀ#Ѐ€‰ú€|Ú#`Œ€0FÀÑ ?ü0üò—¿ W_}õW]ï½÷Þpî¹ç†{î¹'°_%Ÿ|òI¸ãŽ;Âå—_xÏ×è(èO¿¼þúë=Sÿ¬³Î p@-n=«È ÁèÿN• :HgÆò¯ýkí†[~íµ×¯~õ«pçw¶ò•þxõÕWÃE]yä‘Ò,cDº—^z)Λ—]vYxñÅLj6¹FÀŒ¹Œò—ÉŽ¹ÐºeFÀ#`Œ€0FÀŒ  µï¾ûî0çœsŽŠê‹ë|þùçÃO~ò“ðÖ[oµòL=õÔáÇ?þq˜~úé[Çxàpà†÷Þ{/;òÈ#ÃÆo6ÝtÓVšüÇé§Ÿ(kµÕVËO hÿšk®‰Dß6ÛlÓq9O>ùdìµ£ã*2ÜxãQŸ7Þx#L3Í4)|h ôÒŽ£ÿ;mÛPÐA:?ôÐCqñMû%¶üæ›o†»îº+,½ôÒaá…VÖÆíïÿûpÓM7µÒ17üìg? SM5UëXúãÑGäö†n8Z«O<1œsÎ9iÓÂ&›l?ýVì|üñÇáÝwßáÌ8ãŒ&™d’ðÁÄÏÄOÆ·¿,çÈ?ÑDÅm»rò XLxá…Â\sÍó§ç™;©k‚ &H·ê˜tÒI׿v‹¹äMó³pJ{ÒcŽþœã8mo¼ñFÐGuUa@:Ïï\¨“r-FÀôGÀD}<¼gŒ€0FÀ#`Œ€)àûþûï‡ï~÷»a¾ùæ‹OuÔQá/ùKøÁ~uÀÃþC‰äÌÞ{ïfœqÆQÆg„%–X¢v1ââ‹/ÓM7]ωzȾ믿>tCÔ¨àÄB‡IúÁ@7„Á²£ÁÑvô.u°lOrÆíŠ+®¶Øb‹¸8À¤ýÙ©<üðÃQoêcì+•¬Ÿyæ™H¢²ØFx¯že5Öˆ¿i¡;_|ñ¸óÊ+¯„VX!.ÐÐÂ;á• nxংÝÜrË-'7]tÑHŠÉóW6ºÊ*«Äúh 81Ö^ØQIÿ§úVýVûÑŸq NÍH ¾M:`”¦Æ%¤)ÇðöE¨ì‘°9³Ï>{ËF8O›/¹ä’8GPNÉüH>¤i 2Ñvúƒù“¹/ìRaÃÞ!ºe÷Ì£?ÿùÏãØo"êUÏÎ;ïÜš·9&Ïsò¯¹æšá‚ .ˆ¶‚",‚àQO½©Ô•CšGûž``–yÙb›'tRøíoÛoA[l'Ø$¸ä7¿ùM`œkæú%ÁæPú’kh.»îºk|òâå—_ŽýK{ç˜cŽXžpÍóh_çÁIøè\)þJï­[0Q?¶ô´ÛiŒ€0FÀ#`F !ÉiÅM>û'Ÿ|r$´·ÜrËHZŠxJÉ×SN9%L9å”ýˆú[o½5œ}öÙ­rð¤Üzë­Ã:ë¬S‰DSÝd"”á OòPÇJ+­Ôòª„ FGH.úèð‹_ü"’m“M6Y$`Ð9<ìS²äñǧÓ6‡xàã ™ Œ§vZ+,¿×]wݘâö{ßû^$aÑéüóÏÄžý"$‰!OØ ô‚$Äs_¡"h§b‘¿ß¢Ü9çœSÎyík_Û›®G½µì·ß~“}÷ÝwI¿Í ¦uù¾ï¢¾ûÊï¹çž…Pa·Àa¡í€X\pŒz§ÕãÚÀÆì®/„3öÈzst“M6‰ª¦nù¤6R}ã7¾Ú>cÉÞøü€q_½°ˆÓŠ…Zý7,ÜrÈ!Ågñß–ïúM÷V§®r¹/H®Ù B"$‰@"$‰@"$ë 'Ÿ|r!é‘ÃÙcŽ9¦ HLÄÕ,‚ÜxÚÓžVÈPiH%ÑÄ]2¶mžã?~‚ A$#´D×¢ļ´R ]DGä"¶&åEP"ß‘þ!"iE‘Ö$›cú‚¸m_>+5šrôûK‚øBܽâ¯(:ÙúýÆ7¾±Õ‰¤GìuÔQ7}CæDT<± ^„x—xZ t %*3âc¤—–a­MÑôÚ±`¨"° ¹ð ËW¾]‚œFLJ'wuÁÏX!¦hHmÑïìËG4:âÝK}Ljz,`À×b„ŒÙ'Ò— am¡GŸãRÑõÐC½Fêb7D¢?Aªxàe#HmÂ.íêŽq”ª}Úi§•2ñG_èo¼·ôò4ËJìhìø‡C[ý·pCOý!CøÎS‡1~aHŸ¡>vç+ôYoì³ÈçY$óøà’ËžžüŒ§Rj>¦¾Ã’>ÎáçØ–ù8o±¨Àÿ[ıȈ¬Ÿ&|…'Èæ›o>­hK Dý ÀËSD HD HD`¾Hƒ”@|"ADJó"r{A¤":L"¥~ Èè.Û6âWzú ÚEk“Hu‹ü(Hi+¡ˆ\Dr+5‘ªˆ%O„ »¢9¥² "ÈÇâ[ÈïHec»ýöÛ—4H‚°GØÂM„§(ûYÑ‹«£J‘PmêxÃg§v*m!ÖDÁ:Wj‘¾"š=™rñÅ—EŠÀ5öÛ",ÎXDAì¶$ÊÝ‚hZ‹!$ìÉF~iâX¹òÊ+ ö°"ìõè£.íÁZD;A¶Ç =Eü÷ˆ‘îE¤`wúI`aÑ' ¤ÒWd+L£|eÚ>ŠfÏ"¶¥a"rS÷ɘ¾ÿ¾6ºöÉždmë0äÆèÓÕ§iûøãiŒƒ5Î{ï½÷´Ó®v,ü;¨%°Œãõ±¾ïü‘Þ†˜oìXnòz!qÈ?Ž™ƒ|€9æÉßa`~Y,X®8Ÿà»cΩK¾w¹ßãcá±iy<¥e¡®tP}xñ{_=ÆÙyíGN4wù@ósÞâ‰þÄ¢ë€ëC<1U·e‘ÄGê}w ‘.lñEàgë%ê)‰@"ÐÀlÿévב{D HD HD X1ˆb9¥Û\߈êHM!ËXAÔ׿‘À­Œi;Îñ’@JÒ&—s-RÔí#wƒ|s\ª¼Éêyå+_¹$Ê^Þö.}#õލé1uH!áÓŠèÿU 9—á,‚×ꓱØm·ÝSã´çuý޶ÚþK‹ðŽ'$è­¨ÃâbNt­w D#E—D»s”Ap±í#~[iǯ=¿ýþûï_¢sc¼, <á O(Qö¡Ç4¬ãé‚67>².ž`ˆöt5Qgq!ˆÖ(ã}±Ð"ržxš ¶VKó‘OàÜ'cúe¦_ý]ûõÝ܉ú§áeVªÃ¿ObLÓ'ì/ú0´å#HkËíïRhÊD, EÃÆñØ?m‹0†'û´ OçùÇ1sÐ ž ð$Ô±Ç[Ò®ˆüîKµm÷mEÒÓ×bTF¬¯|½Ÿ¯ö®”x!~ÛvÜõqßûê‰kIÌßö¼5ñÛBôBð,¸l±Å¥©ÄöÚk¯%Mº&XVμòäŽkƒ'¯Æ ?c1¤^؉}¹M«H¢>-!HD HD HÖ 9¹¨‡$r 
G9Ñ—­HRË_þò—ò³+B}–¶»"â‘m½Ñ^èúÔåDÌ#{¥°yõ«_½$Uƒó|¢VÕ_“•4%c h¡)eÔÅW I ‘梷‘Z>HÖÒ:¡ä¼iÑþú[§žˆþ;7êÒ'‘š!ÆÑG*‚DÔ‹ªÜåTî’ˆ’ ²«Œ)"«¥™©³©žÔÞB*é8äôö‘C{ ÖÑ–_hHV$öͺõ ñ>†šˆÕG6³Ó·1ã¿.éÐúŽú÷¿0“Yû†ÍÇù­O‰ý}Û°³X‹r‘þ+ŽÇþ¡­ù(M;•öÆünZÕYûÇ1sТžTGžfð‘þ iχÔþkH_Ç¥µ’ßߢ_ý´Ò˜s•Ùc=çt×9ž`á+¥ÿá ¤ÓŠœýuù¾zø^~Üû†Ræx)ºyÜJìs|ŒXŒ%ÞKáâ½%-Qo2l\9ãÂ'ÏBÔë—'/RD`ë ɉ@"$‰@"$‰@"¬D‹A"òOt¥ˆç °¤×AÞÈÜŠ7¢HéQ¿<±Žj;ê‘'ºŽ üÎw¾SÚi +iZj‰ü¾R%„rÉâ%Ÿ]¤YèzÖYg-V…$Ù¸j!>¤îGì òÆou#ƒ¤Xb%>tq®cld32W*:!æ,œ´D_Û^½p‚d#ud¸vÛüÒ¢sE‡[<}‚ /,ü "QOQ©5iålEòê‡vëèT/ õÂ\‹!¢ô‰¨ØhSçÕç”B=¥p²•2Bê£H$µÌ¬ã‰ ©$‚ØdçA õ4=jw¤6ҟ裭…Œ±}¬šÕŽÆŽÝÆ,ßÇà;Vþ„}׋;Þ›PË_£ú`ß¾S¢n§þ®NŽìè)‹IDAT‚®¥õ)õ±®ïæ—9dþ„)/SÿÒ§[»0 ¢Ý\ñôÿÚ•ŠfÈ?Ž™ƒ¢î=ubnJ¡/°´9]}íÚ'½‹—>‹ Æ3žÑUdEûøAĵ1 ŽR`ñ³Ž?lî×vH1û¤æŠ<>ßõ†?­E›Æ%ì¦>Öõ]ú"‹¤Ò‚Å!Oêkk×¹lEû­]es_"ÌŽ@FÔÏŽYž‘$‰@"$‰@"$kiV¤ìxÍk^SËG.‰ÞC 3" Ò%uHN­J8õì¼óÎ%ß³ÈJ$ô–[nYŠ"¯,»ï¾{‰šj;ê§‹zwÝu×BrÓI²zõê(²X?}S7 bW”.ÂNÈqéeS‘ÂÉ0ºŠBüЇ>T¢BÕ%Õ|¿ÚC`‘¶ö!tÔ÷¾÷½¯DzJ;€t?÷ÜsËËKµ)7òé§Ÿ^êDd!ƼüÑ+ZR?-. t"EÊf›mV"ìEÕŠl‡¥>xé$âÝ j‘ˆp¸ ð‘Gˆ!åjyüã_Ú{Õ«^UÆH{ôµH/¸UžÞ^¦IäTŸ&¼x˜ !å]1ªOHK:û {ã-ºéˆüB¶·‹-]mÑNÆGŠd¯Åã;„5bOÄ»ê"â`lQ V¢ìW"ÆÁ\9å”S ´FäIkÁFÛwLkk9v4vü§µ;íØ|Çê`aÈ¢‹·zJ‚T¼”7tò côQ×a‡VˆXO§ïi¢N>À¼1ÌW‘Û¢Øgïæ°PåEËR¿ Øùõ±Ûêdº—ó£AÈóA}à„·2äÇÌÁ“N:©¤A3¿ø®X°ð¢Ö±â/l&æÿâåº0#°ªSMñ½[mµU9•žðñ.c&•ŒkÓ‰'žXã)…§Õã}|’qâ{õÙÂ_Ï÷ÒŸˆLÿâ¿XÆÒõ‡b¿æöÖ[o]® ¥à”?ü¾'*øŸ:k©k ›«LéÍǹ³íñ§pùáBª¦#ûã=!¾w×F<ý£LJ"\…@õi ‰@"$‰@"$‰@"°Î €DD,‰‚<ᄊ^Èi/”E¤ò*¢Ì¥ˆ¨pÇ&¹$ ©LD\KI„B”‹í˜¶ÕƒÄ’³C,|ðÁ…œ.;þóG.ud5©àЍN/qŒ(I„RQq>’ C謿Hb?Œä¯'¡líCâ Æ߈$yà},ˆÒŒöº‡rˆS ¡”Dh!¢‰´ ‡~øb}D¦¨u$¸ÍF»±Eˆ¿ìe/+äg؈{9¦¥ ‚ŠŽAÄÃé_§ƒÑ&ÂMZ‡m¶Ù&NïÜJmð¶xœ&"Õþx1âqÇWˆlÇÙBp,q'¢ß˜ÐYþl‚x}æ3Ÿ¹H¨ aí‹ l)¢ó·ÝvÛ²˜sê©§:Ü+s_¶æ…lñ†8%H¾XÜ©I¶¶žºþåØÑØñoÛå÷¾cu@¬"D‘‘žä1ŽR«°ñÀaŒ_ÒGßZÌ£þ¾~›ñô†Hr~+wúÎéÚܶÉ"2]Nò§>õ©‹Å[ÝÌWóŸ`çÈõÐ9o¿E­zžFeCþqÌäï<òÈÉÉ'Ÿ\ª¥Ÿq1^c…q=©I†/ é«ÇqDØð¥ñ²tû¦É\P·ùþ£ õ qk½ëá·ë6,ÜÄâMìýë_¿øø®ã7øæ”D XŠÀ ÎõßKw]{~YiLID HD Ha"%F<Önë#"¹ë#wóFm´¸Ù\GÞ ·2[ i³A¬·gKwƒXë;å§Õ£_H½VúÎA¬ZˆVÉK^RaQ†-Y…ä³È _¹ˆÕË.»¬ù-Ö¶9ô[éÑ•uù¾~?ý ’-ÎñrIQ’^BÚ%ÆÖX÷Eþ"ÃM`‡ îê|è‹]mÙ§_¤/­M98ã}ÑÚ¥—ªô¡?¿ë…„¾æ¼Œ2ô£wÖÎCÈ‹Lõ”EÈK_úÒ‰Ô!ȱvÌ¢Ì,[ã }8Œ©o¹vÔ7þˆ»À®¯}O'˜kCÒ…o}NŸuýó$‡',¦áNçi>I}ú„V­S|1lu³¸ ¯ù4A #V‰ö-\ŠÐn}fŸná÷k?`¡ï˜cŽ)ÄðvÛm·¤ù1þ±>¡ƒõ1ß-ÀZh¬_š=OÛiÛ[~³16dìbqºK/e,䏯¬dnwÕûD`Í# Í”k³ÿ£ÌõøŸêêÿ‘®y]²…D HD HD H2´¯`K÷•›VOÍí¹ÓΉ²È«šÀŠýõ)W“Lõ±Y¿ëo_ŸûúÑGô éäæqš´ýî#Š‚-)ØU[Λ¶oLê~ tÛÈÛ®úE #ØÉ½»°vžT&ž¶Î +­R}õêÕSÉâ.úö—®öûÊ÷í_®õ¿”*õ{(ºÚEž!ê‡ú×§Cݦþ Õ£ürÇÛ¹³ôÎ!}cxá….yÄU¥—þ•(ˆzíGîþ¥¥úuãW·X(”6ÊB¤…·¡ŸŽñCe,$¶ã2OÛiqX~³±±vÖb³.èŸ:$‰ÀÊH¢~eøåÙ‰@"$‰@"$‰@"ð?„€T1ušƒ®®#]E½×$pW¹Ü·n!`$Ò­ Í^ùÊW–õ—\rIy?€¼ÙR{H‰smyÚSV†€t[>kKä6—¿\.sižºÆøÇ•ê›¶³RóüD X—Ø`ÓL}³.Oê–$‰@"$‰@"¬"õ­T ¶>¢Z¥?ˆk3õÍZéx6’$‰@"$‰@"¬EúRßü÷5ÍkQ™l*HD HD HD HD HD H®B ‰ú´„D HD HD HD HD HDàD ‰úkül:HD HD HD HD HD H¢>m HD HD HD HD HD ¸H¢þ?›ND HD HD HD HD H’¨OHD HD HD HD HD H®A’¨¿ÁϦD HD HD HD HD H$êÓD HD HD`Cà[ßúÖäÔSOüýç=ï™\rÉ%뜎­B¡ó?ÿùÏöÐ\ÿð‡?,Øüå/™K½kKïZÙ~ô£“óÏ?¿ÞU¾¿ï}ï›|å+_™üú׿.}üÍo~sµ2³î8ýôÓ'Ú#ÿþ÷¿GÙÓg?ûÙÉÙgŸ}µ¦.ºè¢É¹çž{µýóÞÑ×þP;þóŸ'¿úÕ¯&ÿú׿–Õo˜~á _˜|ç;ß™üío[r|èGÔû‡?üa¨èâñ¯}ík“÷¾÷½½¿¬/ú{饗NþøÇ?vÖ®?_üâ'ßþö·'ýë_;ËÌ2/`Î_…½¶}ïl``çrm` ÚIØ0¡óo½øâ‹'§œrÊd9~-ÚÒiìñyà:¶­•–3Ç̵ÏþóÅÙd—\~ùåÅÙÐú*|%{—ÀäïxÇäë_ÿú¼ªÌzÖ kÊWÍ¢ò¼mp–¶ç]vÃyW˜õ%‰@"$‰@"$‰@"°R¾üå/rã:×¹Îä?øÁä6·¹ÍdóÍ7_iµkôüÐyà ×ìm2Yÿüc.ýY[z×Ê~ï{ß›Üà7¨w’±u·»Ýmòûßÿ¾ô±D]râ”ÈEmÝèF7š<ò‘œ\yå•£ì ¾¿øÅ/®Vó§?ýéÉ=îq«íŸ÷޾ö§µó¹Ï}nrÞyç•"ÏzÖ³&7¿ùÍËw¼ûÝïžüìg?[<}ƒ 6˜<éIOš¬Zµjq_ß„ãqÇWˆÚ;ßùΓ}÷Ý·¯è’ýðûþ÷¿¿¸¯ý½x`Î_ó§vZY˜`K{î¹çb °8ùä“'¿üå/÷Áâ OxÂd³Í6[ÜçË,óB›Æ,ìu}]Ž ,é@ϰaD½6n{ÛÛNõ­3àuÅWLnr“›ôÔÚ½;Úê>:ûÞyà:{«³ŸñÝï~wb°sm=ö˜Ôׇ3Î8£,E9øpÀ“6Ú(v-Ù"ó¿úÕ¯NüàÏ<K*j~X¿à‚ ʸÓîÔþiQK|à‡ (ñ»ßýnòóŸÿ¼ø°{ßûÞ#ÎÈ"³ 0o;ZS¾j–>ÍÛgi{Þe×ìóÖ6ëKD HD Hÿ ~úÓŸNîz×»®W}]u𺢷›}‚КWt)Rìàƒž\÷º×-u¯äD?ýÖ%AŸy晓ÿøÇ“ë_ÿú…0«õó’þQzÔdË-·,¶idöa‡6ATO“|ä#…¤*7­ŽµuLtøYg5±À×%2Î{ØÃ&[mµÕDä.Þÿþ÷O9ä² 
ç­+ó"ô™Çv96ü”§¢›-<Üìf7«O+QäÚµŒÞd“M®FœŠ.š.¡2–ŽÚõ­/!Ñw‘šêB"!ˆ–«7‚S´4\nw»Û•(x$qHè£o×»ÞõÊXl¼ñÆq¸sk nxÃìk¢>Ɔ÷¼ç=—Œ)òËX vBŒ¿´'÷¿ÿýË.z²ƒÖž¢<]Ù–h~}¸ï}léAŒSÈ4Œ;®ÇLêíx:$lslûÈxJÛKcL_íüä'?™ì²Ë.邵è¿1 <îp‡;Lô M>ñ‰OÒúö·¿}]|Éw}þæ7¿YˆmXÏKŒÂÜü«#ŒµwÙe—]Ù¦ùÎNõAy$"0_D¼ÃߨÇSRÁFý›Þô¦%êšG_úÒ—Êx ê‰9âɪö{²Å8Á(| lã©çÓC½|£~’Ö¶¨ Ì×Jö¸«£o~86VŒ©k$ÛŠki×u–ì°ëÿ.ŒŒ¼ÌŘ—úÇŸ[<´¨¶]Sè¯Í1×ú¾ù¹R;ë«Bß¾mÌ¥öå‡l¹ ߺ×Ax››õØYÈu½‰ëE}κô=‰úui4R—D HD HD X$#ÜƸáÿÌg>SnÊÝ("#÷ÙgŸÉ]îr—RÄÍîñÇ_n€ÝèŠ|&O~ò“‰+Ñ´nÒÈ%e¤fØb‹- éç]xá…©C‚è–gù“Ÿüd©ÏÇúЇN¶Ûn»Å}A „Î]ú vß}÷Å›Äüã¥}•8¦ ©Eô+È7–øÀJ;öÑáV <¤ópó®ŒßçœsÎ|ô]Ÿ#¯="&Pu­úÖcôF¦ÈmM¢=7Ûûï¿!^´ÿÎw¾³¤.ˆãÆkÛm·¬^½ºœ×õùãÇé0‰±Ñ¯§?ý鋈7~5Q/RÕ'ˆimßò–·ì%êE±" }wI…À‹…e‡p@x!©j¢Ù¬Oˆó êÇ´/ÏuäÙLozÓ›N<ðÀR×ATH ùÄ[1.-‰i“ô§O!¢Óì}žDýŸþô§‚ƒ9Vã£=Ĩ±C !GÄ yaþ²5ù‘övÚi§B2¨•XüyÈCRˆ ©5´oÎ=æ1YR¼ÏÓðoS8-©hä16Ào½ùÍo^ôqúh}æ3ŸYl)r„?÷¹Ï]lùC‘ݤ¶aãKôµö­ìÇwœl³Í6å8üáD½úÂ*` Î_xßÂù>÷¹O9ÇŸº-¿ç5_Ôu '”Å8ãllø>]2ä×ÂÖ‚£þ˜[êÕwãï>Ðw>Y¿õŸð­ÈØÚ‡Äõ(æ¢Ö1[Ûª…+D=?ÑõÆ uþŽˆz2t2á³Í/˜ñ‹ü‡¶ô˜æ ÿTë_.üa3o}ë[Kj2zë?Ÿc[‹ö¤Û"ʱ/ Kûí·_ñEp´˜˜)§rveUHÑç=ïye!ÌØ¸öñÝÆŒ}}ôÑeî!Çû„oç«wØa‡òTŸ-b4ÄØ¸XHä;¥æ›EU×7×¾Ì"’2ó6ã黸îG®—q͵ØàÕ'ãסùuMÛª‹mÇ?d> ·ý¯Á/ am1ž¢ —9K·®q+÷œç<§\GàÚ÷Ï _øÂ‚C¼Ãæleìµ~Úü\‰õUÓ0®µÿ“ŒÁwŒ Öm¬ß»·­=ID HD HDàZAÔY-Èm$=qÃìfqKÜ#‹Üyœ|ô£]„V)K‚ÔóIO€ÙE<ÞNÜÜ"¼|„yR댼}b!ÁM¶E:‰®DJd"Y!r#¢yEŒwÞy1ºŸÎ´ZÔ¥_ÔƒDë\‘y!ÊÀH™Yõ®u ~ˆ „>ú+ [["PwÛm·B„#Xº$° B1Ê úã c¤Ndbéç̲v‘ô"ú!àZ¢­]HƒÃ=Æ´Cä¡hTöçw`QâcÚR†m"w¥Ja{}Â~= º5ðï* 6T/Cg½þ)‡¤"m* Äa̽˜³æ Œ“ù _ì}šù‰4”–äùÏ~!<Í?Dr+¤žóÀž5F¾‡c0âïÌñðƒ¶°âßôQO¤d!0±€§Œ>´6\ -ü=~©þ iÖ'üo¼Ü;|jè᜶­y͗ЇŽüœ~±QįÜ.ãã¼G<â‹Ü®%p‰¾D6‡„eÃl'æQøô([ãaaŒ½Çµ'ˆm}¨%üNDì×Çú¾½F™óž`+®¯:ê…•¾úëý®M|»Å4؇o¯Ë°m×8s,ƈݲ™¸¶ùͦÙs¯Õõù}DÒÈ?@ø»®˜cæG[g×o¶Ïnèo‘F_Hè1íÿv(eM\éKï.a»q ¿ç…¼aíÿ#c¯õCó³K—1}㫺êîÛ×þO2F‡16Ø×Þú²?S߬/#•z&‰@"$‰@"$ÿ `Ý ¯j"«ÛÈN©Läñ%AV¶„"Š ¢>Qéµ êÈiÄK-A¾¼á o¨w—ïA‚·:÷éã¦Ô‡Ð 7äeçÂdtÑ¿6‚ß¡Ò%¢3_÷º×E5‹[:ÆâEÛ÷åè‹­N±°BdDI‡"Àv_³EŒÀ TKŒaì‹vaƒŒX©À§6h \µE,$ á0V¯±í³©xR!ˆí±mD9ç‰~5îò²ëcŸˆÚE†Yð™&"Ž[Ab!‘ r1#g؆[ó´M%cñ*$zê2ô#‘jH»2b!Í"`óÎþv^(·RüáþB} ÷øÇ?¾ì²Á8áëÓ [å[–Ò·x"žDIkÃe矘Sñ;-£ÍØ_o»ü±ãlŒmÛš×| Z¿à·E Ñܵm(?Æ÷Õ  ;êg>"„ß2~ÑNì·eÛ¢ýKúö‡Ÿ ÂÞ>âÉÒâ[vöü‰kÁ´k” t¢èéN©bá«§ê%»ùvs£õãúRã×-‹_­8ŸxòÀ9žD€+›5-Èw‰:»æHè„ÿÐü誻Þg®˜7­/5qÞéAß!¬adÁ/l%êæ{< ÕJíÛâÚ~cŸóf¹Ö·öãÿ%ó³ühþ õMñ±×«¦êÞŸíÿ$C:ŒµÁÞדã®hëIgRÍD HD HD X¿’:"Çô&ˆv$M-ñI¼\A#Ú” nxÝÄ“Vç¸ùùÝ'tŠTuçÄ:b‚ØuúdŽïA 9j"Hd ›Z„LŸ,Gï ;éPëmЇ.c¢ªã[Q˜± Pïo1Œ¾×cùŸã¼ˆŠŒßÓ¶l§kÂvâ\Q’úÈq^«Kû{Lû°”ZDYÑ–ˆT6"ÇXa §Ÿ~z!ZØI,8tpDÈhÏ9!°G¸"ü¤J¡‡ÔmŸ‚„Œóú¶l“#ë᪟A*÷³’ýAVÅ6ê b:ì óÀæ­-ÇücÊãVGóCì20wÌ¢€Å2ä§ý±8ØÚpô¿ÕËB)©çX”»mÛš×|‰öÍOý‰>´ ŸŽ‡ŸŠ2qN×¶÷qNàïœÀ§>ß¾/ä±ù9ÎÃW;/l0žªˆºâwŒY쟶s²ˆ-}”d¨§WöžÖŠ'1¦µáý‘ÍI”7ÿù—ÀÊ"_½˜ÊV#cç‰ ©ÜD«K©b‘¬Æ(ê³5Wb~Öûã{´94?¢|ß–~ž^âéÄ7IÆŠŠšþt§w¢>¤½Nű±ÛÀv³^ëÇ´1Ô7uÀ¹kÚy3¦½®2C:ŒµÁ¨»Å¼¶Ó(³.n»Ÿ;\5MD HD HDàZ@KðŒé0ÂÜ \¼h/ΑW–tÀQfh‘qˆÄÈA,êO{qÓ×ꬬãÚ2Ú‘J¹Š°@Rº‰ŒHSÇ¡ˆH׺_ê‹ÇÃCÔåæ}ÕB4rè}Žä«J.ý»½Fä×—"…D·"'B´¥ï]Q§Ò< èßJÝwÇâw`ƒ°ÖV-0+êiÇ>u4½º¤Œˆ¾ûß§á€Ô€A,.8¯o¿Ç´oƒ¬^½º¼\QiWtf)ÔóÇ{<µ Ú=R–´EÙ%)Dò" ãcœ}àeKŒµ>ÔŸ ãJ?ˆeĸüúÔQ;PͨÃ1f­} äHoçÅ<ðçjŒ|Ó÷! 
÷˜ã¶ö;7æ¹hg¾Ç;ØlMê·6 éo¼\ؾvŽE¹Y¶m[íJçKè:úÍnÐðÔw§X@ãËI âš¿£š8çêÔ4æ4ê'º™ïz׻ʔ÷´ üµhjiqjâ_$7‰k@ÝÐɶ&ËÛi×(øÐɸE6ioHØ~ù±ð§‹p ¿ Œ§p,@Ôå¼Ï¥iˆHk«lûBئ¾Ë»Î†k[2±ë%[ á[EíÃpìüˆsû¶®Oü¤±”ZçÙÏ~vÃX¼ƒµ÷¨ÇÓ~Þ53Yîµ¾¯íYíhŒ¯êkkÌþ1øŽ±Á¸Å“Úf³uz3ûÚùåwm£íoç¬ ùïäÚh-ÛHD HD Hõä"ÈÍ4ñ;‘ywÆUÌcë¦Ëð€B„{±"² ç…›šd[g”“ÂÃK_½PÎ;ráŽa熾Kg‘Ï^žzÚi§}ä¨ö‰¤‘V½Èkù¶9ŽÃxÛm·-Í»)G¾x"M?ÜüGú€ÐÑcùúMGí"¨¼TòòË/Ÿxy`Ÿ,GoÑÇ0¥«vüFÔX<€‘G"òZzºÓÇM/B¥ÄYÕAÔ‹þDrÈCŽlÒ27ˆ/Äý¢ˆ•¡KK²—Ê{þˆâÔ—d±q©oÖÝÜ· cp`Çôaì“^mÚ’1íH/ýFP¿ð"Ù±bE:óĹú"õ‹úãE†ñ2f/ n娣Ž*6¸ï¾û¶‡–ý9çåŒ^D:-¢vÙ T'š?|²Ž÷¼ç=K *³UÂþÚy±Rü+:¿Ž±c'ç5ûñòR‹5¢¯/¼ðÂ’ã:Ò‘ð-¾‡E ¢.eØ„Fl”o1HUÑÉË‘®¶æ5_B6Mo‹I%|—7>äØc- dic=äã<þUºsIÏÏGŽÿ(Ã7ðmò´ó×|›ñ‰ÈtžtÒIŇXLaÛ!®£|"±hæÅØ|nqƒ¿cqÍjû×FmòIÆwÌ5Ê‹\Ù ŒA®Ç;T<ñ'Oaˆög_öµ~ž"ñ÷»Æ°—ZødmHÕå¸k/…ùv¥/ìŒ?²Èâœ>‰9—ºÎ#ãΧú=v~ôÕû‘þôôdÿ ~ï(œÆ`í)céšäÅôlD?g]\ êír¯õu¾/׎b¦]/Û¶fù=ß16(ï?_háÍÿll«µSzÕó‹My¡²…~Ã\uͳ=øàƒ×Øÿ¼l×'Ä÷$êÜ&‰@"$‰@"$‰À5Š‚á7òÓ”©ol”ó˜½bô‹t|Õñ‹LoËN«·=†@ðϳÏ>{rÞyç•Ãn¸näût^½ýúD4žH;/¿#n$£^„qCé%‰ô&n½¤9ä*âÑc± $}ò9@Ô%ÝB¤œˆ²±]®Þ°DÔžqÆ… DZÙ‡äóâWbüPHÄsÎ9§ì£Ï^{íµ˜®¢$Lƒœ*'üçÏ;ìPúŠ@rŽºåÑÒÁ惸CäG®Û(×·5¾{î¹g!¤bœoA!"¯-Ú.é2„LD€ÒEÔ-âÀ>$ª±%cÚ×'ö$òI«m¿CßRQóG™ˆb6Vç7¿µ"¢ëòqÞÚØJKbáÖmÚ›.ºöõé ã¶|¼çCÌ©}öÙ§•]ób9ø÷éÔµŒ 8ÏKŽ-Žñq‘Þb^û¢J8"¨ø!é²ár`áòMpøóPª/Ä-i1,;›?aÏÊvµeÿ<æK4Ëÿ"ŸÙ´yeqÓBFH­}æË4çÉåna”mЙ›‰2p…MíÛøuvBøûˆB¿çª3ˆzä/’ùâ‹/. ­Ê¬Zðýõ‚CÛmXˆ1þÆKþô¡k”z'ß3B¾(t±.|Š2Ú}¯\-|>™Îrï¿kÔ9ü,[, °¢ùk±¸á)‚Xä¨Õßù ×KO…3î®—ÈØùQ×Û~÷nï ¨ýƒñö. 2kO$y-¿ïiXZ 8òÈ#Ûæ:ǘw\ε^=QgŒçríh¬¯êÒ}̾1øŽ±Amy‹ÿâ)Ø ,”‰ï‹ßñ‰}Ê­mÙ`áŸÿ>{²¶[_ÃíÅ…w 7“Õ'‰@"$‰@"$ë=[¤†ïnl|÷A>x¤ØÖGVDƼÑŠ|8à€J¤örëE¶!¨ç}£å1}xÔä÷‡ôQ'Î ~ŸhæA¼õ•‹4+ñèw_¹yèÍ>DLF讶胄Ò¯«Œ¨5·ú†¢ž‘3]2Ÿ®scŸE"õ·º"‡D ~øáQtÉvã&úuÈ&ûÚÆ¢6C›Ç– ®‰zç¡Û¼ë`—¢e-ÈÔseÚ¼XÓøëã Có!ÊÅvȆ•3W£¾iÛ¡¶Ç>¿1v¾L«Ç1Òu èòÇR/‰~–ŸœM k-.¶çqÄea‰¯êºLÃ¥ï]/½ôÒ²`Ôúž¾~Àȱúe£êÒ)°íÃ^®Gµ}~¡ Ƕêëò-;ö÷_ß΋G]i×ê6;2—ŒÑ»k>ÄËo-|„o±xã…Õöyâl2öZ?­­åÚ‘:»|•'ÛÔ|mûlÖ"Ñôá[Ÿ7Æý¢ÍÚ¦£ãLbžýŽóæµõ…'XüßáCOÿ/uÿ—3¯V³žD HD HD HF"‘ÕHÛ•H ±’:ëF¯½Ù£ó>-ÙÒ¥gWÛ]å‚è:VÞnn‡ú6¤›h7ü«þóA­cý}¨±øÔu¶ßûADÅGÞåö¿‡pp>¤¿zúÚwŒ µsU©åÿý_!é!Ä.#ýCØ´y±¦ñ§Ç „®cì)ÊÚÙ°2CsU™12ÔÖŽcçË´zë“!ìœë‰§!™‡Ïцö"½RÛf_?`Ô%C: a«=uÔÒç†pTÇX,ëö†¾õÑù­nž*òTÖ4áヨ£wŸÜ K´éÂ`*DÖ#`·Øb‹i*Ìtlóu¹vDÑ._åÉ(©Ä¦ lÇõ}øÖu·ã\‹ïu`Eì‹-]jú]—]“ß»g÷šl1ëND HD HõöFfžÍ$ Ãú$ë£Îð]Wô–7YþXiÖEqˆ ¶!åÚÀº2/æ‰ôÚ´áµÙÖ<1=Ï -š"sE^§¬_H]ékÖ´æÈ~iW¤ÉϦ¤k‘VGj—k³¸Næµry#\ÿo½Á O¦¾YŽyV"$‰@"$‰@"p­AAâ»ÇÝ}ÚÔ7¢¨1>RGÌ3õM´ŸÛD HD HDàÚŠ€Ô7þ<ÙO^\çÚÚáìW"$‰@"$‰@"ŒG ŽæiÏòè»ãñiçïD HD HD ˜øÿÚÿÛÿ“¶]Ep£²|IEND®B`‚cccl-2.5.0/.devcontainer/img/container_list.png000066400000000000000000004703071463375617100215100ustar00rootroot00000000000000‰PNG  IHDR5œ^ê)gAMA± üa cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“caNv@ì,Y7”€IDATxÚìw|Çùÿg÷zÕI'étê'N½÷ŠQDïÕÇ ãšØ?—oljkâØ‰ãc0Ø€)¦÷" „PïíÔ{».]ÓÝþþX±,’’°@†y¿xÙºÙÙÝ™gfw?ûÌ3³bç÷@H ·þÿ–m`´ŒãIýM¢ïpê@ äŽ`̊ݺvë!0l¢çG°ú2döšåÇÉX,ØQ!y˜@Æ› ù7rã?·¯ë Ûû€a¢k„2.˰‡N¤a¥*…@ äáeä£4u…`C†¶"¸@B‚Ë'lüšlÔ[Ïw« #+3 -ÛCÒFPŸA òƒ‹’jBHZˆjC‰¸JCF’hØä•t¢aâl˜2#»Í°Q ü ·Ôg@À(* Yœaw%Ѩ·œo˜8»E™‘ö°´Ôg@È* gv#í.$õÖsgv§±Î‡¡¬°'B òÐ3LWa7§`¤,Øhm¤ŒêÐ Fg7GŽx>lS ÿ @ È-*lÈ9ø|º!Ñn µq¹Ðn0Š8»‘B¡åAZ× @ d’Œ— ­²1…†'²l4b}.1Œ4š‰ÿ €F›ôQ@ BA¡£F¦!·¨4p#ó˜ÊŠ:âð„ÿ ” ¨Thy@ ;€ë0Üy°á*ô㎠·:ϰ›?Š3@ q`½ñ× IusÓDÂ÷Qò~¤ÿcÕM @ 20ì¦Ã `·Lµ`œ«a 7ýlÜg*Œ9ƒ@ ™87¢ù10|Ùñ¹ÐÐ[²’g( ­ @ 21†d6šï':<÷Í¥=(ÐÂ@ wÁ¨CœwfDüÛ„@ ™ ØçvëVüÿ·UlÔ[÷$VÖ€+æC LŠ›ß+¿ñ¨ ‚Žy\@ ȽfØ÷ÑÇ‚â"™.ŽœC@ ä!³Z mݪkù· ÄF_vä‡nÃd–ŸÄEZQj×ÕrÌb@ ä¡¡Px± .R•7Žì#Ù8:‰1Q¦‹£¶¡Š3@ !˜Å¢mè`º8NÝ)n ß'‚¢PœA yhÁ,–;EyÝU,? 
ƒ@ ™R&¬Õ >ƒ@ ™^ p!Z@ )g"N4è?ƒ@ ™j&6Ä õ@ Èô‚:mK&ðÅ„\¯i(mlûwat°Ÿ»ë|öCҙ¼Ü"¥¿\+êÕèîñ©·ÌMjíQœ)¬¸¿pÚFúx¸l*Š^¯ihéQ¬LŒ,ilË­i˜Vå„@ È4gªô™€Ã •¸‰…f´ MÝ*McW_kŸò~UÕY(H Ê®ª¯hn‡ ?‘>a·ƒÙ…Jm?´Æøá0iáþz£©´±Íb±v*5Ð&™S¢Ï¼ô¨`*Šöiû;úT(ŠòÙL™«“€Ë¹ú 2ÕÔ¶ww(Ôê~ý½?õñë%ÆÁÁû[}±­ Esj»z‡®.=‘WªÓa߀@ È„˜}ëëEEÑŒòÚÚ¶."‘Í ;Øð Å`ú Æ~ÃýÑ"]ªûï¬b3é½ÑD¤ Z­ 5ì™(S¢Ïìù\½y,ÎFSSw9…Í ‡{»»9رè4ƒÉÜÖ§Ì—7ý€wàsC½Ý|…¢3ê;{Šê[-V+‘BAƒÜ½ÄŽ6læ Õª0ÈÛº*Z:"¼ÝýÝñ~^ñ~^€~£é§Ë×G=‹Ddï-vò¹,ÝlìRiòëšï8ØÇc1c}%Îv  uNUýÈ< òt‘Šxl–ÅbéVi ë[ºU‚nœ׫é?v½˜¼‹Ï]VÙÚyµB>d^ˆÄÕÉ–O§PL¦N…º°¡EsÃk%äqÂ¥N>•JÑ èkZ»Ê›Ûˆï·º9ØÍ È*¯ÕM‘R;Çd”·wçÕ6Z1 °0:ØÉÖ°">ߥ®£ûRiÍxÌ2,þ,F& ötÙŸ•çæ`àîÌe0´cICKÍ­c ›hÊ£³ãë:º‹ê[¢}½œ<€ #Cú†ÅuqYÌpo7g[6ƒaÔŒòönò¸¶§HìéjÇecQhû‹ZZzÄÖ¥qa,ý@V~”§ÄÉžN£öit9ÕõÝ*í¨MϤQ7ÎŒÃÿ^2l+9þl$“¸ òÀ3%úÌh¤Q)4*Å/ÌËÝÉÖæD^†a ]âÀçöiû+š;((jÇcyºV´t4võ¢(*q•·w·õ©æÛ¢*Fæ©7¶÷© æA“î)²wÚþ’]¨¸íÈ›A_ʤÓ:{5N¶6 cBŒæ[FÜ((š,²å·ö*[z•t ÅÃQ¸0:ølaEk¯Òl±6÷($N&ƒl/'{@}GþS"²O ñµbXcW_¿ÁÈa2\ìmÛ•j\Ÿ‰üô¨ € õÝz“ÙÍÞ.ÖWboýTRM.‰«ƒ»½]kŸ²F£s ‚=]ÁeQ}‹¿»ÅÃÁ.¯¶ /†Fo˜´YQ>žÎv‚Ö^E§ó “}Œ&sÓ 14¶Mˆƒp˜ŒE±¡ýzcSwƒF»ûÑ©”%±¡tµ±«W;ÐM§Ñì¸l©³#¡Ï‚<\b}%FSu[€Dì87<àJe]UKqAæF²´¦î>&æá(œ|àJþ¨Ól±^.­qs°ór²/¬¿©•Y ZŒL2FQ'q@ äa`JôY}gO »ó¢èòæö…Z{ãO&%È׊a?_É'¢s¼…Ñ!‘R÷Œ²Ú‘ù4ÚŒ Y§R}¦ ‚p˜áZ©Ø¡¶½éíîÀç–5·“}Wl Ð 0ªP‰kF'ïè»üG¯— F©œæ%qáaÞî—K«o·K”‹N»\V#o:xR ¯‹ÈDRa^®6ÜÓyeD^nm㲸°é¾Ì\@]GDdïådOž²êåäÐo4u*Õ&–äc´É)ÖÜPE(‚ÐiCí˜胢è±ëŸ§'¯¶i~d·“C}go3Éyéá(<‘[‚°Ó(”‰~®Nyµ‹µ­Oådgãá`×Ü«æ2œ„Y|îÁ«øŽåÍíËâÃ=]}vG›à8ÙÚ”6µ]¯nO÷sڲ贬òÚj’£Ž0—Ɉ–yê Æ_² æA@qCë²øð8?¯¦®^½ÉŒgcÑi=Ý©«exg pwŽ÷󒹈 ëšGžÑbµÊ;º9L†—“}[¯’lpXc볉^yH˜’õÏòjåíݶÆ$ðg‡ø¹:Õwöv(TxQQC˨³ÛF},áÆ ÅU#—QÀ]#t LÊ[6 <’ A‘Šæv…¶ß<8ˆa ÔËÍÉ–»G&Bn èM7õƒN4÷*Ë›FYnWqCÄÔuöú8‹¼œŠêšàn¹¾˽Òð:F¯#Jè#Ša6Ó©”[-vKÜχ ¿²YF?É­5N›à– Læc9Å‘>žžŽBo'@·Z{­ª¾G­Ð)€þÖÐ@üød…jÑͬ†üª«ÄU@ ‡„{ôý€.•¦´¡%F&Ûò;*óà ÀjÅÚûTã<¾‹ÞdºÝ²Ÿ¸À£ÍîƒF=WTIžmæí6VÙ,“> ³Œ³Mïh˜ XQõëÏURPÔÁ†çá(ôwÏ øùJ¾Ñ<ˆG²hTòBxK™îí j“¸ òp﾿‰;i¨  G£x: d¼‰^µ 9Ü.ƒVo4:øTtôJáø;žËdp_ ƒFµãrÆØE3`´XmøäD6ƒÎaÒÉyŒƒƒŽÞØ Ã@Cg€Í²ã²‡ nvs¾MÈ>’(ÜR Š" í>¸4d¨»6˧M&ÅjíTªsªëKZ™tšŸPhú"Û[lˆí/aW@¦DŸÅúyÙ°Yä6ƒàî èÑh}]·ZkÇã{º³1iTW¡í¨Ç”wt›-V?7§aúCÀe ù\†a5­,5Ræ9ìÔøF³Àa2Æ.¼NoÃÞ£|npÐÂf2œ… ‚´)T]CÀË¥Õ £C¢}<=…J5€-—# ê;{Fý”Ád¾\Z=+ÄoaLHsB¥í§R)޾Ÿ›Q^‹GhÈ›lm‚ÜŶ6í Š ¶\ÅÄWjPêôæA?' 
‚êM&³ÅRÞ4ʇ8å=¡·ä ™‹PÐo49Û ølf§RãdË£ÊyµM®öv)A2W{[Í€AlËç³YJ݇uSåÈ›<»UIQ-= ³ÅÂc1]„³ÅÒpã‹@€.•Fg0¸‹Q©ëhf„Ìryjˆïò„p|ý36“îb'È“7á«g–×Î ^R×Ùc0™]ííì¸ì†®¾&Òâ«w‡Š÷ó øƒ‹RÛßÔ£˜œYîÈ8m2Œ7±“­M§B=2æàáhîåÞ®P©ôƒƒ!ŸëjoÛ£Öv)ÕÞ/oŠöñ\‘ÞÐÕ‹DâdÏ¢Ó®VÖéï:xq¢Œç*ˆóõ2Y,•¤an<ðL‰>;‘[âî ÛÙ8Û 4ªÙbéQi:{ªZ;‰¨gÍ€ápvaˆÄÍÝÁ6ÐÃeÐbé×+›;jÚ»nwئî¾_®†x¹‰mmÜìmæAÍ€>¯¶©õ†ø´ZOä–K\½œÜ-ê}ic+¾ð E•Ñ2O™«ˆŠ¢ýFÓ¨ú¬ß`<‘Wã+ñt´·bX§R}¹´&Rê1v•Œ¦c×Kb|%îB€aJõ±ë%)A>d}f±ZOæ–ú»‹¥bG™«0`0¶ô*G:·ê;{B<]‚ŒÔ(õ=:ƒ!Tâææ`K¥Pôc›BÕuc6b§Rs4§8ÂÛÝÃQH¥P´úë5 eMû¼BÛ¥Bäáìé‚"H]GwSbrf¹#ã·ÉøiéQp˜ ‘€/ð‚è †üºæòÆV"†­¤¡U;`òtñuuôiú³*äÍ÷Öy6é«@ ÖT…$/6ôÜøƒyÛÕ°Äku\.‚¶ƒL—NŒ [æ$6u÷+ª„Ö€@ ȽA<#¬cß±Ûl´ äæ?€ÿ 7ÃáG AÆö~†> Fƒƒ@ ™^@}@ 2½€ú @ dzõ@ Èôê3@ éÔg@ Ó ¨Ï @¦“ú¾•ܽ# Ñ¡!³s„Í@ ÀT颉ë3s@ ‚35ºˆú[3@ ÈèHEöjm¿ÑôxJ ŸÅ$o2-(oU¨~ú @ ȃE½ú”B7þÑ¥-‡_y†ÍÂ7±è4Aþ¾aé’|}¡¼vú×å·ºþÙ¹7·êþ÷ì‹@p,Vë£ÿýÁßEtö­ò®^—mæny ÿWÙÖu0·¤°±õø«O- ˜þu¹úì™Ù‰ØîO°ÝŸ¼·vÑÈ­5ÿx ßjÏãLOÙó8Gÿðdz¨ÿƒ×•÷<÷èss’îožŸ—²ëÙG<Û¾±$í‹-kî͹<ìíŽþáÉ?ïémùõ‘8ß]³p÷s›~ÜöèÛ}9=µ©Wíå~úõg†åÔ ó?ü"»¶ñÐËO¬Š æõºwþ³A‹õ‘Ä(Aȉ 2‰“àÅ:Ñ£½´óМ÷?‡ý’̪˜Ð(/·éY¶ÿ=µ~SRôo×¶¾bÇ ‘ Œ€@ iD’¯Wɯ==;ÁÅÖÆžÇÅÿ1i4—Á™¿ßhZø÷¯.VÔþôüæG“§õSéÞ=oΖU§‡ú§úK/VÜ÷}4)Z=`(kíH”I&t´²ÖØ/‡±*6ìR…<¯¾eü»lþb·æº`žv|n^Cóo×¶~΢õ Ç ËæÁqîòѱ ·¾ŒL!Í}Ê•ŸüÏ<ñ÷ùí‚ ÈŽ§7\¯k^ºýÁˆ'ú89\ú¿m=Šù~1ê^z“yÉ?¾9øÒ–OoÈ«o©h뜞µ»wúìH~Y¼ç£ÉÑ„>cШkâÂöçIíÈ9™4ÚÖ9‰‹#‚üÄŽv\N§Zs´ üO?ŸPèˆ<çÞÜ'õäny ÿùñÆ¥¿_0Óç•¿- x~^Š»Ð¶±GñÁÑsß]Î=x ÌË™E§éMCë)GIÜ´zcU{÷Ce[‹uªÔ‚ t*…¬1 3 Z~v@ _ oG¡·Èþ¥]‡qFAÑ£xrÐjùî¿›û”Ãò/‰*zÿUüo:…Š Hj€ê3`0›÷ç­‹xL&Àâð@[{gæõ?¯œOÎé$à}°nñ¥ ùÁÜ’“)ÄÝykZbŠŸwÌ·Ìc}Bá½µ‹fÊNWfTÕ­ˆùßSëº_òK'ZT6óÉ™ 1ÞîV Ë‘7Ê+™aCBdœÔ“ÏböhugJªä–`&`³¾vãÁë%ßg^'2óYÌÏn<”[Š'RPtidÐì@™Ø–¯7™å½»¯æÕtôY4?ÔÏɆ¯5®É›veæjot»¿¬Lç³™_œ»²%5V*r0šÎ”Vÿp%Ã0&¶ÿÅÇ Ãð°ÇÜúæwžvàsWF‡†z8;𸫵²½kgfn}wQ¶=Ï=z¥¦þ?g³«bB7§Ä<óí¾ ‰‘‘7 ŠæÕ7ÿçlÑïžv$F¹‰iJS¯bïµÂy¾ ß}ÛŽ#CÜ̓–Mÿýßãí‘×ÐŒÝpÔ¥øy¯OˆÙðÚ•š=Wó‡Ù–‚¢Ë¢‚Ó‚dN6|ÁxMÞø}f.^†gf'Î ñ}ä??àýçÉ™ñéaþ›>ÿ¡ßhÂK¸>>"ØÍ™A£öhtÙµD[¸ØÚlN‰ qwYx_±ãÇ—¾äœTd?;PÆc1ä½ÿ=w¥¡§°eFìòèÀîç6áù_ÞuHÞÕ;+Ðgf€‡½-ÉìÑê.”×î»VHø#ßX’æao÷Ìÿöá?¾¼åXAyS¯reL¨XÀïÖhweåeU×§âÄî]íâð ‘ ïë‹Ù'Š*ˆ}=ìíþýØÊ¿»QUà0è"ã|93~YT0þT~49¦¡§/«¦aX¶N•V¼õO}º~"ecbä[7mLŒüöÒµ1Žíåòú‡mJ5àÓSïýá¥ôÕg}gÕw¡íÑ‚²Nµ6ÞÇó÷ f’3pô¿oXÊe0N•Tvkt2'‡G“c„<Η篪ô¥ÍÉ~^d}–(“PPp"òÿÏŽ÷ñÌ­o>URIE)®"g'\Ÿ=ž³<:$¿¡åxa…‹Í‚°?±ãïwÿBx¹xÜ×ÏÞs%ÿ›¾k±RµqájÍÙÒjÓàà›{ýiÅü¢¦Ö#ùe\Õy;Ú‡y¸d×6vi´6+=Ôÿƒu‹ŸÛñsFw»êÿyåü 嵿䗺ÙÙ>“–øÔ¬øí'.á›db‡÷Ö,êÖèöçÌæx©ä­¥s?4ö(Š›ÛÞ^9¿¬µãz]3†a‘·’ævíh¯”ïýr–ü“I£=3;q:×îžê³¬š†úî¾G“£÷\Í·çqæ‡ú}pä6"þÉ`6ã~2*µa±hTÊÅ ¹Þdžà3¶>{ï—³Äc¦¸¹-·®9ÂÓu¢…œ õrþëÔåse5€“Å•[½€œa}B„-‡ýâ΃ø“õTqe§ZûHbÔ¡Ü’nîr•ü…y)¾bÇꎡá¼?ï–>î‰IðñŒ÷ñü9§˜p‡ò>gÂIÀ_’]Ûøþ ›4÷*·ÎIšâw¬°ÏlÃf¾ñÓ1<ö®¶³'ÌÃe^ˆßÙÒj+†•¶tX¬Ö>í@iËÍȼ¦֜ïš _ª”ÿç±UóCüweåÞ®úÊkÊ.Ôtô¸Ú –G~ö ÞÏÍIîPi^ùá0®Vüuõ‚Í)1d}v¥¦ž¬M7 à [‚lIkU¨^ÿé(>WÚÒñîš…ª~=ž9Vê‘(“||üâåJ9ž’ßÐòáúÅ)~Þ+j+Û»z4ºd?/BŸ¹:ÙqÙ—+‡4Öss’Mƒ–v †Â¿Ïªé†„H.“ñÒ®ƒ¸ûðtIÕ§›WünfÜ•šBuª4„ùêÂÕ¿¬Juw.hlíPi:T@e[—Fo ªöÖ¾ã„C÷DQÅss’ÒÃüweåÝÎË˦Ó^øþ.%ÿÙ{iç³Ì òÅÍ5vÅñ“ùÒÎCªýûp”—û/ùeD+ÿrÃG‰ È“3ã{µºW÷Á}Wk>{lå£ÉÑï97ž>@ ÓŠ ÿÞuæg¯ýåe³Å"yñxÏÏl{ãïSarϸ§ëŸaöCV^ZÌIÀßI£Pveåš3=Ô?ëÏ/|÷÷Þ/ßíøÏ;mÿþ ‹N øc­ ŸÅœ¨÷2ÚË]«7^¬e&äN’¯WqS[¯VG§RðÙµ ]Å€ìÚÆA‹5ÙÏ ÏlËaºŠ 'M¢ÌË4hÙ{­p˜YQ7‡òJ9u¦´Zg0’§dªúõä‰u]½N6cÙÄhĆ BQèú›zÞ"á»GÜêº{)(êÀçľ—£ð|y ‚¢âWjÄ>ya”‘‹þE{¹W¶uá¢ÄÃÞÖ‘Ï=U\EK•4·“½ÐI2/­Þ˜]Û@œBÞÕÓ«írã†Êª®ópá1‡få$ûỹ¹õÍW;‡½íñ¢rrœ"aÌho÷âæ6bl×`6Ÿ,ª´å°¥"ûÛÕà4f—#$ EéTJ^C Bñ°·½]þÂÆ6Ü€~£©S¥qðÆSñ»·ŽGœúÆ 7';.{Xº»ÐVdÃ;URE 7÷)óZ"$näËäv}@¦m~xï‘Ïw½}àþþä*¦Q(@Õ¨Nv‚.¥`ØÐ?pã&{*η++÷O+æmLˆ\ŸqMÞXÛÙ32Oz¨ÿ±WŸªëê}kßñº®Þ“ðÓ¶Íw\Ý@}ëÓ ™FQÄj™ÀEG^§ZC–ÞäoA0hT!—#”r¼´eØŽøw$tcaSk’¯×·—r0 KöõByCŸ9 x*ͨ®G>ÐÚwó\«µC¥!¬“Ç|F3—9Ög¿((º*&tf€³­ ñü­jïcò)ð±<.“p±³<‘÷DjÜÈŠ÷Þ£6rŠ"H„§ëþœ"ü§ hSªÈyZ*Áõ]ìlx,ÆHÛÚÜøFGFUÝòèD_¯SÅ•Mð‘äÈ›ðHy\¾7õ*GVŠI£ñYÌ–¾áçÅÍNx:ÉuÇ'7p™Œ1låå(|$)*Àʼnø٠l}<¶èÍfâøw¬8 G«gþߥœçÏØñôFyWO~CË¥ 9îWêc·~Û¤µOíåÎg1 ñw»>@ ÓÁ¸ûJþƒW¯{½ž“¼«7»¶ñ•©Î¶6[¿Û?jž—ÓS-Öäw>íRkñÊcÝ£'€–‚ 
ûûJMáÜᓺoH“Ë•uÑ^î.Ne­)þÞu]½Ä¨+rw¡ˆ#eæØÜ2#vqDÐÁÜâòÖNÑhµb[ç$½ËÈÕ6ð*ãÿÝ™™[ÒÜ>,C»RCü=l‰?gÅ VÖÀÏ<ìäò Ò©Ò~|üÂÈËè?m u²¯×©âÊPwg63£J~kù'¿\Èhu¿-ö<·뗴+Õ__ÈîÖhMƒo‘ý³i‰ÈXÇÇFµíx*0{†fFU]YkG‚$ÌÃeYTȪ˜°ž¼DŽ´›¸¦o-<܇õ6wfæþwËê‘Ã|îö¶Í}JBœ’}½îÙ$‹.µ6ØÍEb"î:Â1˜ÍꃇM8]F‚{t’ý¼º5Z™“㎌›k|t(5±R&6Ò…†Ë;W¡ ²mÈ¿EAQ±€?ƉîHj€ôByÍŽŒ[&“ó⌓NµÀ¤Ñ&Tžh/÷N•–p\ ÕÑNßps‘6Ò„šN•&NêÙÔ«#Ú)³ºnm\„-‡âçÝo4ƒÚxˆ˜§ƒÝ•3N f³Fop ȉ®v@RÕc3RZÅJ=˜4껇ÏGp¿ýÈæÍ;ŽŠO…nàXaù±Âr.“ñÑú%"3ªêûßb¡À`$ÇÕA ä¾s¾¿¹ûJÞò~;ïƒÿ’ã„È4õ*<ìm=†Eã0è®_2uåaÒh¾bG"ˆ*¯¡…ÇbÌ ðÁ"²(<œ?£ªÎßE'õ$'rtbÀÛ`6çÖ7'ʼR|™¤hž+5õt*euly_Ü”×Ђa`Yd0‘ž$ã2xpÕxЛÌÃÆ¡0 PЛMœäë5éhµô©{ Ãð12[ÎXãàÑÞîäò7õ*»5ºy!~tê­‚\ÅDCã¶¢RÐMIQäƒPP”GªWFU‚€Tiœç•šâã­ US¯raX ¹H„s.·®9ÔÝEâ0{Ç Qç‡ú)ûä]½ã²­yøp'.ØóR)è°~2!ÆSñaf÷;rFK¥ (¾v6ŽÎ`ìPiðœøkϼ?}(ƒ›P)qËoh¹ËC düÜÿ™Ö`<œ7ÖšŸŸ½27ØïÚ_^þ1»`ÐbYÖܧìÕöOQy\íl>Þ¸”˜Sy¡¼vqxà¶¹Éö¶*M¼'ŸØs5?ÜÓåÍ¥s.”×Ôtö0iTO»É3ÿÛG2£ª.É×kULhU{9$ëjmc޼iM\˜§ƒ]QS+Š .¢òÖÎ#e*Íá¼’åÑ!^1?¯¡ÙÅV° Ì¿¡»ïLIõ8+R×Õíå¶42XÙ?Чë/oí¼^×”ä«5zú$™RÜÉ49>=ñÞÚEŸn^yº¤ª]©¶å°ýœÅþÓßî5¿ŸëaoKžr‹aØw—sþßâÙï¯]|¡¢Æ†ÅZÔÜ«$,|µ¦!£ªnId»½m~CË Åêlk“(“ìȸNLcÄ•âú„–yë˜ÝÎfþmõÂÏ6¯<]RÕ­ÑŠlx1ÞÛvüŒ·ZŒ·Ç»k+,ï7šfúˆ6¿0Î9>u]½€-3bñùž­EMmf‹åËç+,§¢èì ÙÝtÂñTœLª¿tKjìŸ>9lN €Ïb~±euVMCcO߀Ñìï"ŠñvÇW™Á0ìë‹Ùo-û÷ KÏ—Õ0hÔEá“yWf.¼B È´b:~Oð—üÒÿÙõú’´gf'ôé\/~kßqùöÿ»7g·X­üùÄS³ÒÃ,Vëõº¦ÿ]ÎùlóJ"ƒÎ`üÃî_ÖÆ…ÇûHføKû¦v¥zÏÕ|õÀÍ¢¼ú–~£‰Ã  úÁ0ìý#çVƄΠð‰ðtí7šä]=•7ö¿Ë¸Þ§Hõÿ]j¼Î`<[Z½33wüKü{éÚ¶¹É›’¢4jn}sykç7¯a¤øyÏ ö•wõ¾}àÔ¦¤h&}’^ÛÙóò®Cëâ#fÊ8 ºj@ßÐÓ÷Ãmfà¢½Ü æÁ²–[¾Ä•U]OEÑuñO¤Æµ+5ŸžÎˆ”¸Åx¹>>~±²­kN°ï¦¤èA‹¥[£»T)/m¹%èír¥|sJŒª_?,®²­ëµlHˆ\@£Pz4:b¬³[£{uÏ/›Sb–DáëÓ¾ûËb}Ú;RßÝ÷}Æõ…á1Þ2´>í{¿œÝ”ýDjœÖ`¸X./lj}wÍÂIw¼ñT|<ôg˪CÝ]’dE»ÔÚo.^#æ çÈ›Þ>pr}BÄ#IQƒKy[ç÷¹äÅÏ 2@°¦*$yéæoƧA«AÆÃŸWÌ´ZÞ=|šbª‘8?ݼâƒ#çFÆÞA ÈÃö¹ÉŸm^ÉzìÕ;Æò2i4ýŽ¿?ÿýŸÉüNœ}nD’ @P€ 7ÿü'DTýháõØÞϦ£ÿ ò`PÖÚ1m¿kö€aËa`Œ?<0@}™*\/†F˜j((:;P6?ÔOo2×vöBƒ@ ‡™Ãy¥/§§ö~ùîx2×uõŽ ú ù C£P¶ÎIlU¨ßûå,üyÈiU¨RÿöÙS³î¸j#°¯.\¶^÷´ê3ä7ŒÁl^¶ý[hÁiéSýqÿ‰ "èòç_ýëóOûI<ˆ$&þÕ_Þ€m @ r_@S¢"d÷¿-"Àw\{˜MÐj@ L•.B_Þ¼þw|·G©zjÍòq}RÑ=Ý,@ r˜]„ªu: W Šílíï¼GsÐ릛q @î5zh®›ŠSW½øúÞíïšÍfŸ×Öu'8heùÀÝØ96 @ ‡³ (ºAsœ’¹óÔ¿í‰~Žafµ6µuŒk§A3¨¯õU°u @~u¨~EU5¤ï;A @î'(4@ È´bHŸQ©”Ĉh@ ûõ޼þ»Í!2o­Nw%~0@ ä>ƒ €íýì7úv×¾Sç&‘g$ËÓRCdÒ@/Og1Š¢k_y«®¥uúT<>,xÃÂyR/ÞÑÓ›W^ùÉÎ Æ»ýdXTÿÆEóßÿjG·Bù õ“¤ˆÐ•sgýé³/µý¿ÅòÛp¹oo{rÿéóW Kª Zrü¼ûÒ³ÅUµã¹¹Ý=/_ìdo÷þ×ßÃŒŸ‡KŸ­ž7{Ft„n@ß«TßîcVãÉ3’7­ã²Y= ¥Z×oËçM«Z?µfùS«—  …•Õý'áÒY)_ï?|÷úì7§‹ØOây!'Ïd6O·² 6ÿ÷Ì–/ö¬nhºÅHŽ “º»¹‹E>¯¬¶îÛGÈ[™ ú¬Ø(w±“»Ø‰Åd>ùrnÁG›hþ©ÆÑÎ6=%ÁÇÃA£+5š¢ªšW§a+Ü ³ã¢Û»{*ë†+yHx¸ôÙŽÃÇÿµkoKg׊9©o<ùؤóŒäõíÿ®mjéS©ß{iëÜÄØéSåÄðЧV/»^Zþúöÿhtýx"›É4š qðpÏKŠËÌ/¿>»RXr­¸lÐb™ê²xKŒ&“¼©åþšhnbŠ"Mm|.gäV>‡3'!V©Ñ¶uwKÝÝîx´‰æŸR<]ÄÏ®[i2–ÕÖ©4:Ÿë*rœž­p7¤ÅÇä•WLHŸýù³¯¬˜õÞoç‘ã®}L{¤ÏtÚ†…óç%Ź9‰Œ&SsG牌«¸k}åÜ™o<ùØÖw>º^ZNäߺ~Õ–‹W½ôzãON±™Ìg×­œ—Ça±j›þûÓðÙ¦"¡Ýªy³ÂBÄöL½µ«ûØ¥¬Žž²ZoÞƒJkäw,êxòŒäZqÙä,ã*r|yóúèà«+¨¨Ú¾cÏŸ¶þNæéžºù™ñXgnbìêy³ežî”ÒÑÓ›UPüÕþCzƒðìºzƒñÍþ—g€ƒvý1À0lêăN'Äq€·¤¦±ÙbµÞßúnß±[©Ñ>üý¶‘[j >Ôëæ$zå± w<ÚDóOáÝBÙ´dAWŸâ˽‡úõúÛe›&­p™ºN¥P0€Y,7íIþŒ÷Rºç`Ði_¾ýFwMcóþÓçhTª‡û#‹ÓÇú€¢è?_92Я ¢ª°²ÆÕÉñ_o¾RXYMθjîìܲмòJ*…îïûÂ#kÝÅNûâÓÖúŽBÛoÿö¶|þÙìœÖÎî0?Ù×I& ©ñXïÕ-¬MŸÓݧ<™™= 7¸‹Ekæ§í?}^oèqv´÷óòGö÷?¼UPÔÑÓ7+.Ê^`£PkNd\-ªª,•’ øÛ‹CzwûŽ=-]WçäÈ0Og>‡=`0VÔÕ½˜E<‡ş=½f9‡Å:xîââÔd7±Èh2g•œÌÌ& €¢È̘¨˜à@¡€¯7šJkäÇ.e"ßýÈÅŒE©IÎŽåòúïÐi4©»ÛÏgÎIŽŒ°ŒfSKG×ÉÌìæŽN“ñÞK[‡þiëïê[Ú~8zÿÉe³–§¥J½¬V¬\^!'Ü(¶|Þ¬¸(™‡‡­ Ïb±6´¶ϸÒÖÕCdÀÅÙí´X&‡7Ñü8ÑAsblùü^•êdÆÕ©W ÔëŸ~Adòñžé"rt+”y…ye•ø&±ƒýü¤x©‡+FS¨Ô¥µuÇ.eá½ÑΆÿÃÑ“ýz=“A7Ž ÃZalKâæ'ÅùxsÙ,•V—]TŠwE‡ý—mO]ÈÉÃOÃa±Þyþ©‹×óñÄÛ51A”¨ð„°¡€?`0–ÖÈ_¾2¬ Útú¯<HŠKŠTÔ5|½ÿ0…‚Ή JìÚÑÝ{!'¿(pÈñg³ã¢¥&½÷ÕwéÉ þ^•õûO'¿È9;:¤'Ç{»¹ÒhÔŽžÞ3WrÊjëðMž.â7­ûþðqOq¸¿/Ãþð›]} bßañg~^žsbì… ªÖꪛœ¹€ošô•Þ{ŽÞþý"ÑQh;þ#,HIˆ ô;r1ãÏ¿ÅSÒ“þúÂÓä<™yE§³®‘ǰÞzúñe³g|øxKg×ô´þ³kW 6üôË“™W‰2/OKÕ èÇi½ÄðеésJjäÏýõ#Üaàs9xl™ŸÄ íøéïÃ:V+öÅÞÿ;xt¢¥ l¶mXc2›Og]3&†‡ {ÇeMU}ãÑ‹™T*%!,äÅMë>þî‡>•º\^o4™Âýed}î/ë×ëk›ëåÍëmù¼œ’òÖ®n‹,“ xÞn®G.d  iñ1.]ÐÖÝÓ£P^),¡R)Iaß8b0Ý  
ÔׇÇá\+.ÕöˆìãÂ]·¿»Í'hmmx›–,8uíðùËÁ2ï9 ±}*uNÉÇ÷Ñ% ƒeÞ¹e•—ó -Wo09;Úû{yâúÌÓE¼uý*£Ét¥ X¥Õ 6R/\ùJ<Œ&“Nÿ¿g¶6‹¥´¶îçÓȾ4r+ÜÑ’,&ãÅMëØ,æÕÂ¥Fë.-œ‘(à󜹠í¨mj ÷÷%ë³P?Eñ7ɱ›xÉÌäÔ˜Èʺ†ÌüB‘½01<ÄÃÙé“?—Øíz ÉlþÏžýO®^VÝД‘Wè×t-9*¬ ¼êzI9•Jð÷ݼl¡å uŒñ§V/Ï-«¸t½ÀÉ^¸bNêò´ÔÝÇNá›ÜÅNÛ6®V¨5ç¯åM&üÍêûÃÇ‹«k‰Ý—ÍžÑÜÑùã‰3†‘ï`ÃpvtøÝÊ%MíÇ/_´ 6øm p7W:|–C<¦\Ÿ¡(ºlö …ZóùO?“Ó»û&pEÍKŒ³Z±/÷"RNeeoY±XâêL¤¨´C:Æe³¹t½`yZjd ßôÔgM‹ilë8••M$~ýóá%³RÆo½sRÿïBœˆ¡L[`yZª¼¹eóihm–yÿù¹'·®_UÛÔ’™_4¡ÏOŠ§Ó¨Û¿ßÝ£PòÊ*ß|ú1b+Ã^:{FvQ)áØË).ã©Ísb:qfÐb)­© ‘ùì;yH¢ÓhR¯ÜÒ «ÌOŽ l¾Ú¸²®ßýìÕë‚¢<œÅ‡Î]Â?YÅ.]’UPÜÑÓ‹gfÒé7d„ÝÇN½óüÓ±!Ç.eõ*U½J5 ¡µüT>™™MŽÀklëØ´$ÝÇí¦±yÔºsÙìïÞÏÉméì’yºÇ‡ãú,ÈÇ;ÔÏgבCݪ†¦ç7® ÷÷%<ëÛGN€·¤µ³w5…ȤÁ2éùk¹ÄÓý"ÈGF[g$Q."‡ŸNœÍ))\-*Ùº~9CuCSYm!=óÊ*_rs|Xð°0ùû‚ ‹S“;{ûþ½{?.Gj›Z¶m\M<ãmxÜ%3“ëZZ¿øé ¡Wã¬MŸcüø»Ýjí¤#lè`k زbInYEMc³›“hfl¤ÐÆæ“]?â½nX+ÜÑ’óãø\ÎÇßíÆÁÕBЧR§''\¸–«Ôh *ªÖ-˜ëáìÔÔ>¤ª#|»ú¸«rŒ& lfDG”ÖÈ¿;t o¦ŽžÞÕófÇ…eÝ1 “7·Z­˜Z§“7ßœ3n0šÞþ÷׿ÁAügf~ádfLäú,¯¼òÌ•@sG§£ÐvfLäÏg.à—Éšùi½JÕö{pûg?»nåâ™Éd}¦Ñõå_‰;‚ _ì=H”è‡ws¥? OkÈÃÆ”¯æâèÀçrÊjëîf¶ ‡[W_ÙaŽaXé­O;A6,š÷ó'\Ùýõ™o>;ýõ§ÿzó€½­à¾[™Íd¾öÄ&âß+—\YLùÁ èîSvöôßzþ^’~½¾â†¦Þº Àöÿþñïryý€ÁSRþÎçßÖ/œ;ÑZH%•õ¸8ôëõ„þJ½¨JvQ)•BÁÿ™ÌæšÆf©»+ž¡°²šÉ û{Kˆüt­°rhl"ÌÏ·¾¥­òÖŠà– ”zM&òsò7¨nl"ªÞ`ìUª„›1êBˆ3A¨ îÕssÝ.¿n@O^0¥¥£‹8~˜ŸlÀ`(­©#*ÞÜÞ©ÖêÈ¡ñ£©\^?ÜžÞ¢áBý|̃ƒøÓqXõïÜ.Þ’ƒ!·¬œØ+3¿œÁd6ã‡Â+«Öê:zzG†Éß/Äö>/»¨”Ð^u-­m¤!ø@©…B9uì¯Åkähgëd/ÌÊ/&ÄÙnt•A§ç–Uì;u®¨ªæè¥ÌÓW®¹‰EA>Þ£¶Â-æ'«ilVª5D[—TËñvw”ÖÔY,–p_<3ŸËñru!Â0Æhb/OAÈy9%ez¹‡ÑGÃ0BQPHU}£›X4Æ.…7#FZ;»Qð¸ø-ÔEäp½´@T¼¸ºV(°±ár‰]ò+ªÆÓcõ#Š¢!¾Ò‘¯Sq¥C ¿]¦ÜÆe³½JÕÝ„Ãb5´µKT¨ÕäŸÏmXõزE¹e?8Ó­PZ,NöÂ7ŸzŒN£Ýw+³˜Œ5óÓˆŸMíß8Âa±Àh¡? µÚ†Ç§õ¸!˜F‚{ šÚ:ÈD|LÄ×ÓcBU`Ðél&³«WAN$Ÿ÷UŒŒ'”eucÓ€Áî/ÃÝHáþ2µVWßÚn¸<‹*{F=µ-Ÿ§PkÈÏf\©ÛÙð‰òã`4™ØLæÕáqØéÉ R/òŒE&ƒ~»üª[›É48HßQhËf2?úÃó£öü¡ÝµÚa.‘ƒ Kˆ6{[AŸJ=¹@lù¼>•špÎiEfÇEGú;ØÙEbæÍ=†A§á=ÿFo×àí8lˆªG©²á ­Sc/°2øÛר›Àøüò*"%¿¼j~R¼—«KIµ|d+ŒmI:fÃãÚð¸õ…‘7(À€ÁPÕÐæ'ûåB†aa~2A *ªîØÄ¸Â ŸËb±ö©ÕB`<=ðv„ùÉRc"\©T ‘HAÑÛÍ„ )Íf›Å8Ø Kg¥,%¹ö‰û±—R­O(¨¨Ž zdqúò´Ô𯿲Úú¢ªjÜæSq¥C ¿]¦\Ÿáa 'þØ"ßAÀ­Ï6@¿^?rQ1;››¯MT eíü9UõÏýõ#âö8M¬Ü§RG­Þ<,n»^w´ž®_oo{Û×G<´eX,†a“Θ`%0@FK€¢àË}‡ð0/"Ëb±WÕFúÑi4Eü½$YE„_g´ƒO‚<½f…€Ï={õzgoîK{ᑵ(ŠÞ¾—b#BØ O¥&Bõ ȱÕfóà°­ÞmÿÀ͉·÷;Œê“@I¾AÆ^(aé¬É‘a¯ç×5·â¥Z5w6r¯¬=Œè €•sg?_þàŸ£·@†™zT×ÌM£›O­íhnN¸ÁÇ1YLƨ­0¶%ñsWÕ^ÊͶIqCšTTJ½¼\]êZZ#|[;»o¾ÃÜÝcõÀÑ–I7/[XR-¿˜sR;0`µZãBƒcCÇÜ õøuqüòyóð%HÈohãœk2›?ýa¯‡¾x¸¿ojLħ»öÞƒål ßS®ÏÚº{4ºþ o&ƒ>ê ¶¿ÜxE# "Fqj›ZbC‚DB;bˆA`Ò … Ëb2ªšÈムþÓÙô­Ý£)PêENt°³urü;Z¯²¾aFtyŒ†LMc³n@ïî,"¿4Ûòy¯¡µ}B¥5šÌƒ£í°Òãa^†á˜)¬¬Ž ð–ШT*•BlM&•?qdE„66MíãõqŽìä‘~ µfÇá›a‰á¡“k2ÜO§Q-Ÿ+†a5Íx¬gBxÈêy³ƒeÒÂÊ껼Ò!Œ)?³Z­‡Ï_¶³áo]wK¤-ñh¯ªo,œ‘ˆ;`éÉ !¾RræÓW®¡(òôÚåDÊü¤xòä¥Fk4™CdR*eÈç.vZ·`ÎUjv\ô†Eóîò;«õ\öu‰«3y=Û'W-¥¼8w´ÞÁ³—¿|#ylŽÏåࣺæÁÁc—²<Þ†EóðM‚à!Ï#‰£ÐÖÓEL— yƒ¿—'áÌc3™QA7pi­|pÐ’žO4Ã&þ–7·jtýáþ¾áþ²>•š<êZ\]ãíæê+¹eÔ?uYmƒN &ÒgDGnu7 Ü=†ÔààO,¢¿R£#&ÝŽ…Õ eáŒDr"…‚Ž1ìÂe³ÜÅNò›å/®ª¥Q©iñÑ#«a˜B­ññ¸Í–NV“uõl&3êÆÛ‚ ø: äú“Fa~2Á¯ú‰ >—ãé"ct˜ŒB­)­‘ÿ=½j­..4ˆèù^n..¤ð¸ry½Åb™—G¾4p t+”½}ÉQaänF§¨ªÃ0rÏIT76Ú w´dae§‹8XvË݉Åd}Þd6—ËëCý|ðKƒ¼ÐM\Y߈aØÌ˜("=6$ˆÍb޳‡ãçÖß0€‘ß8ìlø¡~>“kß®>EGOordø°;Ùæcß:È +gs{'¸áѼË+yÀ¸ëk|µÿPd ß†Eó"ý®—VP)owW‘hÉs¿´tv]-,IÙùþÛ%5rw±S¸¿oVAqRÄÍ·½W§&/™™â*r,¨¨v‹fÅF]/- Á´Z­Ï^\¿pîÎÞ¾ZTÂçpæ'Ç_+.›E.ILpà‚”€‡³`AJyz*+›X`vF†Od»V\¶(5éñå‹ë[Û\EŽWgÝÀÍÆòÊ+“#ÃÖ¦§‰„½*uˆL:,< \^8`0´uu;;:Dúã‚_) ¨ƒíœ„@e]#1ð7#:‚N£âÁà¾: ¸ZXŠÑGú-™™Bž{;vþa`vìrÖÆEó·m\“_^Ée³“£Â:{ûˆ05µVw<ãÊ’™)/oÞPXYm0šœí™ ú®#'ûNÛº~ÕkOlÊ.*Uj´v6ü ï¿Ù èQ(/ç¤ÆDÒVQk›ZÜÅ¢ˆ¿ry=î¹Ù w´ä©Ìl_OÇ—/Ê-«hnïbÐibûP?Ÿw¿øŽðGVV‡ùÉfÇE7¶u£KÇhâ>•/çSk–WÈë…v‰á!m]Ýã_õº¥³+ÀÛkFt„Fׇu–ËëÃüdO®^V\UÃçrR¢Â»û”øêq“àÇãgžÛ°êµ'½ZXÒ«Rñ¹±“½­à½¯vŒçÖAfáŒD7±¨BÞ ÔhYLFBX°ÑdÂUò]^éÈƽÐg£éé·ßß´dÁ¼Ä¸µéi£©©½cç/7õÁ[ÿúïË›×§FGzº8WÔ5<õç÷fDGõ™Õj}ùƒn]¿jnbl Ô»¶©ùÅ÷¶‡ùùú ðÙî}Czr†…óÚºzþ³ççë¥åÃô™§‹xQjñ3ÈÇŸÉUÓÔLÜ Ç“g$QAþäs͸áÙsüôú¬[¡üÝß}éÑu3¢#0 Ë/¯zòOï}üÚ‹äÐ¥;Zï£ow•ÕÖ­š;{ÑŒ$@GOï¾SçˆÉ]ÿ–·þö̺©Ñ~½JÕžã§¿ÚwhŸèUªþ³ççåi©ó“âûõCëÓž9À•Ââ>µzfLä̘( 
Õèúë[Û²‹nù°4.eÄÌMœ~½þ“?¦''ùxdžjûäÍ­J -ØvpAJb¨¯O\(S©Ñ»”5ÿN[WϱKYø7ïÁ×§ýæÀ/Ëg§ÎŠÆ0¬ª¡ñ‹½ß~îÉI÷ð]GNÖ·´Å…/HI°X¬J&¯¬²¶ñ¶ß JêZÚÈ |`öÝ¡£³b£¢ƒü$z£±¥£‹ƒ¾”›Ïçr¢‚üý½=åM-Ÿÿøó¶kˆ}-ë—û-OKMÁWUýåBÆkOl"2>Ã@¸¿,.4¨¥£ë«ý‡¤$2è7çÍ„ùɈY‡"¡Ý‚”D@¿Þ@è³´øB©ø{yú{yŠ«jo·(ÿDóç•U¢š½löŒ¥jaþ2g1‘ábN¾R­M‰˜—g±X» bµÞ†Ööíüi~r|rd8…‚*ÕòXç‘‹™šþþ„°?‰‡¶àüµÜS™Ù·k…;ZrÀ`ød×sâcB|}"üôFc¯R}*3›\¯ yƒÁhb2èÄÌ€ñ4ñ‘‹™*­.!,xÙìz£)§¤üØ¥¬ñ‡dýr!cmúœ…3iTjE]Cýþ¶¼²J>‡“ºjîì^•êÈ…LŸ;i}ÖÒÙµ}ÇžyIq1!,C7 oëî9™™=‰C•ÉëmxܸР.›Õ¯74¶u|wè¾:Ò]^éÈ‚5U!ÉK† ý7þ`²¡uî=Lýì7ÿ.­•o}ç#hEÞ}qëÉÌ«¸o 2*¯<¶Álül÷>Ø [V,v°³ÅšÈCƒ @P€ 7ÿü'ÿ `´HlïgèÃd¬é±”ÎÓkV°˜ŒË¹ð™ñÀÂa±.çŒÏþÐBAQr¬’§‹ØÍIT;•,‡­p/ás9c|ÿŒ šàþrôó\-,ilë Óháþ²`™´¦±ùйKÐ2*Úþò# 7±hÕÜY…•5]¿ÈÞ.92L7 Ï*(‚­ð[ÇUä,“º‹ µ!‰õÙ}æèÅÌØ „ð:ÖÕ«øáèÉo™Ü:¥Èo¥Z«ÔhgDG°™ £Ù\Q×püò•1¾áù­èp½´üÒõh dBÀø3@ _@ ò`õ@ Èôê3@ éÔg@ Ó ¨Ï @¦PŸA L/ >ƒ@ ™^@}@ 2½€ú @ dzõ@ Èôê3@ éÅoXŸ=µzYÞþïežî°!¯¬¶îÛGÈ[™ ú¬Ø(w±“»Ø‰Åd>ùrnÁG›hþ©câyÑA#Óÿ³g¿¼¹uºµÂÝ0;.º½»§²¾ñ!¹¨!‡‡KŸí8|ü_»ö¶tv­˜“úÆ“M:ÏH^ßþïÚ¦–>•ú½—¶ÎMŒ&õmíê~û?_“S˜ ú™o>S¨5W Š’F÷pÏKŠËÌ/¿>»RXr­¸lÐb™ê²xKŒ&“¼©åþšhnbŠ"Mm|.gäV>‡3'!V©Ñ¶uwKÝÝîx´‰æŸ:Jªå=·zvS¢ÂiTjsG×4l…»!->&¯¼bBúìÏŸ}eŬ÷¦x;ØCr¿@~5î‘>cÐiΟ—çæ$2šLÍ'2®â®õ•sg¾ñäc[ßùèzi9‘ëúU[V,^õÒëmx ›É|vÝÊyIq«¦±é¿?v ‘ÐnÕ¼Y a!b{&ƒÞÚÕ}ìRÖGOY­7ïA¥5ò;uMZá3u=œJ¡`³XnÚ“ü7ï¥tÎÁ Ó¾|û ïšÆæý§ÏѨT÷G§?ôEѾþrd _AEUae«“ã¿Þ|¥°²šœ'&$pÕÜÙ¹eyå•T %Üß÷…GÖº‹þöÅÿ¦­õ…¶ßþíÿlùü³Ù9­Ýa~²¯ßyS­ëŸõ^ÝòÈÚô9Ý}Ê“™Ùzƒ»X´f~ÚþÓçõ†ž‘g\23pôbæä ì"rX‘6ÓÝÙ©@µ¨D©ÖËâ+åìè`µZëZÚŽ^Ìììí„úú<¶|Ñ—ûU‘^ñƒeÒ-+‰k~r|Ô‹ÇaëôµM-G.fà`L==9!ÔׇËf)Ôšœ’ò 9y6ôJþ÷?¼UPÔÑÓ7+.Ê^`£PkNd\-ªª,•’ øÛ‹CzwûŽ=-]WçäÈ0Og>‡=`0VÔÕ½˜Õ¯×ãy†ÅŸ=½f9‡Å:xîââÔd7±Èh2g•œÌÌ& €¢È̘¨˜à@¡€¯7šJkäÇ.e  øV|÷#3¥&9;:”Ëë¿?|@§Ñ¤în?Ÿ9O$5:2:8À^ 0šM-]'3³›;:YLÆ{/m6Jø§­¿«oiûáèIü'—ÍZž–(õ²Z±ryý…œÈÇ›Ëf©´ºì¢R¼+ò8ì¿l{êBNùÔë矺x=O¼]I‰ O  øciüøå+úШ=A§ðÊs€¤ˆ°¤ˆ0@E]Ã×ûS(èœøØ©Ä^  Ñ¨ݽròð‹‡6;.zQjÒ{_}—žœàï%APY߸ÿÔy¢gG‡ôäxo7WÚÑÓ{æJ¡}=]Ä/nZ÷ýáãž.âp_‡ýá7;»úľÃâÏü¼<ç&Ä:Ù )T­ÕU768sß4é+yð¸úìé5+‚|¼÷;½ýû›éŽBÛñaAJBd ß‘‹ï|þ-ž’žœðמ&çÉÌ+:uüfüÖÓ/›=ãûÃÇ[:»¦§õŸ]»R(°ùã§_žÌ¼J”yyZªn@?Në%†‡®MŸSR#î¯3>—c0šFžÎÙÑ!2Яº¡©vRC9BͶ kLfóé¬kæÁÁÄða¯à¸¬©ªoO*Ä×ÇÛÍõÈ…Œƒ!->æÑ¥ Úº{zÊ+…%T*%)"ìÛG F# [¡„úúð8œkÅ¥Úþ±ƒ}|X°‹£ãöï÷O‚aØÚð6-Yp:ëÚáó—ƒeÞsbûT꜒!ï£K˼sË*/çxÜäÈp7'Ñ'»~$Üvþæe‹Nef·u_F‘¡IÓ2Ow ­¨k òزEÁ2i¹¼>»¨”BA%.ÎWgüá=6 úÌÚ•Növ™ùE½Juˆ¯tÓ’trW'‘Ìý´V®PkxvBXÈó×|øÍαeÙ½$"ÀwâyMíy…\6{Ý‚¹jŽœ!%*|yZj[W÷ùk¹zƒÉÙÑÞßËWZž.â­ëWM¦+Å*­N(° ”zÝNŸ©µºaŸÜ w´$‹ÉxqÓ:6‹yµ°D©Ñº‹E g$ ø¼g.hûj›ZÂý}ɧõóAQ“»‰—ÌLN‰¬¬kÈÌ/Ù ÃC<œ>Ùùq‰Ý®šÌæÿìÙÿäêeÕ My…€~½@§Ñ’£Â Ê«®—”S©ÔßÍËZZÇxjõòܲŠK× œì…+æ¤.OKÝ}ì¾É]ì´mãj…ZsþZ®ÑdÂ߬¾?|¼¸º–Ø}ÙìÍ?ž8ƒaù6ò.ô»•KšÚ;_¾2h lü$žø¦»¹Òá³òà1åú EÑe³g(ÔšÏú™œÞÝ7+j^bœÕŠ}¹÷‘r*+{ËŠÅWg"E¥zØÐi4.›… È¥ëËÓR#ý¦§>£ hZ|Lc[Ç©¬l"ñëŸ/™•2~ë­˜“ øø?G35·zà¥&!rd²Î³ùIñtuû÷»{*@^Yå›O?FlåqØKgÏÈ.*%{9Ååo<µyNBìO'Î Z,¥5u!2Ÿ}'ÏáIt-Pê•[Zaµb€ùÉqBÍWûWÖ5ໟ½zA@T`€‡³øÐ¹Køã'« øÑ¥ ÂC² Š;zzñÌL:ýã°ûØ©wž:6$ðØ¥¬^¥ªW©4´¶î1ÀÉÌlbÐØÖ±iIº‡[Mcó¨uç²ÙÿÞ½Ÿ“ÛÒÙ%ót ÆõYw¨ŸÏ®#' *†ºU MÏo\îïK¸j8,Ö·ŽÇÚÞ’ÖÎnÜÕ"“ˤç¯åO÷‹ ¯þ‰ p9ütâlNIàjQÉÖõ«ÈªšÊjëé™WVùú“›ãÂOd\‚ ‹S“;{ûþ½{?.Gj›Z¶m\M<ãmxÜ%3“ëZZ¿øé ¡Wã¬MŸcüø»Ýjí¤Uœ¹Šì…dgÌÈV¸£%ç%Æñ¹œ¿Û ‚«… O¥NON¸p-W©ÑTT­[0×ÃÙ©©}HUGøvõ)pWåM,ØÌˆŽ(­‘wè^¼ŽžÞÕófÇ…e݈½]Ä0LÞÜjµbjŽ<éÁ`4½ýï¯ÍƒƒøÏÌüÂ?<þÈ̘È1ôY^yå™+9€æŽNG¡í̘ȟÏ\À/“5óÓz•ªí;öàöÏ*(~vÝÊÅ3“ÉúL£ë'Ê?¾wA¾Ø{(ÑïæJÿM?†!Q™òõÏ\ø\NYmݨqâãáÖÕ×Gv˜cVzëÓA ‹æýüÉWv}æ›ÏNýé¿Þ|`o+¸ïVf3™¯=±‰ø÷ÄÊ%W'G“A~pºû”=}ã·ž¿—¤_¯¯¸¡iÆAE3’̃ƒd98!¤’ÊúF\œúõzòPQ Ô‹J¡d•R)üŸÉl®il–º»â +«™ º¿·„ÈO§Ñ +‡Æ&Âü|ë[Ú*o­n™@©—Ñd"/pp1'pãP€êÆ&B¡ê Æ^¥J(°£.„8C„J¡à^=7'ÑíòëôäSZ:ºˆã‡ùÉ †Òš:¢âÍíj­Žo0šÊåõÃíé-!.ÔÏÇ<8ˆ?‡UÿÎíâ-0rËʉ½2ó ÉLf3~(¼²j­®£§×Uä8MîAb{Ÿ—]TJh¯º–Ö¶®nr×¢P(§³®‘ýµxílì…YùÅ„8»Ý¢‚üÃ7‡µÂ-æ'«ilVª5D[—TËñvw”ÖÔY,–p_<3ŸËñru!Â0Æhb/OAÈÚ1§¤l@o ÷ð1zà¨`F Š"©ªot‹ÆØ¥°âfÄHkg7Š¢¿…ºˆ®—VˆŠW× 66\.±K~EÕxz¬Þ`DQ4ÄW:òõc*®tä·Ë”ûϸl W©º›ƒpX¬†¶öa‰ µšüó¹ «[¶(·¬â§gºÊA‹ÅÉ^øæSÑi´ûne“±f~ñ³©½óÛG8,-ôG¡VÛð¸ã´—Ã"ÓØDú9;Ú_ÈÉ#?ÉÆƒNg3™]½ 
r"ùÔ¶¶€‘ñà„²¬nl0Âýe¸)Ü_¦Öêê[ÛÀ —gQeϨ§¶åój ùÙŒ+u;>‘2¬RF“‰ÍdŽQ‡žœ(õ"ÏXd2è·Ë¯ºµ™LƒƒÄñ…¶l&ó£?•:jõæa‰øpÛØõº£õtýz{Ûq½>.NM·ú £%@Qðå¾Cx˜ƒe±X‹«j#ýè4Š"þ^’¬‚"¯3ÚÁ§ Až^³BÀçž½z½³·÷¥½ðÈZEoßK±‘!lЧR¡úäØj³ypØÖo‰¶ •ðÝÞï0ªO%ù{¡„¥³f$G†]¼ž_×ÜŠ—jÕÜÙȽ²ö0¢ƒVÎEü|ùƒŽÞFfêQ]376ÝÁmã'ñà²Yç¯åŽÝ c[?WqUí¥Üüa›7¤IAEU ÔËËÕ¥®¥5"À·µ³ûæ;ÌÝ-01V`™tó²…%Õò‹9'µV«5.486$p̽QO_Ç/_‘7[%¿¡s¬Élþô‡½>nøàáþ¾©1ŸîÚ g”C Ør}ÖÖÝ£Ñõùx3ôQé´ýýàÆ+1ŠSÛÔ$ÚCœ‚ûxlx\“QÕÐD~÷ ôŸÎ¦oíì6MR/r¢ƒ­“ƒpàßÑz•õ 3¢#Èc4£Âf2gÅFÝͲgF“yÀ`p´³VZâo<Ì Ã°13…•ÕñaÁÞ•J¥Rˆ€-£É„Ç麗R£õ•¸S)â.ÚÒCqlF>ØDB;‘ÃÇÏKº6ÈdKŸZíìhßÞÝ;!ïW Ô«¢®ž([¯JäãM§ÑFÄh2[­Vƒqó¢¥PÈn?…Z#õpE„8Ú°É7‘~yå•äI»sXþ=£¢®A}ð(9oG;[òò]ä·Ž¥ v’㫆6)”±ƒ=9j$QAþ†å—WŽÝ c[Òh2élxÜ1zxYm½yp0Ü_¦P«ÝÅNäס1š¸O¥ÆÏEE¡MSûx}œ#;yd€ŸB­Ùqøf@Xbxèäš ÷ßÓiÔ_ËçŠaXMc3뙲zÞì`™´°²ú.¯täcÊãϬVëáó—ílø[×ÝiK<Ú«ê› g$â@zrBˆ¯”œùô•k(Š<½v9‘2?)ž<9@©ÑMæ™”Jòù‹Ö-˜3E•š½aѼ»üN€Åj=—}]âêL^ÏöÉUK)$/έwðì%ÀïßH›ãs9ÃFuç$ݘŒW&´È“£ÐÖÓEL— yƒ¿—'áÌc3™xLNi­|pÐ’žO4Ã&þ–7·jtýáþ¾áþ²>•šìŽG¶ùØ·2ÃÊÙÜÞ‰Ûðî¯täã^¬¯ñÕþC‘~Í‹ ô»^ZA¥P¼Ý]\E¢%ÏýÐÒÙuµ°$!'"À7¯¬’ˆ[3? Ïäãغ~%þ0øbïA|°QùüÇŸÂCþö³ɑam]=a~2g'ys+ÙM5¶õ®ï;unÍü´ƒÿúèr^Þ`turL YóÊ›íÝ7ùÏL½8±ÁÍ¥³fxK^ýû§øëìé+ׂeÞÛ6¬ÎÌ/´X‚ÕZq·ÕèúŸ¿¼rîÌW·<’_Q¥ÐÛÙðý$­]=?8ƒçÁ0¬¨ª&!<`ØÊR§2¯x{=¹jiNIy[w›É ”z<{±¥³+¿¢21"dyZªHh×Ù«ð•¸ùx_-,¹cÔ>zµtVJQUÅb­nhêQ*{ÊÅ©É<»_oð“xØÛ ÆŒ?’’yaeuJT¸“½°²¾qppÐQhêësôRæÈht/‰Õj­&Í-©‘—Ëëç$Ä:;:T74¡(Š‘áÙ®—-JMz|ùâúÖ6W‘£ÄÕY7pSjä•W&G†­MO;{Uê™tXx@¹¼>&8pÀ`hëêvvtˆ ôÇ6!¾RÜQAAQ;Û9 1€ÊºFbàoFtFŃÁ}%tpµ°£ ô[23…<÷vìüÃÀ0ìØå¬‹æoÛ¸&¿¼’Ëf'G…uööajj­îxÆ•%3S^Þ¼¡°²Ú`49;Ú3ô]GNö:·uýª×žØ”]TªÔhíløA>Þ~³“8~¨¯ŒF¥æpžl…;ZòTf¶¯§ÇãËå–U4·w1è4±ƒ}¨ŸÏ»_|Gø# +«Ãüd³ã¢Û:ÈÑ¥c4qŸJ}9· 5&ò©5Ë+äõŽB»Äð¶®îñ¯zÝÒÙàí5#:B£ëÇÃ:Ëåõa~²'W/+®ªás9)QáÝ}J|õ¸Iðãñ3ÏmXõÚ^-,éU©ø\އØÉÞVðÞW;Æsë ³pF¢›XT!oPj´,&#!,Øh2á*ù.¯täã^è3ƒÑôôÛïoZ²`^bÜÚô4ƒÑÔÔÞ±ó—ãD†·þõß—7¯OŽôtq®¨kxêÏïÍˆŽ ë3«Õúòÿܺ~ÕÜÄØ@©wmSó‹ïmóó!ôà³Ýû †ôä„ çµuõügÏÏ×Kˇé3Oñ¢Ô$âgw7 ¦©™¸Ž'ÏH¢‚üÉçšqóçøé1ôY·Bù»?¾ûÒ£ëfDG`–_^õäŸÞûøµÉ¡Kw´ÞGßî*«­[5wö¢I€ŽžÞ}§Î‘'O¸9‰Âüd•õwùÉö^¥ê?{~^ž–:?)¾_?´>í†Eóˆ W ‹ûÔê™1‘3c¢(T£ë¯omË.ºåÃÒ¸”37qúõúOvþ˜žœäã¨í7·*5€ÕŠ}±÷à‚”ÄP_Ÿ¸P¦R£=v)kä¡·£­«çØ¥,ü›÷‚àëÓ~sà—å³SgÅFcVÕÐøÅÞƒo?÷ä¤-³ëÈÉú–¶¸Ðà) ‹U©Ñä•UÖ6Þv‘¹©¤®¥¼À†aß::+6*:(ÀOâ¡7[:ºZ‡æÄ\ÊÍçs9QAþþÞžò¦–ÏüyÛÆ5ľ‹õË}‡–§¥&„‡à«ªþr!ãµ'6Ÿ¿Œa Ü_ÔÒÑõÕþC Rô›Ö0?1ëP$´[’è×}–C(/O/O@qUí¨zkùóÊ*QM‹^6{FRµ÷ÄÙ0™‡³˜Èp1'_©Ö¦ÆDÌKгX¬Ý ±ZoCkû¿vþ4?9>92œBA•jͰ±Î¨ £ÉTZSwÇV¸£% †Ový8'>&Ä×'2ÀOo4ö*Õ§2³Éõª7Œ&&ƒNÌ O¹˜©Òê‚—Íž¡7šrJÊ]ÊHÖ/2Ö¦ÏY8#‘F¥VÔ5ÔïoË+«äs8‰¡«æÎîU©Ž\Èð¹“Ög-]Ûwì™—Àb0tú¶îž“™“™ ^&¯·áqãBƒ¸lV¿ÞÐØÖñÝ¡cøêHwy¥C ÖT…$/6ôÜøƒÉ†Ö¹÷0ô³ßü»´V¾õ 5HPy÷Å­'3¯â¾1Ȩ¼òسyð³Ýû`+<lY±ØÁΖìÔ„@¬A‚¹ùà?øO£E`{?C&cMGˆ¥4pž^³‚Åd\Î…ÏŒ‹u9·`ìxö‡ Š’c•<]ÄnN¢Ú©ü`9l…{ ŸË¹ëÜ*4Áýåèçÿ¸ZXÒØÖA§ÑÂýeÁ2iMcó¡s— eT´ý“^"øAÅM,Z5wVaeF×/²·KŽ Ó è³ Š`+üÖq9ˤîb'hmd¢@}vŸ9z136$(!<„N£uõ*~8zòÛG&·N)òE©Ö*5ÚÑl&Ãh6WÔ5¿|eŒo8B~+DúE\/-¿t½Z™0þ @ äׯŸA ƒ@ ™^@}@ 2½€ú @ dzõ@ Èôâ7¬ÏžZ½,oÿ÷2OwØŠ@$¨UmãBƒ’#ü%¾:öÑ·»ö:7‰<#Yž–"“úxy:‹Q]ûÊ[u-­Ó¤Ö)QáÍ—¸:³ŒÎÞ¾Ìü¢]GN(5Ú»?rTÿÆEóßÿjG·Bù õ“¤ˆÐ•sgýé³/µý¿ÅòÛp¹oo{rÿéóW Kª Zrü¼ûÒ³ÅUµã¹¹Ý=/_ìdo÷þ×ßÃŒŸ‡KŸ­ž7{Ft„n@ß«T;;ÚO:ÏH^Ü´ŽËfõ(”j]¿-Ÿ7}ªüÄÊ%Ï®[©Ñõ_Î-ÐA>Þ.]071vëÔèú†F÷tûI W4òðp鳇ÿk×ޖήsRßxò±IçÉëÛÿ]ÛÔÒ§R¿÷ÒÖ¹‰±Ó¤¾ :mËŠ%*­vÍËo*Ô<ñO[·dfò¢Ô¤=ÇN? 
îá,ž——™_4~}v¥°äZqÙ Å2Õe ð–M&ySËý5ÑÜÄ8EšÚ:ø\ÎÈ­|gNB¬R£mëî–º»ÝñhÍ?uØð¸/?ºØÕ®ßM,šáãáö{0 ›n­p7¤ÅÇä•WLHŸýù³¯¬˜õÞoç‘ã{n6ȯÉ=Òg :mÃÂùó’âÜœDF“©¹£óDÆUܵ¾rîÌ7ž|lë;]/-'òo]¿jËŠÅ«^z½±­Oa3™Ï®[9/)ŽÃbÕ46ý÷§ƒÃN!Ú­š7+!,Dì`ÏdÐ[»º]Êúáè)«õæ=¨´F~Ç¢Ž'ÏH®—MÎ2®"Ç—7¯°Z±‚Šªí;öüiëïdžî©›Ÿõpæ&Æ®ž7[æéNA)=½YÅ_í?¤7…ƒN+¨¨"Ä 3¿pÉÌd.öþÛaØÔ‰3n4™ð¿¼%5Í«õþÖwûŽÝøx÷‡¿ß6r«B­Á‡zÝœD¯<¶áŽG›hþ©#2ÀÍbþgÏ~yóP°Þ`œá*rléì"²M“V¸ÇL]§R(À,–›ö$ÿ @Æ{)݃s0è´/ß~#ÈÇ»¦±yÿés4*ÕÇÃý‘Åéã}@QôŸ¯¿èWPQUXYãêäø¯7_)¬¬&ç‰ \5wvnYE^y%•B ÷÷}ᑵîb§¿}ñ¿ik}G¡í·û?[>ÿlvNkgw˜ŸìëwÞTß:ìxG뽺呵ésºû”'3³ôw±hÍü´ý§Ïë =Ý}Š~½ÞÏËÓ–Ï#ÎÂB¹e“(°‹ÈaEÚLwg§þýÕ¢¥zx[ˆ¯tVl”³£ƒÕj­ki;z1³³·êëóØòE_î;TEzÅ–I·¬XL$rX¬ùÉñAR/‡­Ð×6µ¹˜G€1ôôä„P_.›¥PkrJÊ/ää.¿ÿá…¬‚¢Žž¾YqQö…Zs"ãjQU `鬔ԘHÀß^Ò»Ûwìiéì’¸:'G†yº8ó9샱¢®þèÅ,bØkXüÙÓk–sX¬ƒç..NMv‹Œ&svQÉÉÌl¢(ŠÌŒ‰Š  øz£©´F~ìRÖ€Á€oÅw?r1cQj’³£C¹¼þûÃÇtMêîöó™óÄAR£#£ƒì£ÙÔÒÑu23»¹£“Åd¼÷ÒÖa£„Úú»ú–¶ŽžÄrÙ¬åi©R/«+—×_ÈÉ#7Š-Ÿ7+.JæáakóX¬ ­mÇ3®´uõÆF´X&‡7Ñü8ÑAsblùü^•êdÆÕ©W ÔëŸ~Adòñžé"rt+”y…ye•ø&±ƒýü¤x©‡+FS¨Ô¥µuÇ.eáÝ@ÇÇÿ6)ÃZalKâæ'ÅùxsÙ,•V—]TŠwE‡ý—mO]ÈÉÃOÃa±Þyþ©‹×óñÄÛ51A”¨ð„°¡€?`0–ÖÈ_¾2¬ Útú¯<HŠKŠTÔ5|½ÿ0…‚Ή JìÚÑÝ{!'¿(pÈñg³ã¢¥&½÷ÕwéÉ þ^•õûO' pvtHOŽ÷vs¥Ñ¨=½g®ä”ÕÖá›<]Ä/nZ÷ýáãž.âp_‡ýá7;»úľÃâÏü¼<ç&Ä:Ù )T­ÕU768sß4é+yð¸úìé5+‚|¼÷;½ýû=D¢£ÐvüGX’èwäbÆ;Ÿ‹§¤''üõ…§Éy2óŠNg]#a½õôãËfÏøþðqò»ò´âÙµ+…›?~úåÉÌ«D™—§¥êôã´^bxèÚô9%5òçþú‘Þ`Äù\ŽÁh Z,ÛwìyëéÇ÷ò~F^¡Þ` òñöruùç÷?϶ñ#ØlÛ°Æd6ŸÎºfL ö ŽËšªúÆ£3©TJBXÈ‹›Ö}üÝ}*u¹¼Þh2…ûËÈú,Ü_Ö¯××468,ÖË›×Ûòy9%å­]Ý3X&ðxÚþEžY»Â]ìt­¸¬½»ÇÏËsQj’ ÿéóÄ¡B|}¼Ý\\È0Òâc]º ­»§G¡¼RXB¥R’"¾=pÄ`4º @¨¯ùV\ªí;ØÇ‡»8:nÿþ–a/2¶6¼MKœÎºvøüå`™÷œ„Ø>•:§dÈãûè’…Á2ïܲÊËy792ÜÍIôÉ® ·€¿yÙ¢S™ÙmÝ—QdhÒ´ÌÓBA+ê‚<¶lQ°LZ.¯Ï.*¥PP‰‹³ÄÕx …‚>³v¥“½]f~Q¯Râ+Ý´$œÁÕI$óp/­•+Ô‡òüÆ5~³óW™#ò«à»aѼ¦öŽŒ¼B.›½nÁ\µNGξ<-µ­«ûüµ\½Áäìhïïå‰÷aOñÖõ«Œ&Ó•‚b•V'ØJ½p=TÛÔ2'!vMzÚ±KY]¿«ÈqfldigÙ w´$‹ÉxqÓ:6‹yµ°D©Ñº‹E g$ ø¼g.hûj›ZÂý}Éú,ÔÏEQüMrì&^2395&²²®!3¿Pd/L ñpvúdçOÄ%v»h2›ÿ³gÿ“«—U74eäúõFKŽ +(¯º^RN¥R#ü}7/[h9hc|à©ÕËsË*.]/p²®˜“º<-u÷±Sø&w±Ó¶«jÍùk¹F“ ³úþðñâêZb÷e³g4wtþxâ †aä;Ø0œ~·rIS{çñËW-ƒBŸÄßt7W:|–C<¦\Ÿ¡(ºlö …ZóùO?“Ó»û&pEÍKŒ³Z±/÷"RNeeoY±XâêL¤¨´C:Æe³¹t½`yZjd ßôÔgM‹ilë8••M$~ýóá%³RÆo½sRÿïBœ[¿\ÈÐö¼½íÉ%3‡›‘Wxµh2“Ñæ'ÅÓiÔíßïîQ¨ye•o>ý±•Ça/=#»¨”pìå—¿ñÔæ9 ±?83h±”ÖÔ…È|ö<‡$Ñi´@©Wni…ÕŠæ'Ç 6_í?\Y×€ï~öêuAQÎâCç.៬‚âG—.HÉ*(îèéÅ33éô¿2Âîc§ÞyþéØÀc—²z•ª^¥ÐÐÚNŽ ?™™M 2Û:6-I÷ñp«ilµî\6ûß»÷ãsr[:»džîñaÁ¸> òñõóÙuäDAÅC·ª¡éùkÂý} Ìa±¾=p„ð7àxKZ;»qWSˆL,“ž¿–K<Ý/‚|¼úw$*0ÀEäðÓ‰³9%e€«E%[ׯ"g¨nh*«­#¤g^YåëOnŽ >‘qu:\‚,NMîìíû÷îý¸©mjÙ¶q5ñŒ·áq—ÌL®kiý⧃„^!Œ³6}Žypðãïv«µC’ްamSËÑ‹™sc_Ü´O¹V\6ÌmOn…;Zr^bŸËùø»Ý¸ ¸ZúTêôä„ ×r•mAEÕºs=œšÚ‡TuD€oWŸwUŽÑÄBÍŒèˆÒùw‡ŽáÍÔÑÓ»zÞì¸Ð ¬‚â±{ †aòæV«SëtÄ0.À`4½ýï¯ Oaf~ádfLäú,¯¼òÌ•@sG§£ÐvfLäÏg.à—Éšùi½JÕö{pûg?»nåâ™Éd}¦Ñõå_‰;‚ _ì=H”è‡ws¥?hOf䬿âèÀçrÊjëpÎäðñpëêë#;Ì1 +½õi‡ ȆEó~þäƒ+»¿>óÍg§¿þô_o¾°·Üw+³™ÌמØDü{b倫“#‹É ?8Ý}ÊΞ¾ñ[ÏßKÒ¯×WÜÐ4#Y3?íƒW¶ýxüÌ‚§_JÙôô ïýÃÇÃí»wÿD–¶ã$@*©¬oÄÅ _¯';á¥^T %»¨”J¡àÿLfsMc³ÔÝÏPXYÍdÐý½%D~:VX946æç[ßÒVykEpËJ½Œ&yƒ‹9ù€€‡T76 Uo0ö*UBÍu!Ä‚ T ÷ê¹9‰n—_7 '/˜ÒÒÑE?ÌO6`0”ÖÔonïTkuäÐxƒÑT.¯nOo Ñp¡~>æÁAüé8¬úwnoÉ€Á[VNì•™_HÎ`2›ñCá•Uku=½®"Çir;Ø ø¼ì¢RB{Õµ´¶uu“»…B9uì¯Åkähgëd/ÌÊ/&ÄÙ0»iúû;zú~¹±ãбŒ¼Â˜à€5óÓn× w´d˜Ÿ¬¦±Y©Öm]R-GÄÛÝPZSg±XÂý}ñÌ|.ÇËՅã‰ý½<!äå”” è ä>F ÃDAQ UõnbÑ»VÜŒiíìFQTÀãâ·P‘ÃõÒ QñâêZ¡À†Æš_Q5ž«7Q ñ•Ž|ý˜Š+ùí2åþ3.›èUªîæ «¡­}X¢B­&ÿ|nêǖ-Ê-«øéÄ™n…rÐbq²¾ùÔctí¾[™ÅdŸ Míß8Âa±Àh¡? 
µÚ†Ç§õ¸!˜Fâé"þÃ㜿–ûߟà)W KÞùüÛÿþùÿmY±øŸ~9þ*0èt6“ÙÕ« '’Oí`k N(ËêÆ¦ƒ!Ü_†»‘Âýej­®¾µ ÜpyUöŒzj[>O¡ÖŸÍ¸R·³á)äÇ3Àh2±™Ì1ªÃã°Ó“¥^ä‹x¸Ò¨¨nm&Óà q|G¡-›ÉüèÏÚó‡v×j‡=º\D6<.!Úìm}*õäV±åóúTjÜ ‰3Ì9¢Èì¸è¨@;[â¡H̼¹Ç0è4¼çßèí¼‡ Qõ(U6¼¡ujì6ƒBûu ÌO¶aá¼ìØ{°Š«kûõúôä„‚ŠªÚ¦–‘­0¶%é4š kÃãþýÕFÞ CUCS˜Ÿì— †…ùÉ)¨¨ºcã ƒ|.‹ÅÚ§V ‚ñôÀÛæ'K‰pqt¤R)D"Eo7‚<¦l4›là`+,•²”äÚ'î?Ä^JÒ$¤1(¨¨Ž zdqúò´Ô𯿲Úú¢ªjÜæSq¥C ¿]¦\Ÿáƒc8±ðÇùn}¶úõú‘‹ŠÙÙÜ|m¢R(kçÏ©ªo|î¯·×ØÀibå>•:jõæa‰øpÛØõº£õtýz{ÛÛ¾>F (2l"EQU †a¾ Vd´D(оÜwó" b°,kqUmd FCQÄßK’UPDøuF;øT ÈÓkVøÜ³W¯wööá¾´Y‹¢èí{)6ò „ úTj"TŸ€[m6Ûà-Ñö´^¢ÛûFõI $ß‚ c/”°tÖŒäȰ‹×óëš[ñR­š;¹WÖÙ'WÎEü|ùƒŽÞFfêQ]376n¾ÄˆP•VKž QZS—žœàíæŠë³a­0¶%ñsWÕ^ÊͶ‰˜]PQ(õòru©kiðmíì¾ùsw LŒÕG#X&ݼlaIµübÎIíÀ€Õj Ž s/dÔSà×ÅñËWäÍ× !¿¡s¬Élþô‡½>nøàáþ¾©1ŸîÚ{–³@~[L¹>këîÑèúƒ|¼™ ú¨ƒtÚþ~p〈ũmj‰  íˆ!NA‚}¼‰ 6<.‹É¨jh"¿ûFúOgÓ·vvŒ¦@©9ÑÁÎÖÉAH8ðïh½Êú†Ñä124Œ€>A“ib®£É<`08ÚÙ+-ñ7æ…aØŽ™ÂÊêø°ào J¥R)DÀ–ÑdÂãôGÝK©ÑúJÜ© q íé¡86#l"¡‹ÈáÇãgˆ%]„d²‚¥O­vv´oïî÷+PêUQWO”­W¥ òñ¦Óh#b4™­V+‹Á¸yÑR(d·ŸB­‘z¸"BmØä›È@¿¼òÊ£3‰‹9,ÿžQQ× >x”œ‚·£ƒ-yù.ò[GR;ÉñUC›J€ØÁžEÀe³ˆÙ87Þ%F¶ÂØ–4šLºwŒ^V[o ÷—)Ôjw±9.jŒ&îS©ñsGFQDhcÓÔ>^çÈNà§Pkv¾–:¹&Ãý÷tõ×ò¹bVÓØŒÇz&„‡¬ž7;X&-¬¬¾Ë+yÀ˜òø3«Õzøüe;þÖu·DÚöªú&À‰ÄM3=9!ÄWJÎ|úÊ5Ež^»œH™ŸOŽ Rj´F“9D&¥R†üpîb§u æLQ¥fÇEoX4ï.¿`±ZÏe_—¸:“׳}rÕR É‹sGë<{ ðûÇ7’Çæø\>ª[R-¬˜3“È¢èÓk– nuªŠ£ÐÖÓEL— yƒ¿—'áÌc3™QA7pi­|pÐ’žO4Ã&þ–7·jtýáþ¾áþ²>•šzdõ1 S¨5>7£ÙR¢ÂÉj²¢®žÍdFÝxA_g\_²L ó“ ~ÕO\ð¹Oñ£ÃdjMiœøèèéUkuq¡ADÏ÷rsq!…Ç•Ëë-˼¤8ò¥[ [¡ììíKŽ #w3Â8= ¥ ëíæJlŠà³‘­pGKVÖxºˆƒe·ÜXLÑçMfs¹¼>ÔÏ¿4È®ë1š¸²¾ð™1QDzlH›ÅgÇÏ;¬¿a#¿qØÙðCý|&×¾]}ŠŽžÞäÈðaw<²ÍǾuVÎæöN܆w¥C ÷b}¯öŠ ôÛ°h^d ßõÒ *…âíîâ*-yî÷€–ή«…% á!;ß»¤Fî.v ÷÷Í*(NЏù¶w"ãêâÔä%3S\EŽÕnbѬبë¥å1ÁC#˜V«õàÙ‹ëÎÝùÁÛW‹JøÎüäøkÅe³b£È%‰ \’ðpv,HIÀ#OOee ÌŽ'ÏÚô9¾ye•DôØšùix¶ /ÀÖõ+ña_ì=HžÌ?ŒÏü9!<äo/<›ÖÖÕæ'ópv’7·’ÝTc[ïJañ¾SçÖÌO;ø¯.çè FW'Ç„°5¯¼ÙÞÝSV[w.ûzZ|Ìþ¾Ÿ™_¨ЇúúÈ<Ý»ú;»cÃ-5#À[òêß?Å_gO_¹,óÞ¶auf~Ñ Å’¬Öꈻ­F×øüå•sg¾ºå‘üŠ*Ý€ÞΆï'ñhíêùéÄ<†aEU5 á!ÃV–:•y-ÀÛëÉUKsJÊÛº{ØLf ÔëàÙ‹-]ù•‰!ËÓREB»Î^…¯Ä=ÈÇûjaÉí¢Ž†?Œ—ÎJ)ªª±X¬Õ M=JeB¹85™Ça÷ë ~{[Á8ƒñGRR#/¬¬N‰ w²VÖ7: íB}}Ž^ʼÝ"&þ^«ÕZMš+ZR#/—×ÏIˆuvt¨nhBQ#Ã'²]+.[”šôøòÅõ­m®"G‰«³nàæcyå•É‘akÓÓÄÂ^•:D&P.¯ 0Úºº¢ýq‡ Aˆ¯wTPPÔÁÎvNB  ²®‘ø›A§Qñ`p_‰F\-,ÅÇè#ý–ÌL!Ͻ;ÿ00 ;v9kã¢ùÛ6®É/¯ä²ÙÉQa½}D˜šZ«;žqeÉÌ”—7o(¬¬6MÎŽöL}ב“€}§Îm]¿êµ'6e•*5Z;~÷‡ßì\ÈÉ ð–<µf¾†·›kˆ¯´µ³€Ù w´ä©Ìl_OÇ—/Ê-«hnïbÐibûP?Ÿw¿øŽðGVV‡ùÉfÇE7¶u£KÇhâ>•úrnAjLäSk–WÈë…v‰á!m]Ýã_õº¥³+ÀÛkFt„Fׇu–ËëÃüdO®^V\UÃçrR¢Â»û”øêq“àÇãgžÛ°êµ'½ZXÒ«Rñ¹±“½­à½¯vŒçÖAfáŒD7±¨BÞ ÔhYLFBX°ÑdÂUò]^éÈƽÐg£éé·ßß´dÁ¼Ä¸µéi£©©½cç/lj oýë¿/o^Ÿééâ\Q×ðÔŸß›AÖgV«õåþ¹uýª¹‰±RïÚ¦æßÛæçCè3Àg»÷  éÉ ÎkëêùÏžŸ¯—–Ógž.âE©IÄÏ ï o@MS3q+Ož‘Dù“Ï5ã†?fÏñÓcè³n…òw|÷¥G×͈ŽÀ0,¿¼êÉ?½÷ñk/’C—îh½¾ÝUV[·jîìE3’=½ûN#&O¼õÉ‹«k¤$ÎŒ‰¢R)= åÏg.|µïÐ$† z•ªÿìùyyZêü¤ø~ýÐú´Í#2\),îS«gÆDÎŒ‰¢PP®¿¾µ-ûÖµq¡L¥F{ìRÖÈ…CoG[WϱKYø7ïÁ×§ýæÀ/Ëg§ÎŠÆ0¬ª¡ñ‹½ß~îÉI÷ð]GNÖ·´Å…/HI°X¬J&¯¬²¶ñ¶ß JêZÚÈ |`öÝ¡£³b£¢ƒü$z£±¥£«¡uhNÌ¥Ü|>—äïïí)ojùüÇŸ·m\Cìk±X¿ÜwhyZjBx¾ªê/2^{b‘áðùËÂýeq¡A-]_í?´ %‘A¿9o&ÌOFÌ: í¤$úõBŸ¥ÅÇJÅßËÓßËP\U;ªÞšDþ¼²JAÓ⣗͞ѣTí=q6Ì_æá,&2\ÌÉWªµ©1ó’â,k·BA¬ÖÛÐÚþ¯?ÍOŽOŽ §PP¥ZCŒu6¶uüsçó“âÃý}¹l–¶ 3¿èdæU< bd+ÜÑ’Ã'»~œâëà§7{•êS™ÙäzUÈ F“A'fŒ§‰\ÌTiu aÁËfÏÐM9%åÇ.e?$ë— kÓç,œ‘H£R+êê÷·å•Uò9œÄˆÐUsg÷ªTG.d øÜI볖ήí;öÌKŠ‹ `1º}[wÏÉÌìIªL^oÃãÆ…qÙ¬~½¡±­ã»CÇðÕ‘îòJ‡@0¬© I^ 0lè¸ñ“ ­sïa2èg¿ùwi­|ë;Ak< (òî‹[Of^Å}cQyå± fóàg»÷ÁVxزb±ƒ-îÔ„@¬A‚¹ùà?øO£E`{?C&cMGˆ¥4pž^³‚Åd\Î…ÏŒ‹u9·`Ôxö‡ Š’c•<]ÄnN¢Ú©ü`9l…{ ŸË¹ëÜ*4Áýåèçÿ¸ZXÒØÖA§ÑÂýeÁ2iMcó¡s— eT´ýä/F@nbѪ¹³ +k4º~‘½]rd˜n@ŸUP[á·Ž«È1X&u;AkC ê³ûÌÑ‹™±!A á!t­«WñÃÑ“ß82¹uJ!ß(JµV©ÑΈŽ`3F³¹¢®áøå+c|Ãò[!2Ð/:8àziù¥ëÐÈ„€ñg@ ¿.0þ @ äÁê3@ éÔg@ Ó ¨Ï @¦PŸA L/ >ƒ@ ™^@}@ 2½€ú @ dzõ@ Èôê3@ éÔg@ Ó ¨Ï @¦¿a}öÔêeyû¿—yºÃV„@ ò A}¨j”à-ñ•xÐi´¾ÝµïÔ¹IäÉò´Ô™4ÐÇËÓYŒ¢èÚWÞªki&µ^’°6}ŽÔÝÍbµTȾ=p$·¬âW9rTÿÆEóßÿjG·Bù õ“¤ˆÐ•sgýé³/µý¿ÅòÛp¹oo{rÿéóW Kª 
Zrü¼ûÒ³ÅUµã¹¹Ý=/_ìdo÷þ×ßÃŒŸ‡KŸ­ž7{Ft„n@ß«T;;ÚO:ÏH^Ü´ŽËfõ(”j]¿-Ÿ7}ªüüÆ5›—-T¨5§¯d£šþùŸ^{}ûÎ_Ë}HÝÓEì'ñ¼“g2›§[Ù„›ÿ{fË{V74ÝÇb$G†IÝÝÜÅ"ŸWV[÷í#ä­L}Vl”»ØÉ]ìÄb2Ÿ¿|9·`Œ£M4ÿ”âd/\8#ÑÛÍ•J¥´vvŸÈ¸"onž­p7ÌŽ‹nïo|H.jäaàáÒg;ÿ×®½-]+椾ñäc“Î3’×·ÿ»¶©¥O¥~省sc§I}½\]]º £§÷Ñ×ßVj´G¡í¾óÆS›³‹J †‡¡Ñ=œÅó’â2ó‹Æ¯Ï®–\+.´X¦ºlÞ£É$oj¹¿&𛇢HS[ŸË¹•ÏáÌIˆUj´mÝÝRw·;m¢ù§G;Û—]g±X³ Š FSD€ï3kWü÷§ƒÃ|ÛÓ¤ø˜¼òŠ é³?ö•³Þ›âí ½§‡|ÀÖÎ.@d ÿ$ô™‹ÈaEÚLwg§þýÕ¢¥Z;,Cˆ¯tVl”³£ƒÕj­ki;z1³³·êëóØòE_î;TEzÅ–I·¬XL$rX¬ùÉñAR/‡­Ð×6µ¹˜G€1ôôä„P_.›¥PkrJÊ/ääaØÐ+ùßÿðBVAQGO߬¸({B­9‘qµ¨ª°tVJjL$ào/éÝí;ö´tvI\“#Ã<]œùö€ÁXQWôbV¿^çöôšåë๋‹S“ÝÄ"£Éœ]Tr23›(Š"3c¢b‚…¾Þh*­‘»”E¸'ñÝ\ÌX”šäìèP.¯ÿþðqF“º»ý|æ6@*±h4jGwï…œ<ü¢À!ÇŸÍŽ‹^”šôÞWߥ''ø{ITÖ7î?užìbwvtHOŽ÷vs¥Ñ¨=½g®ä”ÕÖá›<]Ä/nZ÷ýáãž.âp_‡ýá7; ƒñg~^žsbì… ªÖꪛœ¹€ošô•Þ{ŽÞþý"ÑQh;þ#,HIˆ ô;r1ãÏ¿ÅSÒ“þúÂÓä<™yE§³®‘ǰÞzúñe³g|øxKg×ô´þ³kW 6üôË“™W‰2/OKÕ èÇi½ÄðеésJjäÏýõ#½Áˆ'ò¹ƒÑPiµ±Ã-QtÎŽ7'lj–V(°Ù¶aÉl>uÍ<8˜2ì—5UõG/fR©”„°7­ûø»úTêry½Ñd ÷—‘õY¸¿¬_¯¯ilpX¬—7¯·åórJÊ[»º9,f°L*àñ´ý(Š<³v…»ØéZqY{wŸ—ç¢Ô$;þþÓç‰C…øúx»¹¹1`0¤ÅÇ<ºtA[wOBy¥°„J¥$E„}{àˆÁht+€P_‡s­¸TÛ? v° vqtÜþýâI0 [Þ¦% Ng];|þr°Ì{NBlŸJS2äñ}tÉÂ`™wnYåå¼›îæ$údׄÛÀNÀß¼lÑ©Ìì¶îË(24iZæéN¡ u A[¶(X&-—×g•R(¨ÄÅYâêŒ?¼Ç†BAŸY»ÒÉÞ.3¿¨W©ñ•nZ’NÎàê$’y¸—ÖÊj ÃN y~ãš¿Ù9¶,»—DønX4¯©½##¯Ëf¯[0W­Ó‘3¤D…/OKmëê>-Wo09;Úû{yâúÌÓE¼uý*£Ét¥ X¥Õ 6R/\Q©TSÿ-o;&ó ÀM,"Ô¹îhI“ñâ¦ulója‰R£u‹ÎHðyÎ\ÐöÔ6µ„ûû’õY¨ŸŠ¢ø›äØM¼dfrjLde]Cf~¡È^˜âáìôÉΟˆKìv=Ðd6ÿgÏþ'W/«nhÊÈ+ôë :–VP^u½¤œJ¥FøûþöÎ;¾‰#ýÿ³jVs“‹ÜmY²Üe˽l° ¦˜bÀ@àHáB:w—ï%—»Ë/—Kîr¹\Hr9HB -´1½ÜmÜ{ïU.*¶¬fK¿?–Å Ù0aÞ/ý!fW;3ÏÌ~ö™gF[W-›8©e~à…u)·ªjn–X™›­^“sèìEô+k«—7­–Ê®åßR©Õè“Õ§Ï•×7b‡¯Š‹îèíûñüeN‡Á&aciñÜší=}çnæŒOŒ›™»qœÐ¯¤§Ã{9ä×ÇC×gaU\ô°Tö呸tÑÐzTBxˆV«Û{ô–r1;oûêå;,Õ"èÀĤÓ¹QX’ãïé¶0õ‘@ˆ jë‡%~}âôŠEQú×ÞêÅ1€¿=ˆ‰3€ìŽ.¿¼jçÆµëŸ½‘&²ŒÖ.Y`Òés½àĈP ™ôɇ†%€¢ªÚ·wlþ5dÐWÆEç•Ub޽‚òê·^غ8,øÈùË㕠;˱ Wщ$ ™ìÉs¾UY£Õꉑ!f&Æ_?]ÛÜŠ~%·A@€§‡£õ©«7ÐÛOvIù3+—† Ù%彃hf*…òñw·+áÐÙ‹ï½²#XàyöFö X2(–Z»z0÷àBV6Éhëîݲ"ÉÅѾ¡­cÚ²3éô/Gã–:ûúùN¡¾Þ¨>óráú¸¹H?_RsÛ¡[×ÚþʦT¡»+æàaÐhû~JÇü (\NWŸu5 øïZþ-ìÑâß—O[¶Å‘óW *ª¹e;7®Åg¨om¯jlƤgQUíŸßêë}>3w!tA–ÇDö }qè8*GÛ;_Þ´»Ç2WÄF6wví9rÓ+Xå¬OZ¬ÿø»CÒ‘Û’«CÑа-Û‚N¥b~ ®½-À˜Éœ¶î[“ á!FLÆÇßBAn)’H“"îçßËFJjê6,]âhcÕÞs[Uûy¸b¼YšØÌÄ8:Я²¡é»SgÑfê\—âã•]R>»êtº¦Ž.­V'Å/}PªÔï~ñµf|ý˜U\úûßlŽ òŸEŸU×^Î)tôöYš™ÆùŸ¸|í&©‰ñƒbÉ'ßFë?»¤üÅ k–ÇFâõ™lTŽ]ÿ,¸rÙsô$vm˜>HOÿõÞ£!O/}ÿ3[K #&£ª±õèÌGûþ¡!¼Ã\§ÓUÞ{·C$-9áħÿÈ9ôõåo>¿ôõg»ßÞ075yìµL§Rß|v özvÍ €•%j€¿qDCâ¾!ýkÏÝ™#W(jîhšIÔ4·¦gdÙ²-Ž}òÁÛ/l{ç·Û~òL.G+p®¥ðàqj[ÚPq+˜þxòœIDb^Y%‰HD_j¦¡­ƒç`‡f(­­§Pܹ,?…L.­½=7áëæÚÒÙ]{oAЋôä9«ÔjüÅ;§Ô·µc U¡T Š%f&Ƴ”g‚ˆDÔ«goÅž)ÿè˜TÞÙÛß×?¦TV64cïè铎ŒâCã•*uuSËäúär°†óqsÑŒ£wÇIÅ¿»p9cJå­ªj쨬âR|µFƒž -¬td´w`ÐŽ=gêCÂÚÂÜÄÈ0¯¬Ó^Í]Ýý"¼i‰ÄKÙùx-Z"K–©•¹Yvq9&Îðõ–]Rn@¡üfõrG+¶kyl¤‹£=€L"MÛ ÷­I_7~C[‡X*ÃÚº¢¾ A®ƒ ²¡ybbBèîŠf6b2œíl±0ŒYšØÝÙ AüD^AEÕ˜B‰·ðY,pZt:&€ˆº–6{kö,‡”ÖÜéêC&:„Ú²- +kXÁËëÍLŒñJ·¸¦N‹U(UAàÊ›úøñ0z:òäòÐýgL: 0(–<ÈI4ZkwϤÄa©ÿñ¥´µÛV%ߪª9rþ²hX<>1aenöö Û(dòc¯eÕ 51ûØÞÓ·ï§t¦ ý–J ™zÖ“AÃÓ´¼¿çÛ†¶öUq1É1 ¥êfQé‰K×öÿãݹÎmP(t*µpŸˆÿi SSÀÔxpLYÖ·µ)•Bw>êFºó¥#£-]ÝàŽË³¬v`ÚŸ652–Êð÷fT©£Ñu(øÛ3@¥VÓ©ÔYŠcÈ 'E†yòœñ+©”™òKî­.õø8v~K3S:•úÑï_™Öòo>22éÖe˶06db¢ÍÜÔdH"ß ¦F†C)ê†D™äœ&¸ÀOw –)vSÄVÞ8¾nü˜ ?[KK‰ˆ% „™VBàç”U €N£,LM+E­Ä¹ö±ñ;J,•éc%5õÁ¯ÍË“RâcÚ:ª[ÊêêÑ:=yryèú ¤˜Å‰…Þ¶ð#¸÷Þ+S7cß}l"‰ë×µ´½ô·°á5Xà¹@jyH" X·uR":Ý6{¹î[{£r…¹élZ­öÈù+GÎ_ÁRÐí?æ¾Õ“é`ï±Sh˜ƒ51¡-¯kô÷t£ÉâîÌÉ.)Ãü:Óüa ÈŽÔÕ&FÌ+¹…}ƒC¨/íÕÍë ÂÌVª›z¬†$R,T[­ÑŒOúփˑua^¢™ýÓú$8ß‚ ³o”°rQt¤¿oFaqsGzUk—Ä!ª¶'èå±fÉ"ìãÿøÏômIU=­kæÎW3Vßµü[Ù%e¶––Z¶³·_èá pÏ“ZaöšD«¼®ñÆ­âI_ ß‘&%5užÑÝg˜Û`b6 œo>oëªeõMFÆÆ´Zmˆw°ÀsÖ£iíçnæ4uLÞ‚ÿ„¦çXµFóÙÁ£.ŽöèàBwט ¿Ï}ÛÙ@ O]Ÿu‹d£r/.Õ€2í$݈\î<¢a`£(íÁ/üª+A¼]¸XcC&jP×ÚŽöõ÷t_ÈUßÕ'RªÔžp5¯pNW«RkÆ”JK–餫ÅÞ£a^:nÇLim}¨¯·—C&‘H$"°¥R«Ñ8ýiËF\9$"ÁÙf,€»)ÎÎÔÛŒe˶øñÜelK3cd¾‚eH*µ±4ï ÎÉûåÉs®inÁ®mP"ñráRÈä©'Q©5Z­–f`p·Ó‰x·ß°TÆs´C;Û¤Å7þžnEÕµg2²°:)ÿ‘QÓÜ*=yŸ‚¶£Ë¿}þ©c@,X[˜MÝZõºY[˜ã¡¦V ê¦ø¼ññ‰–Îîi[aöšT©Õ£ccƆÌY,¼ª±E3>.tçK¥ÖVø¸¨YšxH"E ;3€˜·÷èëãœjäþnÃRÙ÷§ï„… 
2f‹úåƒS2ªPT7µøò¸g®§ët:_Aüž^!cJ%?u‹¦L­“¡ÑhÓò 7%'±Ñ& Âcº^ÍÉÇ _r¯+ZÒ/364œ¦z‹k£ƒ|xœìâ2ñvgWÔ5 ¨Q8¡VP( MÏŽvOsÌ׈-`Çþ‘N3˜¢½6V‰‘áLG|_ LzÊýÎÚÂŒN¥~ùÎkÞJP¦?Ã~6-cÍ‚¸·½ÐÞ#74V‰Ñ©TþzãR´ÏƒåfH§aÆŸ|øn=•j5€N›>@ ³à‘Ûg¨¡0… }¨#‘ˆ“uEÀÈØ˜™±‘Þ‰æ¸ç9‘¸jþ¼êÆæWþö%öøì-˜uµßX¿zmRüÅôìéï©H5 ì|÷mo.û³öéMõ ÈVlœE5Ð©ÉøðOœ ôô071>‹zÌŒŒŽȤ™êŒ‰À…›Yh˜ÔtÐh´¥Õuþ…L&>Ó-³¨óëLTø£A­+—š^ÉÎï–ö¡>ƒ×Ÿ[…6aõÖÝ_Ȇ#€]ÇN¡q<z‹Ýôbõ®Ñ8ÚØ,‹‹ž¢_L=`ô Èž»¤—ŒÌŸîj­¢ªê†¶v/ËÝÍeN 0&$àð…ËzfúÔúø°ô™)Ü>ëôxrXTÊ„SœC##àŽƒçæŠ?¬ki ööįdDċú;@Ò¨ÕM-˜qððgWç×Ö­\¿(áJvþGßíÆ8TÊŽ?¿íçÁûÇžƒ'¯¤=,éiµº«9ù+çÇr\œ&œÜ|œøð8€NÉÌ\5J•zT¡°6¿g9ˆîõ.PȤ)3ÅâšP_/–™D"‘ˆX`ŠR¥Bãô'wL¡ÄfùÑ@{;+Ëûw÷UªT#cchG¸«Ø–h¿O§]÷›n^\–…²çäY¬gé•?#úär{kËN‰ôaí+Î,*Í,*¥S©¯¯_\TUVÕÆÒÔâ/™¹J­Ö›|€@ž<þL«Õž¾vÓÜÄxÛêåøtì]ÝØX0'}^$ˆÂ¼ÝÙøÌ©Y¹²uU2–2?"¿8@68¤T©½¹lñ¶ÎÙÎÝib¦l[³|ã’iy…ïïüïdÆYLHàÚ¤xÌ¥G!“¿úÓ›žüû½ªÿ°@·ÒHŠŠˆ ¨ijy<;'QÈäè`"îñ—Ït}ý¹U€)Þ£€alÈpu°Ãf:ªê›øLẆJ§R<ïšÎ=}ý]½R‘¿PÏEjÄ cßë[Û‡G„|w!ŸÛ7 Ç/ø(­©e99º»¹àÏEm¦Šº ]O‡2'Ð0ÍÝAÑÍÆðÓè ST@T ß¬…\^W?>®I…bJ{Ãõ0¤Óœíl1  ²¾Q£ÑÄG„à/f2JeŽ6ÖØ…p±·u±·Å²‰›u:]d€KÁGÛ‹÷·ùò¸¦÷¹±§Ö¢—V$ÛZYì>~ •?™ž–ÄÌ õõ:s==¯¬B/s¿|°¼¶ûL§*µº²¾Ñ‡ÇAõ푾ðÇÌØ4Z\X°F£Á^ð€tKûdƒC¡¾^ØY,'G{k«©GQU5‚ ž|/»¬¶›;–ô˺¥}¢_|fL+*ë9.Nèn,èe õñæ:*Ôm|o¿àŽ#@$DþÂYK²¸ª†H$.˜ŽO$ “M4SÈdW;S£ 4“H$àwU(¤²:†Ê¿_>âãe@¹ccaîÁr74?ˆßyˆ<Žý5v?å/à­MŠ÷ðòË«HD"ËÙÁÑÆfÑ+´u÷d—… ½÷öQYm½³­ïžYTá烕p1={a”hQt¤£uQU“ÍÜà€üòÊ ¯Û3˜Z­öä•´5 âöþQvI™1ƒ1_š[Z178_“ /Abd½K%F†¡¡â)™9¹¥€Èáæ¥ µZR­úàå-øsž»„¹(V%Ìóóp/¨£S¯®[ä% ±?zåEüYÿ~Ïn–XV[ßÕ+]G$–óÌÚ 5—Ñyd>Ó­vuSó‘‹Wò?Þy]68ÔÜÑ)²µ´à¹¹„c)WñÛO†¿€·(:[S™š•ëÅe½ºvEFaɸFæë%ƶ¿\¸üÊÚåÛ·lÈ..“ 2\ìl-ÍL± At:]Ium˜Ðàú½æ)¹,æ‹Ëç•UvHzéTª€Í2¦à¹¹Xš™Îz49}íæ²¸èÿÛü\aUõð蘹‰1ÏÍ¥½§¿~Ÿé¦Õjkp³~ò¡á éY‹¢#ßÚ¸¶X\£Pªì­-©tÍoniyˆç+kVVU3èÁ>ž)< • d—…ûùPÈdt O VYßä%U(:z$öÖV>~〣5hijB"ç…zúúËjêÿ¿½óŽâ8ûø³W¤+꽡~êI Ž@!ºhlÀ—„ØÁvòòÚŽS^'¶ÇÁblcS ÓA€D‘„zï¨÷Þ»®ßíûÇJËr*œ$@…ù~ôÑçnnvv÷™™Ýß>ÏÌìÖ•aVf&Äv꤇{©™„ÑÚ‰¯»ËŸ…=½*LQAöÃÒ)».NzÚZ?݈V2V@ÄÖ×/ Î+-—Éäe5uåuõR©ìõÍ듲ói4Œ¼,OùŠ—[Rìíi¤§[R]+•J tuÜíy7âÇ\5ÆPWgÿÎm±éY Óœ€Ãb}ôækùeÍíB‘ØÒÔ˜Xkƒ°ÿ•{ñ{6®}w×öŒÂ‡L#ÈËC$–ÜLx®NVbž‡>ŠÄoýåóëV…ø¾&‰ëš[N]»IføÃáÿ¾·{{ˆ—¥©IqUÍ›þl‰ÏBª>“Ëåïýý«}Û7¯XìlkSQW¿ÿ³ƒ<ê…àÈ™_øBaDÿŽÕáMm_Ÿ½˜QøPAŸYš¯ $¿ºðl\x6P^WOè3bš†­ ôS8‹;)éã½?@ƒË%.ÁÔ >ûöLOŸá8~7%c×úU8Ž+ã»Ru—z¨ÆúzÄWu.‡ÐgB‘ø‡Ë7¼]-LŒÝìyC|AÖÃ’Kwâ¨/!PžÎžÞ¯Ï^Œ Yè7$^Ÿ–º²ZCkÛÁgÃ}¹9±UUù‚¦öŽÛ‰½Õ‡¸d9Ȇ`H 8têgb ÊÅnÎCüÊúÆžþ~Ëñ£ç/¯ p·çùº³zú¢â“b•›?HXþÌèµKƒ¶„‡Òétb}Úï/]‹ Y¶Ø‡X(áèùË Š|R$çæwõõ-]äµt‘7NëªnlJÍ+/¿“­UUC“Âôþ¸ôìž¾E Ã}e2y{w79ݸ®¹õ—è{a~‹Ö, híì>yõ¦·‹£…9¹íå{q|¡ÐÏÃÍÙÖº®¹å¿ç.Q'_½ÿÇÁÓÑÎ×Ý¥¡¥íÛ WVþ071ZüÈÏA|Î-)#$ 1ÛÎÒÜÎÒœzÀ÷Ó2ÇÔ´D~=m-j™ÄYLYŸWÖEb–ªÊd†ÍÄ'[ÓØ\cÚlkn¦ÎåHe²¦¶Ž„¬\2¼èlk#‹«¿€ä•–ÓoÐVø/^¿lIgoï¹[wœl­m­'ÕIÉ(<ñÀðºR,h=àøðŒ|`quˆÙ†}ºßíÄ…U0³Ÿ7·FJ¥Ò&9~1L õÿÚ+'®D{‡@<_ä€a€ÑÃýñˆ¯0Ö füü‘ëýèÄÜ…Ëf?ÈÌAw¹He]Ãè‚Ïb…U‡ˆ¹ÒgÄÜ``ˆ”Šì0Q>ªŽ˜2ÄÐ7O7‘X\ßÒ† ‚˜óM™@ s&ƒ¹%<”É`üpùÆÓZ›˜Aÿ @ s‘Xü»/#; æ Ȇ@ 1»@ú @ bvô@ ijfr/¤QßT@ x&LF¡!ÿ@ Äìé3@ ˆg 6Ù hÓÛ@ 1ši‰*~†@ 1«@ñM@ ˆgöÄ„ÑÐ&•@ ñ$°QÈoJé-ÚŠE @<_FôÙ° Ã0À°9!ÐÞܲ!ëÂI;Kó¹ký½›Öe]8édcE|õp°ËºprçºÔ.˜Ã K* ¦*©”’Ÿ)€ÏC/š¯»K—‡“•½•… “ùÅñÓ¿Dߣf`2þžna~>ζÖFzº|¡° ¬òÄ•¨‚òÊ ŠÕPã†úú,]äecn¦£©ÑÕÛ—šWxüÒõÖήÙi+3“ˆ g[kGkK 5nlzÖ/Œ™ÓÂÄhï¦õ>.ŽšêjÝ}ý…åUß^¸RÓØ<Íðvq|yÍÊÏ¿=ÑÞÝ3Ÿ˜±¾Þ½;O]»•[R6GOáã_í©oi=uíÖœ8ÚÀ…î›V,ûÓ‘cC|øôÝ_ç—V(tê9ĶU+œm­ÿøï£ó©SÀk‘kôt>ÿî$ꃈ‰Çu™bXsä36®ÒbŒäÃmB|Äe€Ñ癵¶„‡.ñY8Ètöô™èÎàãêô¯û%RéÃÊêÒê:½`oÏÀ…îÿûèäôñŠÝ½~õî «†ø…å•¡½•ydXH¨¯ÏžÿZÛÔ2 íàçáºgãZ±DÒÔÖ¡¡Æ Û¿ìgÐùeå9%eÚê¾î.qYÓ×gsu.'ÀÓ=¯´|êl]-͵çèùËe5u3xŽÖ–î< cC]‘XòáW_S5Ö×óqq´·²ÐÓÖEmíщ©míÜÇÅÉ…gc¨§8´w÷Ägdç—UÌ›O|²l–jx€¯›=OƒËä ª›Îݺ+–H¨yþ´ooVQÉ­„”9Ú/,M¬,cӳΠ@Œœ„Iù½ÃÛâ@ù€ —m¾é³Wo>}¾¡µmãòßxut†þÁ¡Ã§Ï]‹Mè"R–-öþÇïÞþß½»bÓ²¤2Ù˜ÅÖ6·|t蛸ôl‰T 4íÝ]Ûv¬ÿí+/½ÿC³ÐIÙù¹Åeu fFý}Ì<ú:ÚŸ¿÷›žþwþöeuc‘H§ÑTUT^.¥¡Æ ôméèT^ŸµvvýÏ?ÿ-“ËŸõ±9ÙX‰Äâʺ†™5‘«“«mck{ÿàÐ膱*ØßÎÒ¼¸ª¦°¼JÃövq|o÷öc¿\)¯­³´…NkB+ëSr hÍÝ÷j䚘¤´è¤ÔÙÐ&>YPçrÞÞ±EWK³ ¼²£»—ÃfYš«ª0©:ÆX_O[C£¸ªfîö ãð@ßÄì<åõÙ©ë7ŸÛBNÏ­"A7ÇdÊJ4Ås†¨4p™èŒ {ñ~Z&YH}Kë`çº5û£Cßâ dr9_(D]oYˆ 
Qãp¾:õsS[Çx…8ÙXñ…ºæÖ?ç‰Lö¬*†aLƒªŸiD &Õ<‡ÿa”à&öè—'2jüÕ…&•ó)8KTU˜Çþò¡ Ϧ¼¶þBÌ=&ƒÁ³0em„òÃDh4ÚW¼çåìS\š[Rnfdpø£÷†,rsÞ¼"4³¨8ëa ƒN÷t´ÿí+/™ýíèÓ<þ!Ä©ò›He2±D2ÿ?_(LÌÎ õõ1ÔÕiëê&ÓÃ| &)L1ÔÕùþ“?”×Öß|ìåì°mÕrhíìzcËú¤œü¢Êê0?ŸÏßÛ·íwS•ÖY¶Ø»«·/-¿ÈßÓÍÙÖZ$ç•–”UNítL õ7†-571â Rò zú2˜èGùÙ,0c2-w’Ó ¡ìnÏ{5rͱ_®”V×’™]íl÷l\K&rÙì•A~.¶Öê\Î _PQ×p=.ÄRU‰òw·ç©qØÝ}ýécÓ³p|ø™äŸ¿ÿmRN^KG×2_o=-Íî¾þ[ )y¥åàlkýúæõðjä"óõ¸„¸ôl+3“ /KS .‡/WU߈Kˆ< c_ÞÚÉe³/ß‹[´ÀØP$–¤æÜNL%Üìm—-ö61ЗËåU M7âIw]¨¯ÏšÀ/ŽŸŽòãY˜Ke2bd’ “ik¾àâûd!.<›¥‹¼L õ ½»'!+7«¨~ÿÚË=ýÇ/]'sîXngaþ—¯¿îöä·ØÍ…ÍR­kn¹|7žZ)ÚêË|½í,,´5Õe2yMcÓÍ„dªøèœ ÒK(U-MííºÚãåWxlK$å•¡¾>:š-“joê\Îÿ½ýflzVT|™Èe³?yç͸Œì¨ø$KSãý;·¿tÝÌÐÀ×Ý…ÍR­¨køùæHä¿ÈÕ™ÍR-­®=wë.ù@2ñÉjª©y8ØÝNLijëPa2q'¼é 8ÛZWÖ `¡“Cx€¯Ž¦Fgoïí±"žã5M+–úº¹þñÈQRþ@dXˆ¿‡™h¬¯·2ÐÏÖÂL…Éìîí+¬¨"­¡¯£µ6$ÈÖ|BwÂ2?^¾±ÀØÐÇʼnËf7¶µ_¼sŸ¨÷õË‚CyÀßöÿŠÈðÄÙ†Ö6'oGc}=Kµw`0£ðáÝ” ò4ÆŸMÐïF.øØÒEÞ‹\uµ4"qayeT|YÄæ]½}A^ºZšWî=HÎÍ'·UèƒÄÀ…g£Áå ÅâæöŽ˜¤4òJè`màkj¨/“É«£â“zßgßþäïhm…aPR]{!ú>z@E(FNµ3¸ùäÁgÀQc8à£\h¸¤R`0¦yœomÝè³9sðäY2q‚ËôhVû{9;\Køä›ãDJDÿ_û5ObV^LRUýá­×6„.9yõfCkÛ”^O[ËÇÅ©¡µMŸI°·'‡Åº›’1»Å$§…ù-Z°øôõÛ#U‰-÷_ÔÞÝC½–-tr8xâìÙ›1 Âdž?øéÆå!ýƒC¯ø3qâщ©ßüéÀ¶UË?ûö„’»ÖTW3ÒÓÍ**9ôÁ{þžndz\zöÿw²ºSWKóí[ÅILRšD* ðtSxÀ576zûå-Ý}ý÷Ó2Eb1!¿N^½™_Vñ°²Z${:ÚQõ™§£Ý@P^[\6û½ÝÛµ5ÔÓ 6¶µsÙ,W;[-uõ!>†ýê¥æÆFiùEÍíÖ–kBu45.Ä×­ð_ŒaXBV®¦ºZ—ÇŽ5á½ýfF2s´5ÔºoŸŠQfv–æ†uõö¿ýò›f8Ž×5·^ºKoÇa±,M³ó†Çž·s]DCkÛµØjÎöÕá½v‚æ‘[R¸ÐÕg›YTL^Üíy%Õ5„8³45Þ·}³H,NÎÉïÔÕÒt¶µ&¬¡­¡þî®í`‰Ùy|¡ÐÇÅqÏÆµ?݈Î).%w½!,¤ª¾ñôõÛLtÉžë>=öƒ\Ž'ç0ôÀ…Ç/]ŠDÐÞÝ ^î]½}qYb±ÔÎÒ<"ÈŸËf_¹?ž¹Æëwį»Ö­vµ³É,*y•£¥®äå¹ÀÈðÐéŸI?œ§£½@$º˜Ú78HU¨£y)b¹³­õƒÌœö®›eijlb Gè3žÍžkÛººo'¦¨0™A^ûwn;xòLGw/¥÷EfÇgäéén\r&*z^ÉÄSçQH£8ÏF~jJÁx¼TÂ…† OÞÄqKA"sÊN¶!tIw_ÿ7ç.RÓÛ»&1q/<ÀW.Ç¿B¦D'¥îÙ¸ÖÊÌ„L!/m*L¦‡aX|FNdXˆ—³Ã”õ†ýyßëª*ÌC§Î)¿•Ž¦Æ½;ù‚ï.\N-'ç òᾤ>s³³5Ö×;}ý6U 4·wœ»}—ø,–HâÒ³w­_uõþò¬3 v÷õÛZ,P~×Úê°ÐÉa`hèÀ—GÒ ëëþïë»–.öÚ·}Ó¤¬+ýT˜ ò—UTòÑ[¯R3l]ÖÙÓ{ðÄYB·%åäÿzÛ¦µKƒòË*¤2Yay•›ï—Û÷ˆpž “élkYX,—ã°2ÈWWKóÛ WKFôÊÝ” ÃÀÛÙÉÂÄøÊ½ø„¬\¢Ø]ëWù{º%åä“þ–ŠÊ—?þ$ŠàLTô'ï¼µØÍ9*>iH ØÒÑIkßNL¥ÆÚj›Zv®‹àY,/f§ÆáüçÌ…ª†Fhhm³³4÷óp%ô™:—³>tIj^!éHNÏøá›»—û/>wëYB~Y…BÝÉÆª±µ:šêjë–U54=w™T½˜r½UWK3ÀÓ-«¨„¼åô…ú’ÊjêŠ*ªÈÆ–UTòÁ»ý<\§6¶ÝÍÎV_GûæƒdåáB'û²šºII:’œâÒm«VX˜‘ÁÄ…Nöm]ÝTÿƒA'[{{Ö5·>}ŽhZL&ÓÛÙáBôý1=a èëhÀ¦K[ÛO_¿¥Îå®Xü›[¾8þHÎ:X[à8^ZSKÔѺeÁí]=G~ú…(¿²¾á7;¶ ò‡}±7šÆæžþ~OG;RŸÙ,0ÕTW˹?¬>_ŠX.‘J¿üñ éö#¥jD°?[Uõ_'ΦHÍ+<°wç†Ð%ùeå¤êêí#[Å€·¶FÚYZ”V×vöô*5ͤÛ¾ùùÙ/’só·® ó÷p»•2^`z¼~.<wÞéë·rŠ‡Ï¥´¦î—·z:Ú“O6ë_'Î(Ó0­-ã3²Éû`d”†a‘a!½‡OŸ#^~iŽ;×, úñÊ ró¬‡%Ä̰ú–V]í¥‹¼.Þ‰ ÑvÄìeX„a”ñg Ë–)«¦h”↛íc;\b!H% —MaŒ§©¾†·¨¢jâ§œ‰áY,hëꢆùp/|| †a;Ö„_<ô÷ä3ßÝùþHÌwÿ>üÑû §­5åý~ðún?×S×n=ÈÌQr‹uèÃ÷õ´´þtäé*ç°Xöî$ÿönZ§LQb‰$.=ËÁÚÒÜØˆH!î1ÉiÔl•õÔ1v½½  :{zõ´4•?q4öÕ©ŸcÓ³†‚ÊúÆÿùç¾P¸yE(s’.U'[«’êZò©tH  zwô´µL õ3 ‹€A§ùeºZššjj[RÆRUqY%ÎÙÖZ…ÉÌ-ö z8ØW74•<>àšDX6%·€LKÏrÁ9(«­#n Š:{zu'4yiÆ0ŒA§^½F†ãåä qFÐÐÒF–ïlkÍ ÓSó ɳK$åµõ¶æfÔÈð#{ÚX‘Î3g[k:“”FuIŽçÌ]†añ™Ùd !d©-(Š8Ù¾Á–ŽN3Cƒ)t%]-Í—V-ojëˆKÏR&?ƒNß½a5ŽãïÄN­ó–WÉd2OG{⫆×ÚÌTaPDZ~i·ê†&HÍ+$Ä‘B§Óµ4Ô”Ù1c``ˆôüåœâ²™9?\ºÁRU!¢deÕ45íÍHOWGS#%¯€•õÍíÔÌ4ÇóJÊí­,#x:Ú‹%¢aèhéé&eçSc²d«p²±ª¨k uªX"IÎÍWçr¨5K5Tckèj*Õ/ètƒN/®ªa0èFz:ã埠ßy8Øñ…ÂÂò*òÄë›[ûmÍ=a–×Ö+©Ú…"±Í3MuÅJ$쟚WHÞ˜Z;»Jªk¬-¨7¹ÅT;´Óh4-u¥Úâ…ƒ`´56Z™ž¼ù¤i ¯yˆ-Ý 8 ;Ïpò?>œ¨ð‰/ ,j{ÄÌ€ã àËuÅAÌ2˜*Oe(ölD&‘ÕðØp&Öâ«.Œü=æB#%Ú#G˜RƒÏðóGFÖ?ÃpÀ10 pœâÀpì±åjçãÒµÖzÝ1Èå Ü$¨"f‰pTTçá½`޼.hæŒ3Fê¨o}”°æ4Äã±Ý(J4|$ÄIQiðâÕ&FC xÞȤ  §#Äìh2 @•tƼ:/LŽºØøÆQâÖQF•QEØhwÀdÅÓãëkŒ–h@ñ–ç6›÷‘Ô|ˆçŒT "!êzˆÙ…H*ªó*Ö‰ÑP/ß8[†²ž5ÖI¾Í\ÁgF-M9›3(êj´Dƒá0–8Ã;ȸ’ÄSE"±E]³²qŠ—ƒ*{žœ†¡Ž6‘qÆùaÔDZƢMOœÁcï.ž”hÔ˜&N)ùUxÚHD ¡11ˆÙ‹T ò!Pe}î¿?{‚q“ÆÊ—ԕÏÈüSg øþÍGÅ6)* `d"ç‹'·‘>C ž"!HĨÇ!f;8B>¨²æ|¬ÃPwß8J=§Š"llÚ”`PJÄLjuˆJþ:ÖžÆ6ŸF¥¡ùijG82)º[ æ bÈå Ê™Ã§€æLdå-3j,Œg“·óãïÀpEGŒøÒÈÉ›À2Žq¸/\%!ˆ)!™ …Zs ©ä Ê³±NßœØ8Êÿ8Ž,NœŠ‘Š;ÀF¯=KõŸQÀ_˜JE­x¦ .G 1'Áq‚*˜Ì¹wðh~ÀDÆ™äÏãŧêâaŒ±KÅXç‡ò‚,êÚ/ñŒÀa¨pÔËs‘äÒ¹ëœÞ©ù¦Ü/d›–mã_ø”Ž~^Õj¾Ä3—Ã`?2Ãt¡Ñ@• 
B>àè-X3ŠDòA`q€6‡bèî6m°™Ù–ìÙOª9ò5쨮ÄÓC&Câì)À`› &°¸@C3™fA«‰YâÞâ”X%_UA›€ÑÄôI@ÀGžé颢 *¬áÏ Ð8 ‚\† 3È€Ë@e.¬a‹¡Ñômø¬ 8…W‰½xu‰š/ñ‘ˆ@(@f˜4¨ŒZ|‹Î6 „I‘…f±d²¹ëDw· l3ÃÆAþpñ ‘8›.D4sÌ•Qit`s€1¿Þá=GA±NÄôÀðê"d…'ÀB6@ ž,μz·ôŒ èIÃæêr/l•ÍtpÔPÍ@ïÌîÿÿrzµ¥Â ¦IEND®B`‚cccl-2.5.0/.devcontainer/img/debug.png000066400000000000000000000034121463375617100175460ustar00rootroot00000000000000‰PNG  IHDRqœ6ÇiCCPkCGColorSpaceGenericRGB8U]hU>›¹³+$΃Ԧ¦’þ5”´lRÑ„Úèþe³mÜ,“l´AÉìÝi&3ãü¤i)>AÁ¨à“àÿ[Á'!j«í‹-¢´P¢ƒ(øÐúG¡Ò ë¹3³»“¸k½ËÜùæœï~çÞsîÞ ¸,[–Þ%,®-åÓâ³ÇæÄÄ:tÁ}Ð }Ð-+Ž•*•&ã¿Úíï ÆÞ×ö·÷ÿgë®PGˆÝ…ج8Ê"âeþŲ]€AûÈ ×bø Ä;lœ âõWžð²Ï™‘2ˆ_E,(ªŒþÄÛˆç#öZsðÛŽ<5¨­)"ËEÉ6«šN#Ó½ƒû¶EÝkÄÛƒO³0}߸ö—*r–ᇟUäÜtˆ¯.i³Åÿe¹i ñ#]»¼…r ñ>ÄcU{¼èt©ª7ÑÀ+§Ô™g߃xuÁ<ÊÆîDüµ1_œ u~Rœ æàâ*-°z÷#°Mi*ˆËWh6Çòˆø¸æf}î-gi:×Ð9¥fŠA,î‹ãòV§>ÄW©ž—Bý_-·Æ%=†^œ tÈ0uüõúvW™â’9 Œ%/VµñBÈWµ'¤_¶tâÜÈMÛ“ÊŸ¿ŸåP““í\>ĘÉ@Á„yì0`D i|[`£§ èh¡è¥h¡øÕàìßÂ)ùþ·TjþÈëèÑ0B¦ÿ#ðЪÂïhU#¼ ~yh«uÐ fp#Ô1I/I’ƒø"“ä0!£ ’'ÉSdŒdÑ:J5Ç–"sdó¹ÑÔy#RŸ7‹¼‹èwAÆþgd˜à´ÏÅJŸ7ØÏØÏkÊ•×n^:}nW‹»FVŸ»Ösét$gj-tÈÚÔrÏÿÂ_ç×°_ç7Z þ~ëÛV·5ë4ÌV }ºo[ÄGó=Nd>¥-Ula³£¢Y5VúË}¹x»g[üä÷É?’kÉ÷’&ãÞä>áÎsŸrŸq߀È]à.r_r_qsŸGjÔyï4k± æi—QÜŸBZØ-<(d…=ÂÃÂdKO膄 a/zv7«]»ǰod«}¬€©sìn¬³Öá?TF–'|¦ãï3Nnã„#I?"…m»z„íõ¦v~K=Ú¯Æsñl<b|_|4>?Âpƒß‹¾QìñÔré²ËâŒi´µšêŠÃÉäãb ¯2* åÀ (ëºè»Ѧµ—hå°{28ÂoIþýÛy¥esŸ8ü';÷Z¶9à¬ÐûhË6€gã½ï¬>¦xöRx'Äbß8ÕƒÃÁWOÏ«ëõ[xn%ÞØ|½^ÿûýz}óÔ߸ ÿÿ%x ÅcÖËleXIfMM*>F(‡iN“ú­ìþÿ  B÷ü pHYs ò ÇŸc5§*IDATH cäI>ðŸÎ€‰Îö­c!ÇRoS1o}a†/?ÿ1,<üœáêƒO$C²Oó½"ÌÅzÎpìöG†¹I æ‚$YJЧJÜ î†" Ó·?dàdcbÈw•bÐ)?Åðíç°Eï¿þa¨ö“gð»ñž‡ƒ…áûï¿ ÿâO&}úèÍ7^†"?Emy>†£w>3p±31ˆð±ñõGŸTÅ8$9–çé´äRFbR/ #ÃÙs†7_~1üÃá aV†m—ß3Ô,»E0¨ /È„@›˜aâÜr§zŠ| éÎÒ8å‘%/H±”0'ÃíW?õa°ï<û #Å!ŽM€(Kĸî½Æoé7`öáÆ51¯*V`\‚RäÕ‡ŸLåyðš§ ÎÉpýåw°z&¼¦20à•þýç?×ÞÐé‡_ŒUøqZf%ɰàà °úÿp*KàµY놳¯j˜™‘…ÁlIAv†x+1†Wß`Èa ÊRP—yÉ1üú|~†7*’\ ›JôÞûËc'ÆK#tãP*…@™~ññ— «€ål(0O6™0\{þA‹™›™!qö †û/¾Õ-Ø÷‡Ia¢ Ce~†ów?Âu1ÃGYœ‹áó÷¿ /?ü„‹ƒBà+0 DYJÈR削SR %¤~äX M £f u)ÂIEND®B`‚cccl-2.5.0/.devcontainer/img/github_auth.png000066400000000000000000001525211463375617100207710ustar00rootroot00000000000000‰PNG  IHDRLvûºšgAMA± üa cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“caNv@æ”çÂQ€IDATxÚìÝw|SUûð§I(]éÞ;mº÷¦“F¡€¬xTPÔWeˆŠ/ð¢¢ü@DEÁ2KBK¡{ï‘îÝÒ‘6mÒþþ¸Ciso“´”ð|?ý#½9¹÷œçŒÜ“»ÀÉB!„BH^Ð0!„B!y‚“„B!„\ÁIB!„BH®à$!„B!$Wp’ƒB!„’+8ÉA!„BÉœä „B!„ä NrB!„Br'9!„B!¹‚“„B!„\ÁIB!„BH®à$!„B!$Wp’ƒB!„’+8ÉA!„BÉÆK¿Ú´.ÈÝYøo^çß;v?o¡ pqüúõ¢K^Þ¹7«¤ Íøäeo³{ýšŠº†UŸþã Oq07Ð?¿gGZAñš]_À$g‡_^zãAÊ׿^³8H–iø»8n]µìZbòþ_~¥M|÷þF/[á¿Ãoã¼EQ Ôó9>œùôC¾ð?;ŸŸ"?WÕ-¦˜{7Æ`´vI9ˆ1ļ÷Ó•nšÚ=‡‘­¬o<ü{4ñ:ÐÍÉÉÊr”6dn Ï ÓKkjG¯,c°‰§nf€¯&SM“©æaËN+,~v "eeÉM„½ÝàVJ:ñ¯¶:ÓHW‡F£e$˃4´˜júÚZ £·‰ûY¹µÍ-àeoc¤«óŒ¶(Š’I)ž¹”mfRÛÔü,öz©ÄAL1)öîç¤=Èd£—‡1íŸ]Rbâ&9'/ßhnïxrùä`o/ÝA «kºÎüZ.ü×ËCÛÙIKSS±¯¯¿º†{?±¡¡‘'š>8ÐÀƆÉT›Àëíomᥥ·· ßuwÓ²·ÓÐÕQb0h]]}•Uܤ䦿ÞXF¶²¡ñð…¿&9šLUÉ&9/„Ì 4Õ×Sž¨ØØö(£°ôìÍ;Ù¥ÿ*ØÝùËMëÎÅÆ}~âÌ(dT7±éÅÓü¼´Ô™ :½›Ç«nhº—‘}àìcßî¦gGMªkjy¦¿Ò¤¯,ùˆƒ¨0ow¸’ñã YƹãÑ׉»×¯³$-JúRŒÁXdêçäë@L1)öîç¤=H¿ ìþÏ(†Ÿ™¨HŸ`v”©­Æ_`ÐØÖê–jç~/¯ªî&¾üokm%aU†©©jJjÓí¸zˆ3rwÓ®M]]ÑÉQÑ–­~ælYCcÏ3ÜÝëWGøxvóx©ùE`ª¯;3з¼¶Nt’ó¬³13ÑÓÒŒÏÈnïâ2UTœ¬,VΚ>ÍÏëåOþ×ü¨},sr;5cÒª Ø¥å,ÚZŽ,‹üòŠšþ–#Ã8HœlQX „ÕÅDèi‘d’£8‘í}wîÔ v÷ð‰v6Lb†ÓÒÂËÈjÑÔTtwÑf0hÓ¦˜;Q “|u‰NEegn^›šê_]EEº§‡NrJsg?9¥ÉÖFÃélnåõöö;ØkªLP¤ùùê^Š®zV"ë`iáãYQ×ðʧÿkíè$šèéT7Êá~Ò׿ü^Rý×1\bj·jväÞŸ~ņ¤æå¦   ep ¾µ5S©¨ ÝÝü––Þ̬Ö! 
%–±!ÜMÏÎp@t†³qÉüå3¦¯„‡,¾µéÃú–V÷v_5]_[SCM•×Û—Ï©8}ã~V®0¥‰žN¸·‡·ƒ­™¾¶:SIQ‘Ë멨kX¹s/ÅMfønY¾ˆSWO|PûÏüáãéëdGü»tzøæ¥ ÷Ÿ9òJŒ0Mò‰ƒµMͳßÞ 6f&Ë#§Øš›èjj0UUºº{Ò ‹ÿ]PAuBKz…¥ô›F{éô?'{CmhíèÌ-å¼{àˆ0ÁÂðày¡A–FÜ^bvÞÑ?.sê(æAúö «8LóóZ4e²½…ù p*O^޹“–I±ÉɼE…{{À­äÁç‰9YYzï-3Å ŒœRΑ ѩţÔ$È•ÎKTzˆ‡«¡ŽÖÀÀ@kGguCÓá Ñ)ùEÿŒQî.D^o_FQé¡sGÚh¥!Ûk—“Oîd1‡"1¢X 1uAq _ ñ‡È保Å¥e ?²jv亅s»øýÅ«ã°vþì>:êõ-m)ù…O&®w[êŸÛ½£©íQä[ÿMâîòŦµ7“Rß;p‡Á‘ƒÃ!ýZ$ƒ¬ú&é$†µ‰ÑªÙ‘vf†:Ú :½¥½=!3÷Ø¥«ƒŽx“öné»Ù—ˆ¸Cº ™ìªQé8ÒöÒôÍ׬ðr°ÕbªU76ÿ|í&ñ3}7÷ýÅ+î$PI@Š]tI&9ªª ›Ëò]c#(.é f8‘Ùêë£&ÆÊ•U]0 tº‚š*ƒHCœÿ6Ð?ÐöhðU7jª ï¿®ÿ©¨ì¢’=í‰övšÄë @ôÌ7Wí–ÖÞ”ÔfX¹ÂZG[Iø“©Èd*Ö7ôÈj’CÌmœ‡¿’'³¸ô\lœ¡Žv ›siumZÁ?ʰëª*)õôg•ôðz u´½ì¼ìm7}yè^F6‘`~XðKQÓúûûËkë³KÊx}}jªŠŒ Ô7AxunSUÅÙšµ ,øÜ­»ÒœàT&*QLocfä×Éí.«©ãöô˜èMötósrX±c7Å+üâÒ2ËkëÀÏÉÁÔ@o46“=\?[»Ji¢b{WWaEU—«£¡îio#LðÙë/O÷÷éäv§ëh¨Ïð ñpÙð¿™ÅeTò }{IÞ_±daDH'·;%¿A§{ÚÛìÛøú§Ç~Žb⛜l[”SÍÍÆŠS[ÿd5yرۻºrJ9šjª>Žvžvì-_ŽKÏ’y{,T:ïñm[œ­YUõwÓ³úø]Mu;s3šÂc7303ÐãÔÕg•šè¹;{ØY/ßößʆF¤‘” ñ‡"1¢R ñuA¥o’–B|ljÏÌñr°õu´äxÚ± 1;bŽ|°™h“©ÅZLµy¡AƒˆéÝœº†âÊj¶™‰¹©è>G°‡ ˆ\†Ãàˆ†A‰‘ÆA&}“Ê$†…Q×9¥å|€mf2740ÜÇý=ßä•WPìÒw=Râ é&d²«FÚq¤í¥ì›QA“Ú::c¦:YY~´jyŸ3)ÍÝÖú£UËËjê2ŠJIÆJ×#ÄFJ’IŽš,,Ô^ZnýèQ_EUgjZ ñS1A‘ =úzJ‹X–s:ÿ¼RÕÓÃWRbhh(@~A›•“F£-Ydy7¾¡£³ÏÍU ò uqÿTØã·7ÈÏo»_O=“ý‚k7ª½½tôô”ÛÛ{ï%4è««+Z˜«¦¤6;Úk3œººî¬ìÖÐPW44P®«çÊjºŸ•›^XìnËþrãگΜ{ò‡ÉØäôØäô`wç@7ç´‚¢!à¼t/ñÒ½Dá¿Q~¿úÒ‹Ó„=‡ðñÑ“Ññžü8•Mª›ˆ²Jêý'â‹™©»p'þ«3ç‰×[–ÿkñÔ°QSwù‘ÊgO]‹%^ì^¿ZÌp/Í&Œuuv­[5ÁØwêìéë·žLè7Ýß§€S¹lÛçÄ’B¶¾²|ç«+ç¾»J¤oÒÇaŠçˆ¼2ÎÆ/7q°4?úáæK¿ÝÅ79Ù¶¨P/7:>äybwÓ²6}uH4[_Y>uÃ{2o’å´²YÎÖ¬ôÂâÕŸ}!&§¯ßúòô9âõG//›¸|FĘ]ùJ1’Ò ØqH‰ i)Hë‚´oR/Åp'>#ûÍÅó\àÔYáB6«­£3§”C%/N ó°c‹>ïÁ•Í:¶u‹0iï¾™”Æ63 ts~l’ãî®ÜOÂaP‚aPb¤q¾oR‚H‰Frâ¹/EMûlíªùïí Ò¤ïzTˆo0¤›¾M’véG{™ôÍ”üÂíG~´61úe×ÖKwïï:~ÚÉÊâÄö÷½=ˆ9 i*»¬bºé &Inª¤ÄEEº®®’µ53l²Ñk«muu& ‰4ÝÝ|–¥ÚD%º­­:ôô`¢" ò Ú@Cc⬙f/.²š0^RÒqùZµ˜jë(ii*RÏdÿÀ@nþ£Ü¼G  äå?Ê/xª* PVþ+ŸÕ5ÜÌìÖ¬ì¶{ ¿ýÎ)*îá8µú³/®$< rwþí¿Û¿~ûG)Wÿ ‡×kad0hùÀÀ€”kÞúÝñƒ¿]|ï›#rò¥YO¨§ÛÎ×V ütõ¦Ä+9}Ül¬`ÔŒt‹§†NTTü5æÎp#Ô‚°`Øÿw×€?âÒ ŠM ô]F©˜Ãµ‰ã°dZ(ìüþ'ámóÊ+®ÞORSQT Ò&'“æå·’ÓŸ|‹SW/‡¤Ü-u&q£gÙ¶YåaPeõöõ€š²²øŒ‰ÆùØ¥«àȲ9"AÇ‘y (Ö…LJ1\Ç)©®-¯©37Ô'Îsk–ª²òCÊÝ'*Є;I0èPiïŽIJ…¿ï–Nðv°ÕÕÔHÈÌÁaP²ap<“¾Ù?é›_/ä–rÌ õ‰½Òö0yƒ3ÒMvé1öMâújuUU ~a1ÒÑQê¡ÔõH1 Hr$çð÷ ©:ÁÐPÉÅY[SSQMmÂÌH“O•Š&++ëðôÐ)çW¤v¸·;qŽ·ƒ-Œä\53“>>_ôª°AH{wYM]iu­½¥¹‰žnucLóó€?ÿþi‡Á ƒO·ëɪٴ÷2²­,\ج„¬\Òö }×£’É1Ø¿ ÒŽCqSL™÷MÑgöÐéC2ÁHC=¨ë‘bd’ÓÑÁ€ÖÖÞŠª®‡ÉÍk^±UgNÐÓSÖÑVäñþ:ßLE…ÑÐÄÞ?@Y‰¼Þ~˜alf¦Ú×Ûæ×Ò––^wWí€Iz**Œ9³MOž*miíý«ðM¼†&j"íßÂÈ@ø«^e}C\Z–d‡ÔH%A]Hßý¹’ððµy³føû“œIÎ0’INoûØn*½;æaê«ó¢¦Oò&~¡˜êëõ¨³KøÃƒ#ŸnדU³i)Z[…M…J{²ë‘fr ö£H7AÚq(bbŠ9J}S¶q íz¤ƒ˜Ò¯¢¶†«n§jjš[ºúzû'(Òôõþ 4Ô'LT¢@[[/88h@aq{KK/¤g¶Ô7t/]bÅ`е†¼ð¦¦–kjª Lµ‘e¸§GÐÔÔÓÝ-xò­rNבcEV,5¶º [}âDº­†šê„3gË`t|xèkã@7gS}ݪ†&baÿ°Ç»O 57Ô¾÷`û‘Â…1öLTÙÕb6!+Ç.^%Ž`ýpóœ€ßoÇ ŸÄíé%EE}£wî@'·´ÕÕë[Ú†LÀç 4ÕÔ-ÔÑP€n·L*‹z{8|¾ ›Ç [÷Œ'FíÆÍúZZO†Zúö q(VÖùÛ÷Îß¾çÊfÍ š4'$àÿ¶løôØ)Ñsš¥'ýý¥‰dèÚ·…¯‡¼»š¬:Žô¥ ­ 1}S&¥¨nl~˜“ïçìàȲÈ-ã¸ÚXå•qZÛ(~\ h1Õ¤ìÝÑñ‰kæÎœàsìÒÕ铼™ª*§DN<ÆaPV]â×"•ïn)G9Òf?Ò„øUžh*Ûƒ4]4“ é&¤i“¤Gú%Û¾YÕÐ(~J6déwYI1 ÈàÇr]½¿æš]]|¨©å€µ•ºšê_W—¿N׫®áªªÐ (NøçX—p¢¢2ôcFÍÌþ:ÚÕÙ5²3ÊŠŠ;NüTòë¹òá”–u^»QsàP~}=LLT45&À¨©mjmõÍwp¹ 6ÔLÚD_n§¦K¹Q1›ÒVg®šáí!å¶>?~Z è÷ß‹…KšÚÚ`TÏjïꀑþÔMqwÎy¡Ã%È+¯`0辎v¢ ý] »¤\&•E½=H‡¼ò ¦ªÊ RHFúåïâØÖÑIñ'="ÔYŸ¿+}{8#ê¼™Åe»ŽŸ^±c7F[¿h.Œ¹Ž®nPž¨8ö=KVGVヘºÓ7eUг7ã`AXP„·ÇãÆÃÔE’N§Oöp•¦wW76'f籌Üm¬†‡ üv3‡A™£øµHå»›´˜¤½[¶C£TTŽ´=HÖõHQl0¤›¦M’véɶoÎݲ}ïOgÅlnÈÒï²’bñ$ç…Yf3¦{{ê¸:kùxë¬XfEܦ¬¹™×ÔÌ€¬œV˜ H[ü/K_o©F>Þ:ÐÒ«ªævq]]}`c£1}ª±£½†—§Î‚yæÄʉ RÔ “³‚ôÜ\µ|½u–ü‹ed¨}•²¹û™—‡öºWí"§™xºk;;iúz먩ý5·éëë—É& ´5,ÍE—Øš›zÚ³è¥TÄ-ýœž\CMcº:K™1›ú䵕ëÎÙ½aŸ“½4Û*©®ýó^¢£•ÅÌß¿ZmG Lötµ57…QÖØúFt½2E¿ÆÜæ óÂWÍŽ2Áo±qðÖ’ùÿŒ“ÜmÙUõƒn…'qeQoÇáט۰}õ wk)#&e‹ ÷vWš¨'ò` 1ÖΟíÂfUÕ7:±XÊö M¨T–+›%úï£Î. 
_‘!ƒ£ë#E.tsûž%«Ž#åø@¥.ÄôMY•âvjFmSs¤¿ÏÜÐ@ˆÉ$ç\ì]x{ÙÂáΡػÏÞ¼›—.ô°c'dæˆÞÁ‡AY¡øµHå»›´˜bz·L† sC}áë9Áþ¾Nö­íÄ)ŽTÚƒ”]ÅCº iÚ$iǑޘõMéC-Í &¡ZZª‡bDq»ú¢¯üu¹XAa»½];ÛZ]SsbpÐ_7Šáó×cjˆ×·îÔEEš*М´œþ¹¬ª²²++» LŒU˜LEây;B}}ý×nÔȪØúúÊÊ* 'GM'GMÑå9¹m¢·±–Æâ)¡+¢¦U74•T×ðzûô´4\ÙV4íØÅ+÷œ6âNÓ1öVTÑi4–ùÚÝûsJ9¿ßŽ_274ÐÕÆª°¢J è×PSUSVîðG”1›¦1ÑûënݦúzRÞ`íØ¥«³‚'­]0ûrÂChnïøåÆí¥‘Ç·mÉ/¯lïâhkŽR?MN[3wæÒÈ;voßÚÔø½_S¼«xœº†‡ܶzùº…sV̜ʩ«oïâ2U”Ù¦&kwïÏ*)‹Žàâ8Ýßçö¡}¹emu&ÛÌ„ÛÓ³íðqYUõö q®ÜOre[ýkÊ䣽]ÕÐXÓØL£)˜èßËÈÞuüôˆ "e‹"=Oì…; ³kSc õNn÷ÖïŽË¶=H“ÒÊ órÛûækõÍ­åµuMmíLUeO;ƒ~áz<Œ¹›I©«_˜±,2•mÕßßÿd F¯gɪãH3>P¬ 1}S†¥8y9æÝ‹ý]Sò =TQ¼K÷ýœ"ý}N}òAqeu[G'q.ÊH{w\ZVyM£•ˆ>ª‡A †ÁáPüZ¤òÝMZÌáz·¬† ×;ßî+ª¬VQR²57»ŽÿL±wKßõH³G±ÁnBš6IÚq¤oQcÖ7¥µ4ƒ˜F<ÉIIm65QÕÖVœ8‘Æç´¶õ––u$ÜìQ\ªôñÖurÔÐÔPìë믪æÞ¿ß@ÜE Û[[K¼ jšš£ï=xarÀüРó·ïÀ§Ï5¶=z!$ÀÞÒŒF£µut¦'=~—™(©®}ÿÿ޾6o–¹i¯7§´œÛÓÕʯ&&ås*^œæëdOÜ᤭£+1;¯›÷×&>üö‡Œ¢ÒùaAî¶ÖÜÞõÄäï~ÿs¤n—I{&»OþŸ™³0<ÄÙÚÒÓÞ¦­£³¦±Yød7ê¤lQÁî.Üžž»éCüÂTÀ©¼ž˜lgaæbÍšÀ ·´w\º{ÿû‹W„׹ɪ=H“ÒÊ*«©;sÇÇÑΑe¡¢4‘ÛÃ+©ª¹rÿáo±Òv@É:·‡Ž½:/Êe>d dÒ³ðI{ׇâeÕq$¨×…˜TV¥øõæW^˜¡£¡.Áí[>úö‡¤Ü‚y¡lS“ fônwvIYBfîH{÷É+1[_Yž[ÊyòDMe…â×"éw7i1‡ëݲ‚Ò ŠÀÚÔˆA§'åýãrJ~ÅÞ-“®'õCº iÚ$iǑޘõMéC-Í 6R àäûäÒ¯6­ rwž¾á=á}âŸsïþ{Ñ¢)¡/ïÜ+“97BHŒǯßYãAÊ~ÿ<ç=ŸNîxÏHWgÊúw1h<›à»óµ•?]‰ùJäé++4 Bh\ ÷öÊ÷4“ã< çЊ™SX'¯ÜÀP „”Ä®öRÔ´n^/4¶µ=•“(ž.s}âñ«àlŶ‚ÐØñpéíë»þ å9Ïz~|µi]WO·…¡½¥ybV.ñ p„BÒ7ÉYA¼È+ã<‡“S}ÝW^˜M¡±äeo£­¡~/=û9Ïz®Xèii4´¶ýãò·çÿÄ€ „ô†¾&!„B!„žQxMB!„BH®à$!„B!$Wp’ƒB!„’+ãw’ãeos`ϱÞyÎóð¬—b’³Ã¥}Ÿ¼¹h.v6„B!46Æï$gf€¯&SÍÕÆÊÖýLçÁÜ@ßÊØhœGrô2©­Î4ÒÕ¡Ñð˜!B!„#Cìz^ûú¿Ž}óÔsv7=›/TÕ7¦?»yvw>¿gÇâ©“G5ŸK§‡ÿôñû÷÷Ýg>ýðµyQ#*…˜L&Ýs`Ï … ‚“O|eN$ö„B!41ÆmÎn§fLZµó@êäŽ÷XÜžž”üB[sÓ5s£³ó2ŠJŸ¡R „B!$+ãw’ƒ¨Ø¿yËânZÖ¦¯ NöpÎpB!„zÞH8ÉY¬oi¥I1¼lÝœ+ëDg8p'-“b)(fRŒ¥ÓÃ7/]¸ÿÌù“Wb„ “O¬mjžýöVá'+ËCï½ecf¢8‘SÊ9r!:µà©…ˆB!„ä›$“œÏ^yº¿O'·;µ XGC}F€oˆ‡Ë†ÿÈ,þkúÓ×_Žô÷éæñ2ŠJø©¾ž£©¾®p‚!>A\Zfymø99˜è=™3“¨ ¿NnwYM·§ÇÌ@o²§›Ÿ“Ê»Kkj)æA<éóY\z.6ÎPG;Ð͹´º6­ HøYáä4’âMŸä ?_‹•¸T2)vìö®®œRŽ¦šª£§{ËׇãÒ³°"„B!™ñ$'*Ðoº¿O§rٶω%/„l}eùÎWWÎ}w;Ìð÷‰ô÷)­®]»{ó£v"Mò‰ƒÂ5&8õ÷^ûîõ«‡Ü5'\¸ÿÕ™óÄë-ËÿµxjØŠ¨©;ŽüHe¤¤ÏClrzlrz°»s ›sZAÑç'ÎŒ4’¤œ­,àAv¾Ä¥ ͤ¬ˆžPè÷ñ«/m}eùÔ ïaD!„B27âI΂°`Øÿ÷ž=ü—0+h’‡;ÐÕ)>3gax8{A8»„4NDßX<5ÌÍÆjô61Ò´/·Œ£­Îd›™p{z¶>N$;|!zН§»-;æÀžN¥ž–&ËØPt=¤ „b“ÓÖ̹42ÂÃŽÝÛÇ·65~cï×9¥ÒRQß)‰ó@¨oiK/,&rRXQE§ÑXækwïÏ)åF’Šÿüß÷‡ÞkF€o›sfqioß‘e1ÒRˆÉ$išÛ;~¹q{idÄñm[òË+Û»¸ÚšO&{!$ÀÎÂl``ÀÚÔXGC½“Û½õ»ÁÅ4ÑÓ%^˜êë=Èâ¾ØÒ'@!„BÏ:è› ZôïST””lÍMý5?j'Ž0Ä&§·utšèÙY˜M`0âÒ2?#[OKÓXW‡el¨®ªšURv'5³­³‹J$Iµwq¾;qÂ-; 3sCý.÷~Vî•„¤F‘» W ÒL¾2gF_ðãåÇž êȲvwIÎ+H+(€ûÙyÝ<ž‰ž.ËÄÐÌP_AA¡€Su+%#%¿úûû5TUU••¬LŒLõu¹=¼Øä´­ßýP\UódY¼l‹*«>?qz¸ÂJ™!„B=ÀÉw´·an ~ÏŽôÂâÕŸ}!Y‚1ÈB!„BH>ÐÆ`¯Ì™s $N0y@!„BɆÌ×8Õ×sÇšE•Õm]ŠÄ5Õ M‡/DSL0y@!„BÉ+ÙOrZÚ;R Š­MŒlÍMjšš/Ç?ØÿËïÔŒAB!„Bòj,®ÉA!„B¡1Cà „B!„ä NrB!„Br'9!„B!¹‚“„B!„\ÁIB!„BH®à$!„B!$Wp’ƒB!„’+8ÉA!„BÉœä „B!„ä NrB!„Br'9!„B!¹‚“„B!„\ÁIB!„BH®à$!„B!$Wp’ƒB!„’+8ÉA!„BÉœä „B!„ä NrB!„Br'9!„B!¹‚“„B!„\ÁIB!„BH®à$!„B!$Wp’ƒB!„’+8ÉA!„BÉœä „B!„ä NrB!„Br'9!„B!¹òÔ&9^ö61öûè¬4NÈe›<ó釿}¾ +!„BÏ•§6É™à«ÉTsµ±ò°eÞVÌ ô­Œ°š•ö06mrŒ±ÍL&*NÀªG!„Ðs…1ÜÁîÎ_nZ—’WøÚ¿éJ¯}ý_ õ'—/Ûº« ¢Šx}7=;*hR]SKZañ(•(¹ظÏOœÁšF¤ía ÚäsbšŸ×Š™SYƆÜ^|fÎ_/4=j2™Ëbf ¯—½­¾¶æþ3¿ÿ— üøì`'+ %¥–GíI¹…‡/DW76 ?¨§©qeÿçƒÖ–]R¶rç^Š9$£F/Áô ï5·w`#A!„Fc”Ö;00ÿ`ÐÂG]Â×·S3&­Ú€€Æl“2±djè;Ëñz{Ó K ´µfMò²·Y±cwkG§h²ÝëWGøxö÷÷—ÕÔå–r„ƒƒ™¾ÞÎWWvõôpª½¥yTß$‡%~*\ƒŠÒD(«©Í)åWXÕÐ8¢|’ŽQ£”g8!„ЭINÿŽ#?b|z®èkinX4·“Ûýò'{Ëjêà?/½¸ <øõù³D ýôñûvf§¯Å¾ÞüøAžÊ†Æ5»¾È*).9ôÞ[>Žv3|¾K,QUV€›IißžÿS⬒ŽQc!„B£„1ö›üîý^¶Â_Þ¹Wt‡†`¢§îíáí`kf ¯­ÎTRTäòz*êDOGÙ¸d~ˆ‡«¡ŽÖÀÀ@kGguCÓá Ñ)ùEÄ[ËgL!’-Y"üTÔ¦ë[Z)fÕDOgéô?'{CmhíèÌ-å¼{àˆ0ÁÂðày¡A–FÜ^bvÞÑ?.sꈷ¬MŒVÍŽ´³03ÔÑfÐé-íí ™¹Ç.]­ijÃæ¯Å¥e W¸jv亅s»øýÅ«#Iº1™\:=|óÒ…ûÏœ?y%F¸†äk›šg¿½•z]“3“å‘SlÍMt55˜ª*]Ý=é…ŇžÙH˜æçµhÊd{ ó(àTž¼s'-“âÖIÛi›üxÍ /[-¦ZucóÏ×n:XšGøxvóxß_¼ráN‚L2)}“#¬?{F€Ž†z}K[J~á“› ’É™¾[–/âÔÕ¨–‰NTT¾m‹³5«ª¾ñnzV_ «©ngnFSøë> ™Å¥çbã u´ÝœK«kÓ Š„¤>ÙìáúÙÚUJÛ»º +ª:¸\ uO{a‚Ï^yº¿O'·;µ 
XGC}F€oˆ‡Ë†ÿÈ,.; 3âÝœÒr¾@À63™îãþÆžoòÊ+ >3ÇËÁÖ×Ñ^tŠâiÇ€Äì<*9¤²ñ™¤‚´.H‰Ïƒ™IT_'·»¬¦ŽÛÓcf 7ÙÓÍÏÉaÅŽÝ¥5µÄÞ_±daDH'·;%¿A§{ÚÛìÛøú§Ç~`ˆAÚHÛdTФ¶ŽÎ˜‡©NV–­ZÞÇçßLJs·µþhÕò²šºŒ¢Ré3)}“€#lö°c·wu¥k1Õæ… ÚÅL¾:7Š©ªâlÍZ|îÖ]êuíaÇ€„¬œÇâ_Têå`ëfc•QTª«¡>/4(»¤l¸Γ½Ýàan¾p‰“ 8ÉA!„Ðð†äðèãóe¾ÉSŸs²{ýêá&9„ž|òŒvpdY8[³Ò ‹WöÅŒMNMNvwtsN+(’àÆƺ:»Ö­šÀ`ì;uöôõ[O&ˆ ô›îïSÀ©\¶í¯  _ ØúÊò¯®œûîva² wâ¿:sžx½aÑÜ—¢¦}¶vÕü÷v@|Fö›‹ç¸:Á©³Âô.lV[G§è•b®b&©®.HI¨-ËÿµxjØŠ¨©Ä©>S|<F„ä•q6~q¸žÁÁÒü臛7.Y@qþ@Ú¨´É”üÂíG~´61úe×ÖKwïï:~ÚÉÊâÄö÷½=2ŠJ¥Ï¤ôMîÅiavì¼2οwì&¸²YǶn®z&«›ˆ8ŒôBsC}(¯­·57=øî›©ùEï8R^[ïå`kª¯—QTäî<Áà '¶¿kah@§ÓŠ+kN_½þ eЪ6½¸@SMÍ×ÉŽ©¢²÷ä¯ÄAZ‚ž–Ì ðµ³0ãÔÖ'fç "„B˜I·§ºº{$[/NO>qPtÉ?ÿ&<¥ž¢âpÒzûú@MYyôâ²xjèDEÅ3×o ¹» ‚`ÿßûåðG\¬ Ivì@W§øÌœ'?òͯ|ì­,\²rKªkËkê, YƆĹ=.Ö,Ueåë‰É3Iº 29Òº %ANDßX<5ÌÍÆŠøwÉ´PØùýOÂ+¶óÊ+®ÞOzaràHK!½’êZPWUb&i¤£-“LJßä¢ýàËÓç„ ¯£žÉ­ßŸ;9S[ÿ 'Dñ!zeó£öy¡šLµpxÔÙ)|ËÙŠöæ©E×$k¨©º:ïZ÷Š®¦Æ ñaYdñ¢±µ­§·Wô­þþþ¢Š*kSc_'{600pþÖ½ÏOœ–í5 ÞûæÈÍä´gú;!„z& ;Éimï€Ö o400p1î¾è’âÊjYeº¸ªærüƒ™~w¾ÝwáN•„‡ùœJÙÆ…ØÃ¾rÿáp Y|¾àanèÂÄì\;¶ ›5ÜNí½ŒlG+ 6+!+®&&½>v¸·;qýŒ·ƒ-P>W ~ ’eR¶$ÈCCk¯·—8% œ­X}|þœ`Ñ4,c#09ê²}õ¿Ù¦ÆÂ÷ú-½¨d” ¥ ðÏk:F=“£ÚälÌLúøüÔ‚aïM=“-íÇ.]•(2…æ^zö¢ˆÉOž©­Á€5»¾ NÚ[sÓ·¿÷úüYƒ&9Þ/­3ÐÖòs²u^ÔÖW–«*+ œº+<ø6ÍÏë…/,îêîþú× ÔÛé%“ƒŽÖ6·Œ‹!„’wÃNr* ©­]²õö÷÷rì§ÑË÷¶Ã'Ò K–N_±,2"³¨ô๋Éy…²Z?ñS}SÛ£aÇ ?ù.qŸ(âªè!5´¶ WW¾6oÖ _bŠ2ÉÙF8É¿É2)ã&Qúøbò@¬^œþd2%%ák–±¡ËBø/SuŒ 8¢LŽj“£Óé­Ã\&™$ÕÁåj2Õt4Ôó9•ÓÞ|ŸX¨¡¦ÝÝ0q™ @aEUr^Á$Gg+ËìÒrѵշ´^¼{?%¿è×]=9 "\’YTzá;M ä¶Ò1J& ðîj!„ÐS!îîjûN•á´áI’žõ—ó·ï¿}ϕ͚4iNHÀÿmÙðé±S—î% ô÷K¾Nn7h««×·´ ™€Ïhª© ZH<û¯ƒÛ=Üj‰£'¨nl~˜“ïçìàȲÈ-ã¸ÚXå•qZÛ¨çSüH3Iœ”¨¤¨8zµ,Y ­¡›Ç [÷Žød¤÷£Ò¨´Éª†Æ''39ªMN h1Õ¤¤4*êÌ ô- Do ma¤_ÞCduP>‰G…WÚ ÕÈ›Š*«­Yfúz•C]#T×ÒZÀ©t²²´0ÔÞkn¤÷…C!„<¡‰yïzbr‘ìÎ1{R{Wüý\?‰e—í:~zÅŽÝ4mý¢¹¢oup¹ &Ñ! â&¶óB‡KW^Á`Ð}íDú»8@vIùpŸòq´€‚ŠN®#î1µ ,(ÂÛcƒqãaêH³*f ¤™$ŽÔYŒ^-K¨Ak`ªª Zƒ¨´*mrî–í{:+óLJßäòÊ+ètúdWé#©­Î\5;2ÂÛc¤¥H+(€W'Ñ…®l«n¸]aEü}^¥…¡Ô6 {*—¡Ž6ðúú†K@Ìš]ºƒB¡çÖ°“œU³#¯}³ûüíÆÖGøøþE®l–è¿:»‚~EÆc¦ˆ»û99H°þ_cnó‚ùaÁ«fG™à·Ø8xkÉ|á’¹“ÜmÙUõ÷2²… ‰›Mæûû:Ù·¶wÄ&§ ÞNͨmjŽô÷™1#ŸäˆYi&óÊ8`²§«­¹é(Õ2Å@‰¯ ؾz…»µ49¡Ò$n“ÒgRú&w.ö.¼½l¡µ‰‘”™ü䵕ëÎÙ½aŸ“ýˆJqå~¯·wAX°åß3ç÷W,Qœ0áJÂ_×Å–®¦Fqeµð¥¶Ö Æzºå5uÃC!„ÐófØÓÕB<\ÀÜPŸ8j4¶›œ¶fîÌ¥‘vìÞ>¾µ©ñ{¿¦r÷ä0/·½o¾VßÜZ^[×ÔÖÎTUö´³a0è®Ç‹&«oiK/,v·eÇØSXQE§ÑXækwï§² N]ÃŽÃ?n[½|ÝÂ9+fNåÔÕ·wq™*ÊlS“µ»÷g•”EÇ?pqœîïsûоÜ2޶:“mfÂíéÙvøøãat½óí¾¢Êj%%[sS@°ëøÏƒ¶uòrÌ»+û»8¦äJv'ÜáÖ@šÉæöŽ_nÜ^q|Û–üòÊö.®¶¦lk™b Ä¸r?É•mõ¯)“~ôvUCcMc3¦`n /#{×ñÜP‹J{¸MJŸIé›Ü¥{‰~Αþ>§>ù ¸²º­£“8™M‚Lšèé/LõõFtƒµú–Öÿûíâæ¥ OíüOza‰¶–¥±a]sËw¿G *êŽÿyý•fü°uKBVŽ"cB°‡ _ øüï Ì ž´nÁœ’êÚÆÖ6:fmj¬§¥ÙÍã ož6ÙÃu–V×Ö·´*((°MõµµzûúvÿøËx^i4ÚŽ5+-LÎ+øóÞƒñ–U„BHÎ ;Éù-6ÎÒÈ0§´|”f8PR]ûþÿ}mÞ,sÓ^oNi9·‡Gåƒe5ugcîø8Ú9²,T”&r{x%U5Wî?ü-vðS ßûæÈ¦¥ }í¼ìm¸=¼Œ¢Rê7žš˜”Ï©xqZ˜¯“=q›¦¶Ž®Äì¼nÞ_™üðÛ2ŠJ燹ÛZs{xד¿ûýÏAŸ'ÎÞ±65bÐéI¹Gÿ¸,ú¸¯7ï¼ò  õ?â$‹¤˜5fò‹ÓçÛ½`oiF£ÑÚ::Ó Š“¿…—”¨J¼Ý'‰ÏÌYâlméioÓÖÑYÓØ,¼=u¤íAâ6)“LJßä>úö‡¤Ü‚y¡lS“ fônwvIYBfîH3yøBô–å‹8uõ#z(áçk±ÍÚÿ=cЇ›Û˾—xà좗è:©¦©yaxðd·þþ”¼Âï~ÿSx¶Ì¢²ëR\Ø,G»ú–¶s±wO^¹QÕÐD$¨jh¼t÷¾›µ·ƒ-Fklm»t÷þÉË1ÂGÇŽ ³‚& ZÈëíÅIB!4ÚÀÉ£0føî|måOWb¾y°ÉpNîxÏHWgÊúw%Þœôk@!„BH>Ð0OÝŠ™SX'¯ÜxŠk@!„BHn00OËW›Öuõt[Ø[š'f垈¾1ök@!„BHþà$穱42ÐÓÒhhm;úÇåoÏÿùTÖ€B!„üÁkrB!„Br¯ÉA!„BÉœä „B!„ä NrB!„BrE>'9þ.Ž—¿üì­ÅóÄ'ó²·‰9°çØGï`;ÏDmb““Þ$g‡Kû>ysÑܧ¸†çÅô™cn Ÿ|âà‘6Kœ@æ½[úP?£­zP¨¿{cò‰ƒÂ?kÖ(mW¶C1•sæÓû|Ûx.BòçÙžä˜è[=¹\‹©¦¯­¥   þã3|5™j®6V¶lù®æá%O¥x&jóùir£G[i¤«C£ÑžâžR 0ÂŒ«!(ÜÛn¥¤Kœ@âÞ-åw–üµêA¡¾Ÿ•ûç½Ä?ï%Ö65êve;Si0l3“‰ŠÆs)’?Cˆ+fNý峎î9°gÏú5væ¦ã0ëÁîÎç÷ìXqpAX0lzqÁ•¯v%û&ùÄÁ»‡¿<óé‡ëÿõ ô×¾þïƒcß<õ@=Õ=ªµ)+ÏD&’mßoCP˜·;ÜNÉ8d½[>†âQ­‹ãÑ×wùqÇ‘sË8£º]ÙÅ’5˜ñV „äÏÏÉÙ¿y] ›sSÛ£{éÙ:êaÞîAîÎoîû¿ä¼B9+üíÔŒI«6<­­Û˜™èiiÆgd·wq™**NV+gMŸæçõò'ÿk~ÔŽMóÙªMyÊ$BrÌ@[Ë‘e‘_^Q3̱ÒػǬ.F +K>J\<É™èæœœWðú÷K½Ýw¯_³cÍŠY›?ÂxÉÜ׿ü^R]K¼Þ½~u„çªÙ‘{ú#ƒB2æå¦   æÌ"ÒhÌêK’ÆàIÎâ©¡°ûÇ_„Kb“Óc¦NõóšæçuýAе‰ÑªÙ‘vf†:Ú :½¥½=!3÷Ø¥«ƒ~ØæçµhÊd{ ó(àTž¼s'-“xËÆÌdyä[s]M ¦ªJWwOzañáߣ 
*ª(fzã’ùËgL!^/Y"|+jÓ‡õ-­Äë`w—WC-^o_FQé¡s…›øîý^¶ÂO½¼soVIÙ"Ö000ÐÚÑYÝÐtøBtJ~Ñ(UÆþ3¿Gøxú:ÙÉj…µ0ëbf€ï–å‹8uõTêèIbRÒ¾Iš€â •¡˜b1ý=àVrÆHÌ šdidH§ÓÚ::Ëkë³sODß Ò»¥ÿ΢8FIß/¤Ì?^³ÂËÁV‹©VÝØüóµ›–æ>žÝ<Þ÷¯\¸“ A] ‡ô;KšÊéP<\)ÖΟ=#ÀGGC½¾¥-%ˆa†ë݆úçvïhj{ùÖDÓ‡¸»|±iíͤÔ÷¥¸CZ›Ò|©!4þ=6ÉÑVgZ›WÔ5”ÕÔ‰.'&9~NפØY˜M÷÷éävç”–ó¶™ÉÜÐÀp÷7ö|“W^A¤Å’…!Üî”üBîio³oãëŸû‰ælÌL¢‚ü:¹Ýe5uÜž3½Éžn~N+vì.­©¥’éÌâÒs±q†:ÚnÎ¥Õµiÿ|Õ ¿-ÀÌ@SWŸQTjf äîìag½|Û+ .-³¼¶üœL ô†ÜÊñm[œ­YUõwÓ³úø]Mu;s3šÂ(^ÖI|¯«LT’Õ ©ê³×_&*4µ XGC}F€oˆ‡Ë†ÿÈ,.£¸•É®Ÿ­]¥4Q±½««°¢ªƒËÕÑP÷´·&¿ Òö@Z ÒÚ”¾É‘¶jRcÉøÌ/[_G{ÑIާ³ódUÝóÂ_ŠšÖßß_^[Ÿ]RÆëëÓPSUdPº ¶¡µ 4™jO¾¥ÉTkmï j;v{WWN)GSMÕÇÑÎÓŽ½åëÃqéYÔ{‡ø5SL$)“t„!źxunSUÅÙšµ ,øÜ­»#FÄ ¤¤}“4•/ŠC1•bj1ÕÜl¬8µõÃõ©á|úúË‘þ>Ý<^FQ _ 0Õ×óq´3Õ×%ö›I{·ôßY2iÕ£=˜GMjëèŒy˜êdeùѪå}|þͤ4w[ëV-/«©Ë(*Q] ‡t“²²F4WŠ#l&ê"µ X‹©6/4ˆú §®¡¸²šmfbgn*:³ öp¿¯ü¡²CZ›R~©!4þ=6ɱ42€²'ºqIu X™üsO˜ wâ¿:sžx½aÑÜ—¢¦}¶vÕü÷vÀÏ…!yeœ_lnïKó£nÞ¸dhÏ]ÖåÿZ<5lEÔÔG~¤’éØäôØäô`wç@7ç´‚¢ÏOœ2Ùéë·¾<}ŽxýÑËËæ†.ŸA$>u-–X¾{ýê!G–…³5+½°xõg_ŒYeû£ÄÈ%¤Š ô›îïSÀ©\¶ísbÉ ![_Y¾óÕ•sßÝNeƺ:»Ö­šÀ`ì;uöôõ[O& ¸ 1í´¤µ)}“£ØªÅƒLÆgd¿¹x^€«œ:+\èÂfµutæ”rdRÝB=ÿ`¤ ²¡µt5Ô‰—N/ª¨NÊ+=- bob¨ï¦emúê°}üêK[_Y>uÃ{Ô3Ce ÃS|$©“t„¡ÞäH뢺±‰hrU#Ù]¦2’öMŠcµ˜/êC1•b†z¹Ñét1g ™`†¿O¤¿OiuíÚÝû…LŠÞW†´wKÿ%ÃV=ªƒyJ~áö#?Z›ý²k륻÷w?ídeqbû{áÞƒ&9¤u1$ÒoûÿÊค´‚bS½á*kÅSC'**þsgÈaT²MŒR{f´ê±ÏdIumyM¹¡>ËØXâbÍRUV~ø÷ISÒW÷‹ºšÆfÐ×Ö[sÓÍK¾>ñ–®†FUCõPsêê…¯£ã$åh©3‰¹RDe ÃS|$©“t„¡ÞäHëbëwÇþvñ½oŽ¢ ÃCàÀÙ £}KéC-M¿á`N\hª®ª Ä/,F:Ú#­ Éò0•%f(²Q~ œ¯À C夽;&)þ¾35ÁÛÁVWS#!3‡bžIkó©|©!4Æ¿&g˜ïJÒ›÷ßËÈv´²pa³²r­X}|þœ`Ñ,c#0æW“†Ö6^o¯“9zå¬ijîãó‰Ý*Š«j.Ç?˜èwçÛ}î$\Ix˜Ï©Œý²ëŸÓ©»y¼OºŸ•;fÕïȲàós D&fçzر]جx ã)1î_¹ÿP†›ƒö0ÒMHЪÇ>“p51éõù³Ã½Ý‰+p¼lAä\5é«[JÄ®±®»;€³µ%jk1ôÊú‰C”[àãhggn;Â}&ÉÖ@%’â‹I:ÂȰɵ´w{Ìc?JFô …úPL¥˜>ŽvõÍ­9ÃßžxÈNÖ–ýýý¢gŽ™„zD­Z惹èŽNi]H–‡Ñ¨,1Cñ¥°13éãóÅ\ EÚ»ËjêJ«kí-ÍMôt«›`šŸüy/‘bžIkó©|©!4Æ›äÇpžÚדuvëê¹;_OLþ#nL3ô¦¶Gƒ?€ wà¡ O®DÊMŒA{Ñ&$kÕcœI¸’ððµy³føû“œIÎ 2É‘¾º¥WßÜj¤« a^îé…Åî¶ì™¾uÍ-PQß q¨[ÚÛ@MEòºÑH#IZLÒæi59™¤ýBÙ Å3|'0·S3Fš€A§‹âÆs¨GÔªGi0—¬.$ÎÃ(UÖCñp¥ Óé­bó@¥wÇ_ÐÍã…­{g´³Þß? ýJÄŸèqþö½ó·ï¹²Y³‚&Í ø¿->=v굟R:»{ô‡;:¸\á’c¯?oýpóœ€ßoÇg—–Y ø|¦ÚàË£u4ÔAìœö±br»@[]½¾¥m”6A½º$mÜžPRT(Yµê4ÛaU76?ÌÉ÷svpdYä–q\m¬òÊ8ÄuðTê‚4Ò+ª¬v³±2Õ×µ·4óÖ.˜îí—– %Uµ‡Z_Kk¤-Jš5F’´˜¤#̘ ¤£:ÂPï¼¢D¿PHEÝ_g‰»yô° (v ñ½[šï,ÉúæˆZµlóª†F1{ؤu!qדUeIÓ¢ÖP÷-iO\3w挟c—®NŸäÍTU95Ô©ìÕ‚Jm>+# B{ì—‰–öŽ’ªsC}âBĉ¡rò†[‹£TT@^ySUÅ×Ñn´³NLÔ¤ûº½« ÄÿZ–Y\¶ëøé;vÓh´õ‹æR\smS38[YŠ.d›ñƒî Ÿ?-ô¿ûïÅc¨¼ò ƒ>¨²ü] »¤œÊʉÛVÎ .ô› ^ÝTjsHMmí`ñx³T Yµj‰3IÑÙ›q° ,(ÂÛcƒqãa*õº ƒô 8•LU•§…µ3gaxˆ³µ¥§½M[GgMcó“'iHï½oŽlZºÐ×ÑÎËÞ†ÛÃË(*íêîÑJªkßÿ¿£¯Í›ecnÚÃëÍ)-çö𠬦îlÌG;G–…ŠÒDn¯¤ªæÊý‡¿ÅŽàQz«>ùßšf¹;¹;óz{3‹K¾+æÂÁš¦æè{^˜0?4èüí{ÄB…YA“¥äõöRŸäˆÔ‡ßþQT:?,ÈÝÖšÛûž˜üÝïzz´xW“ò9/N óu²'ÎÇkëèJÌÎëæñdµ ŠÕ=\mRñÅésm^ °·4£ÑhmiÅI"÷ð‘U«–&“ýzóÎ+/ÌÐÑPò>¤uA)U64vuwWÖþýü‡ßnƸ:‰FR|¨ 8•דí,Ì\¬Yô–öŽKwïñ qkf*¤_i$I‹Ie„‘U“;|!zËòEœºz ž*«¾)>˜/êC±øb»»p{zî¦{ÀA|‚E|òÞ¿‡z¹yرë[Z£ãÌ šÄFÚ»¥ùÎß7©·j1›Áœ´.ˆH~xèØ«ó¢Xæƒ"IšYU–4¥øèÛ’r æ…²MM&˜Ñ;¸ÝÙ%e ™¹#íÝ'¯Äl}eyn)g¸SûÄ”‚´6ÇlW ¡§EœFvrÔÌ߯­üéJÌW"7ªG+'w¼g¤«3eý» 4žÍJ€‹ã×﬿ñ å?¿—,Á æúç÷ìã§Eˇ‘†Zz£QYc_ „h„äÌŠ™SX'¯ÜÀP áÞ öü(Òƒ¼2g<Ì)xöc3îêBæF£²Æ¾! 
00ɇ¯6­ëêé¶04°·4OÌÊ%žÒ ñpéíë½wΈLõõܱfEQeu[G—âq¹KuCÓá Ñ[™×…”Ʀ²F»!™ÀIBrÂÒÈ@OK£¡µíè—¿=ÿ'!ð²·ÑÖP¿7ü ¤ ZÚ;R Š­MŒlÍMjšš/Ç?ØÿËï[™×…ôÆ ²Æ !™ñ59!„B!4žá59!„B!¹‚“„B!„\ÁIB!„BH®<“7ðwqܺjÙµÄä§~é§—½Íîõk*êV}ú¿ç9O×wïoôr°þûòνY%eØ·B!„ž[ãz’cn Ï ÓKkj-×bªékk)((<Å<føj2Õ4™j¶ì´Âb9ÎÃxv?+·¶¹¼ìmŒtuF/Ô!„Bè™0Ч«½½taò‰ƒÉ'²Œ %øx°»óù=;Oü£Cš‡»éÙ| ª¾qôfã!ãÜñèë;Žü¸ãȹeœQ 5B!„z&P=’£¯¥¹qÉ|c=•;÷RüH¤¿Ï½ôì@7§þ>Ï]’ËðÝN͘´jæ!„B¡ñƒê‘œ¯ß~cªŸ×±‹W)¦qwÑRgÿóZZAq¤¿!„B!46(ÉÙ½~5ÛÌäØÅ+qéY×ä×ÐÒš^Tó0õÝ‹Ýl¬2ŠJ‰·–Nß¼táþ3çO^‰¦O>q°¶©yöÛ[`ã’ùËgL!–/YòÏj7}XßÒJ¼vw ñp5ÔÑâõöe•:w± ¢J4Óü¼M™loa>œÊ“—cî¤eoÙ˜™,œbkn¢«©ÁTUéêîI/,>ü{´p ¤y x±»‰žÎÒé~Nö†:Ú ÐÚÑ™[Êy÷ÀâÝpo÷•QÓõµ55ÔTy½}ùœŠÑ7îgåÊ6 Ãç…Yp{x‰ÙyGÿ¸Ì©k ŠÄS|>^³ÂËÁV‹©VÝØüóµ›–æ>žÝ<Þ÷¯\¸“Ö&F«fGÚY˜êh3èô–öö„ÌÜc—®Ö45(“bÚÅ&'f !„Bhü Ÿä,Š˜áãù ;oD§œº:ÿ—·S3Þ]±8ÜÛC8É!•Y\z.6ÎPG;Ð͹´º6­ Hø–pwÌ ô8uõE¥fzAîÎvÖË·ý·²¡‘x÷ýKF„tr»Sò tº§½Í¾¯zì'b¿ÙÆÌ$*ȯ“Û]VSÇíé13Лìéæçä°bÇnâ¢sÒ<Ä¥e–×Ö€Ÿ“ƒ©Þ™ìáúÙÚUJÛ»º +ª:¸\ uO{aU%¥þþŒ¢’^¯¡Ž¶·ƒ—½í¦/ÝËÈ–U>{ýåéþ>ÜîÔ‚b õ¾!.þw ³¸ŒJ¨ -¦øžSZÎØf&sCÃ}ÜߨóM^yÅLŠoTšœø5 „B¡ñƒd’celôÖ’yuÍ-oìý†úJýݕ&*Æ¥e@Ck[QEU¨§Û—§ÏQüxlrzlrz°»s ›sZAÑç'Î ™ìôõ[Âu~ôò²¹¡ËgD‰§øx.ŒÉ+ãlüâ`s{8XšýpóÆ% DwI/܉ÿêÌyâõ–åÿZ<5lEÔÔG~¤’‡S×b‰»×¯r‚a¬«³kݪ ƾSgO_¿5d.ÝK¼t/QøoT ßǯ¾ôâ´0b’#}¢ý¦ûûp*—mûœXòBHÀÖW–ï|uåÜw·S‰)ÒbRÉCJ~áö#?Z›ý²k륻÷w?ídeqbû{¢scÑLnX4÷¥¨iŸ­]5ÿ½T2IÚHCM±E!„B¡ñ€äšœ¯½D§Ñß?ptD+ðñèêî~“Oü›•k¢¯kkn*Û¬ _»tYÄ¿K¦…ÀÎï"öG ¯¼âêý$5å@W§!×v"ú¸ÙXÉ*{‹§†NTTü5æÎp3œ'EÇ?èáõZÈ* ‚`ÿßsø#.!­ ØÔ@OVq -&õ<”T×€ºª*ä”rÀHG{Èu~óë…ÜR޹¡~€‹#•LJÐd¾„B!4f†8’óBH@7wýAÊ+s"í-Íÿ]Z>¢•º:?ÈÎþ›™óRÔ´—Â^éA]MSsŸ¯¯­EüëlÅêãóçû‹¦a€Ù0§u5´¶ñz{µ˜LYe‰˜'\¹ÿPL õ%SCC½Ü u´{ûøõ-­ :]CMUVypdYðù‚‡¹¢ ³s=ìØ.lV|fŽôq -æHó úô#:}ØIø½ŒlG+ 6+áïK˜Ä =È| !„BhÌ žä¸X³¶¾²¼©íQIUÍ+sfä—W¾=¢5ú8Ø©©(ëjj¼ûïEÄ:AnÎGÿ¸2z%áõö©(Mü«T :¼8=üÉd*JJí¡/³W=Rĉ¦¶GÃ%ÐVgù`³¹¡~yMÝÕûI½}}†:ÚÖ&F²¬]ýÉ 4?j¦Š²Lâ@ZLÉò@ª¡µU¸u*q =Èv !„BhÌ žäd•”õöõéjjœØþl?|b¤k rwW+×ÇOyžKÆíé%EEñëéï¸T|¾ ›Ç [÷Ž”Ñ¡’‡a’tr»@[]½¾¥mÈ‹§„šêGß{°ýÈ?AŽ9°g¢âYåÏhª© Z¨£¡Ün™4 ÒbŽ(U bæK¢ˆcMÔJA±=ˆ µ¬ZB!„Cü`OÜÂXi¢â·çÿ$.“‘@W§ª†Fï—Ö‰þ}ùóo4mf€/4µµé•'\.¨Iôc^ySUÅ×ÑNÊèPÉC{W" wsž8ÜMôuàvjúèå!¯¼‚Á Šƒ¿‹#d—”ˤ‘sDy˜»eûÞŸÎRÙ®£TT>«®nPž¨(Y{jê-J[¹jvd„·.!„BOË“œ/ß(«©½’ððÇË7Fº:#mKcÜ'v^‰ëÈýœì ¯Œ#&{ºŠ¿që^?' JõkÌmؾz…»µ4Ñ¡’‡ÆÖGðäÕç¿ÆÜæ óÂWÍŽòƒ5MèêúÑÛU 5Í4š‚¹þ½Œì]ÇOS/•<Ä&§­™;sid„‡»·omjüÆÞ¯sJ9œº†‡ܶzùº…sV̜ʩ«oïâ2U”Ù¦&kwïÏ*)ûývü‚𹡮6V…UA¿†šªš²rŸ€/«3'¿¼¨¬A|„ Z;:Kªk¬LŒ¬LÕUUrJËï¤e¶ut@qUÍͤ4:¦£¡nf g¤«M§ÑÓ K³s[Ú;:»»o&¥Òh,cCG–…•‰SU¹²¾1)7?.-KVyˆMNoëè43г³0›À`Ä¥e~xèXaE5ñAêqC|1Ióð꼨²šÚ˜‡©ƒVûê¼(Nmýõ)D&Ó Š›uX›i2ÕÒ K>9öÓ“÷Ukíè,¯­³65²~"âÛ•PS\C{×ÛÁ¶¨²êó§!„B= àä‹Q@ãÙÌ߯­üéJÌW"ÛA!„Bh84 B!„BHžà$!„B!$Wp’ƒB!„’+xMB!„BH®à‘„B!„\ÁIB!„BH®à$!„B!$Wp’ó3Ÿ~øÛçÛFô/{›˜{Ž}ôŽ|DàçO>¸òÕ.;sSêñwq¼üågo-ž‡í!„B8ÉùÛÌd¢â„}df€¯&SÍÕÆÊÖý¬Ч­¹©ž–f„õOi1ÕôµµÆCÌ ô­Œžb{@!„BãÁ¨Lr®}ýßǾyÂw7=›/TÕ7¦?ëeÉ,.mj{ÔÃëMÌÎ{óìî|~ÏŽÅS'c¯F!„zÎ1(¦Ó×Òܸd¾±žÎÊ{1jB·S3&­Ú eihm‹|ë?X§!„BèYGu’óõÛoX›¿ýÕ·2„B!„ÐxFi’³{ýj¶™É±‹WâÒ³dµá…áÁóBƒ, ¸=¼Äì¼£\æÔ5ˆ&˜4?,ÈÒÈN§µut–×Ö'f瞈¾A=Á4?¯ES&Û[˜À@§òäå˜;i™¢›X;öŒ õú–¶”üÂåÿ»÷7z9Ø ÿ}yçÞ¬’2Ñ6f&Ë#§Øš›èjj0UUºº{Ò ‹ÿ]PQE}+áÞî+£¦ëkkj¨©òzûò9'¢oÜÏÊM³qÉüWC­ÖŽÎꆦâSò‹(nbvФíkVˆ.)¯©[øŸÂMôt½=¼lÍ ôµÕ™JŠŠ\^OE]ƒè1½`w"¼Þ¾Œ¢ÒCç.*¦˜ºøxÍ /[-¦ZucóÏ×n:XšGøxvóxß_¼ráN•bn\2ùŒ)D²á! 
ÂC„ŸŠÚôa}K+ÅHJÓB!„ÐøA>ÉY19ÂÇóAvÞÁs—dµÕÏ^yº¿O'·;µ XGC}F€oˆ‡Ë†ÿÈ,þkžðéë/GúûtóxE%|ÀT_ÏÇÑÎT_W8‡!MðþŠ% #B:¹Ý)ù… :ÝÓÞf߯×?=ö“p¿ùÈ›=ìØí]]©ÅZLµy¡A#*B\Zfymø99˜è=™ÀÆÌ$*ȯ“Û]VSÇíé13Лìéæçä°bÇîÒšZŠ[QURêèÏ(*éáõêh{;ØyÙÛnúòнŒl"Áñm[œ­YUõwÓ³úø]Mu;s3šÂ.µâÔÕŸ‹û«5Ð/L”`~XðKQÓúûûËkë³KÊx}}jªŠŒÇ®È73ÐãÔÕg•šè¹;{ØY/ßößʆF*u4©­£3æaª“•åG«–÷ñù7“ÒÜm­?Zµ¼¬¦.£¨”´˜™Å¥çbã u´ÝœK«kÓ þ™à3*‘”²= „B¡ñƒd’celôÖ’yuÍ-oì•Ù¢ý¦ûûp*—mûœXòBHÀÖW–ï|uåÜw·À ŸHŸÒêÚµ»÷7?j'Ò$Ÿ8(\i‚)>ž #BòÊ8¿8ØÜÞ–æG?ܼqÉbÇúÅiavì¼2οwì&>âÊfÛº…z)N]‹%^ì^¿zÈIáÂø¯Îœ'^oYþ¯ÅSÃVDMÝqäGŠ[¹t/ñÒ½DÑÐ}üêK/N #vÍYÎÖ¬ôÂâÕŸ}!qud— ç–ðä$‡ðñÑ“Ññ†[Ééë·¾<}ŽxýÑËËæ†.Ÿñù‰3TêRò ·ùÑÚÄè—][/ݽ¿ëøi'+‹Ûß ÷öÈ(*%-flrzlrz°»s ›sZA±ÝERúö€B!„Æ’Ÿüw¾öFÿÀQnrAX0ìÿ{×þˆKH+(65Ð tu€…á!pàìáfÒK¦…ÀÎï"öª ¯¼âêý$5ebQ~ Ü/ÑýQBer³±’x Ññzx½FÄ¿½}} ¦¬< e``€â»Ç.]G–ź*©®uUUÈ)們Žö(sP$ŸJ{@!„B£dˆ#9/„tóxפ¼2'ÒÞÒüðïÑÙ¥å2ܤ#Ë‚Ï<Ì-]˜˜ëaÇva³â3sœ¬-ûûûãÒ†½þ‡4³«ÏŸì/ºelfz`cfÒÇç§Œé}ŸZÛx½½ZL&õèh¨/™êåf¨£ÝÛǯoieÐéjªÄ»ÅU5—ãÌ ô»óí¾ w®$<ÌçT>õ&UÓÔÜÇçëkkQ¬‹ADŸ¸C§ÓdULñ‘|*í!„B’Á“kÖÖW–7µ=*©ªyeÎŒüòŠÃ¢e¼I½©íÑ …Ä1¦Š20èC$xl ¤ txqzø“o©()No»†QÒÇ;îTh«3|°ÙÜP¿¼¦îêý¤Þ¾>Cmk“Çv¹íð‰ôÂ’¥Ó×EF,‹ŒÈ,*=xîbrÞS¾hž×Û§¢4‘b]P!e1I#ù´ÚB!„ ƒ'9Y%e½}}ºš'¶¿ÛŸù&ù|¦šÚ …:êÐÁí&þURT¿ñ ø|A7¶îá-¦šLŠ#ö4.©,žjn¨}ïÁö#ÿÔBÌ=»èÿüí{çoßse³fMšð[6|zì”èõ'Oi]ˆªjhnúJZÌþþ‰#)Ãö€B!„žº!Ž*÷ÕUš¨øíù?‰Ë$d+¯¼‚Á û:Ú‰.ôwq€ì’r(¯©SSQv²²n ¤ òÊ+˜ª*ƒ61(NŸìá*}qÚ»º@xàB†Lôuàvj:•Ä™Åe»ŽŸ^±c7F[¿hîøia¤u!jî–í{:+Y1;¸\PSQ– ’2l!„Bè©b’óãåe5µWþxùÆhlò·Ø8xkÉ|á’¹“ÜmÙUõÄ­®®?H€M/.&4Ÿ!MðkÌmؾz…»õy8{Þ^¶pÐÙ_hl}ƒ®¡—‰šÆ&tu“Æ•Íý÷Qg—@ЯÈ`À¸AZTP)&q·h?' ")Ãö€B!„žº!ö†3ŠJÿõŸO¤\/FÛñø#& 9¯àÏ{¢ã¸8N÷÷¹}h_nG[É63áöôl;|œHvøBô_Ow[vÌ=œJ=-M–±¡èzH\¹Ÿäʶú×”ÉG?z»ª¡±¦±™FS07п—‘½ëøi¸t/ÑÏÙ!ÒßçÔ'WV·utçËI 69mÍÜ™K##<ìØ½}|kSã7ö~MÜLJ¿ßŽ_274ÐÕÆª°¢J è×PSUSVîð‰a^n{ß|­¾¹µ¼¶®©­©ªìigÃ`Ð/\?-Œ´.HQ,f}K[za1Ñ* +ªè4šË|íîý9¥ÒHʰ= „B¡§n´~òWPP˜4iÐB^oïŸ÷À‡ßþQT:?,ÈÝÖšÛûž˜üÝïrê„)}ðÉ{ÿ^êåæaÇ®oiŽ0+h_  ž`÷É_â3s†‡8[[zÚÛ´utÖ46ç•W|ôíI¹óBÙ¦&ÌèÜîì’²„ÌÜ‘–´¤ºöýÿ;úÚ¼Y6æ¦=¼ÞœÒrnO&1¬mnY±ã¿K§Gº9…zº@—[XQUTYE$(«©;sÇÇÑΑe¡¢4‘ÛÃ+©ª¹rÿáo±wÇU##­ ñ¨ó½oŽlZºÐ×ÑÎËÞ†ÛÃË(*íêî¡I¶„B!ôÔ)€“ïøÏ¥¹þù=;Ä<’4¢ÂXWçâ¾OrK9+>ÞÑ@!„BÏ(Ú3‘ËWæÌ€‡9'@Ã}8)ñLÌ‚Š§ÿ°„B!„$6Ž®Pšêë¹cÍŠ¢Êê¶Ž.Å kSc õê†&á{H Šôµ4|°¹³»»¬ºŽ©ªbebÄëí=sýF!„B=»è o2Þò¤ÅT3ÐÑ6Ó׳656ÐÖjzôèÏ»‰¿zÛÿ f’³Ã¥}Ÿ¼¹h.VÇË_~öÖâyÏy¿@!„†$ù$ÇÜ@ßÊØHšc@|føj2Õ\m¬ùn°»óù=;O<ÜÇIŒÒ<ÜMÏæ Uõi…Åã6“h,-þÓÇïß;üUÜw_œùôÃ׿E¨Áˆ©Í„£ûcì´pAXpò‰ƒ¯Ì‰ÄÈ}Çy&º¤ÏPe!„Wýéïs/=;ÐÍi†¿ÏÁs—ä²ð·S3&­Ú€NîxÏeÁíéIÉ/TPP°57]37*1;/£¨ BÃÁ~Bhz’âÎ<þç5¥‰‘ò;ÉAHhÿæu,‹»iY›¾:$\8ÙÃU8ÃA!„BÏ pò}réîõ«]¬Y37}¸(bò»+¿òéÿ„»z—Ì_>cÊëŠÚôa}K+iâõ4?¯ES&Û[˜À@§òäå˜;i™Ä[¯Yáå`«ÅT«nlþùÚMKóÏnïû‹W.ÜIk£U³#í,Ì u´tzK{{Bfî±KWkšš)fò»÷7z9Ø ¾¼soVI™h23“å‘SlÍMt55˜ª*]Ý=é…Ň.¨¨M6?4h~X¥‘!Nkëè,¯­OÌÎ=}ƒJè)jaxð¼Ð K#n/1;ïè—9u kwéôðÍKî?sþä•áÂäk›šg¿½U4'!®†:Z­Õ M‡/D§ä ˆ©,RD¨7u(.-K¸pÕìÈu ç:wñû‹WI‹I¥&z:áÞÞ¶fúÚêL%EE.¯§¢®aåνT2éí`ûíû+ëæ½»CL)Ä4ÒÚL8ºŸÛÛ²þ]Ñ·„ÿgå‹DH‹93Àwçk+Ó Šù™‰âFN)çÈ…èÔ‚œ#îí¾2jº¾¶¦†š*¯·/ŸSq"úÆý¬\Ñ4&z:K§Gø9Ùêh+(@kGgn)çÝG¨'¦6I»ÅŽót»i¨Iã@šI¢=pjë u´x½}E¥‡Î]4F‰é¼Ò‡š´_Pi0â%ý B¡çÓÐGr]ÿˆK€Û©ï®Xîí!œäd—ž‹3ÔÑts.­®M+øgW˜ø¾!Mï¯X²0"¤“Û’_È Ó=ímöm|ýÓc?s˜¨ Im1S¬,?Zµ¼Ï¿™”ænkýѪåe5uE¥vfÓý}:¹Ý9¥å|€mf2740ÜÇý=ßä•WPÉC\Zfymø99˜è=3“¨ ¿NnwYM·§ÇÌ@o²§›Ÿ“Ê»Kkj‰4Ÿ¾þr¤¿O7—QTÂLõõ|íLõu)Nr¨ê³×_&JšZP¬£¡>#À7ÄÃeÃÿd—ɪß¶ÅÙšUUßx7=«/ÐÕT·37£)üs±–øÊ"Ÿ™ãå`ëëh/:Éñ´c@bvž¬Š9?,ø¥¨iýýýåµõÙ%e¼¾> 5UEÆŠŸ>É~¾;\ÒC¥6eÂÃŽÝÞÕ•SÊÑTSõq´ó´coùúp\zÅ«*)õôg•ôðz u´½ì¼ìm7}yè^F6‘`²‡ëgkW)MTlïê*¬¨êàru4Ô=ím„k M em’v=éC=6]4ÔÒgÌ ô8uõE¥fzAîÎvÖË·ý·²¡‘Jç•>Ô¤ý‚Jƒ(é¿PB=Ÿ†˜ä„{»+MTŒKË€†Ö¶¢ŠªPO·/OŸ#ÞMNMNvwtsN+(úüÄ™A'M0ÅÇsaDH^gã›Û;ÀÁÒü臛7.Y üNJÉ/Ü~äGk£_vm½t÷þ®ã§¬,NlOtºuáNüWgί7,šûRÔ´ÏÖ®šÿÞ*y8õ÷îìîõ«‡ûn´‰-ËÿµxjØŠ¨©;Žü3ü}"ý}J«k×îÞßü¨H“|â õГf2*Ðoº¿O§rٶω%/„l}eùÎWWÎ}w»Lªß‘eálÍJ/,^ýÙC& RYâÅgd¿¹x^€«œ:+\èÂfµutæ”rd[ÌžŒŽ Aœ­,àAvþp H imÊŠè uQ~¿úÒÖW–OÝðÅ_º—xé^¢hûøÕ—^œFìPëêìZ·jƒ±ïÔÙÓ×o=ùqÒ²ªM1]OúPM×jé3I8}ý–ppþèåesC—ψ Sì¼Ò„š´_6Ò@Éä 
!„Ðshˆ»«Eøxtuw?Èùk‡/!+×D_×ÖÜTV›\2-v~ÿñ…yåWï'©©(º:‰¦,©®uUU vˆt´‡\ç7¿^È-å˜ê¸8ŽR¤ˆã3n6VÄ¿ ÃCàÀÙ ÂŽÌ- €ýïÀq iŦzƒ%±Þ¾>PSV–¾²†SR][^Sgn¨/¼MŸ‹5KUYùáß L†Å,ZêL¨¨o€Ñ¤ÉT#îX(üûÏÊGºN]½ðutüƒ¤Ü-uf¸·»dYŠŽÐÃëµ02 þ]<5t¢¢â¯1w†Û%M0vP×£QŠA¡–Ñ6ìÒUpdYHÜyejÒ#} ¤£BÉ¥!Žäº:‹þ¤™óRÔ´—ÂÇOõ–˜³«ÏŸì/ºelfCý(ú:}Ø{^ßËÈv´²pa³¿À@VZÛx½½ZL&ñ¯“µe¿è)X2çȲàós D&fçzر]جøÌé7Q\Us9þÁÌ@¿;ßî»p'áJÂÃ|N¥4•5¤«‰I¯ÏŸîíN\ãí` "çªA1I ù°‘™¾ëÿõ—oœ¹q[&[éíë»þ Et‰…¡ ›%Í:“r |íìÌÍb“Ó©¤×ÑP_254ÔËÍPG»·_ßÒÊ Ó5ÔT‰w‰½Û+÷÷qÒ£Q›ƒºÞI)ćz4Ô45÷ñùúÚZw^™‡š´ÁH(™ŒQ!„äÏàIŽƒšŠ²®¦Æ»ÿ^D,¡Óhäæ|ô+²Ù$ƒ/Nò-%%‰WÛÐÚ ö%}|p–Å Ó›ÚnÝ0†Øqàˆ©¢,«­l;|"½°déôðe‘Ë"#2‹Jž»˜œW(Ãʺ’ððµy³føû“œIÎ 2É›bŠ×ÖÑ©£¡n¢§SÝØ,\¨•¸¸bÓ‹ „K$›ðˆÉäo±qðÖ’ùÂ%s'¸Û²«ê)Þ )¯Œ#&{ºŠ¹o„ëã§K=êìú ÙVÖíԌڦæHŸ¹¡#2É!-&•RHéVJF~y…­¹é'¯­ŸR|ƒ¡Ò䤩¬AÖΟíÂfUÕ7R¼J¤¦± ]‡KðkÌm¾@0?,xÕìHÉŒYmJê1èz¤¡¦iŠ)«Î+Í@JÚ`HEšY!„œyìt5?g{È-ã J”^TÂç üœÿù‚©oiK/,v·eÇØSXQE§ÑXækwï'¦Câ\¹Ÿäʶú×”ÉG?z»ª¡±¦±™FS07п—‘½ëøiêYñp½óí¾¢Êj%%[sS@°ëøÏ¢ H3 ±ÉikæÎ\áaÇîíã[›¿±÷kÑÃ9|!zН'±òN¥ž–¦ðîa#"&“Ññ\§ûûÜ>´/·Œ£­Îd›™p{z¶>NqåÍí¿Ü¸½42âø¶-ùå•í]\mMÑa^n{ß|­¾¹µ¼¶®©­©ªìigÃ`Ð/\'Ȫ²àäå˜wW,öwqLÉ+>¶H‹IZ ™øÏÿ}èý·fø¹9g—ööñ¢I¥ÁPirWá…; ³kSc õNn÷Ö喝‡ßoÇ/™èjcUXQ%ôk¨©ª)+÷ øDN]ÃŽÃ?n[½|ÝÂ9+fNåÔÕ·wq™*ÊlS“µ»÷g•”‘&³Ú”&ÔcÐõHCM1ÒSVWš”´Áj̾PBÉ:è›ÿy)jšµ©ñ—®Ö6· Jæínn¨›œÖò÷m:ã3²õ´4uuXƆꪪY%ewR3Û:»„“ >3'·Œ£¦¬lª¯gad ¤¨XÛÔŸ™“_^ ¯Î‹*«©ý±Ÿðê¼(Nmýõ)6f&aÞîiÅÍ:¬M4™jé…%ŸûéÉûª‘f²µ£³¤ºÆÊÄÈÊÔX]U%§´üNZf[G'±‰ÌâRáõñ°rÖtMá‡K׈ÏÞŒÓbª™ê[vóxwÓ³mÍMÛ::á͸Äd269½­£ÓÌ@ÏÎÂlƒ—–ùá¡c…ÕÔW~?;¯›Ç3ÑÓe™šê+((pªn¥d¤äÿõL=PÐ×Ö´04p`™ëjh”V×ÿóÚ^û'{b+‹ºœ2΂ð`¥‰ßž¿TTùXH‹IZ ?'{w[ë[)éƒÖL]{÷çk±'L0ÐѲ³037Ôïàrïgå^IHj¹*l¸CZ›¯Ì™ÑÇüxù±Å:²,‚Ý]’ó Ò ŠI‹Ùß߯¡ªªª¬debdª¯ËíáÅ&§mýî‡âªŠeììî¾™”JS ±Œ YV&FLUåÊúƤÜ|á}‹«jn&¥Ñi4 u3=#]m:ž^X’˜Kô}ÒRÖ&•®G±w?Å®G%Ô¤­Z|&©J|ç•a¨Åô ñ †J ¤üBA!ô|R'ßg.Ó3|w¾¶ò§+1_‰<Èâ©37Ð?¿g‡˜k¢“;Þ3ÒÕ™²þ] B!„=4 ¬¼2g<Ì)ÀP iÅÌ©,‹“Wn`(B!„Шb`$3Õ×sÇšE•Õm]ŠÄÕ M‡/DcpD}µi]WO·…¡½¥ybV.ñHu„B!„FNr$ÔÒÞ‘ZPlmbdkn:00PÓÔ|9þÁþ_~ÇÈ bid §¥ÑÐÚvôËßžÿ‚B!„FÛ3yMB!„B ¯ÉA!„BÉœä „B!„ä NrB²d¬«cª¯‹q@!„ÐS$á$ÇËÞ&æÀžc½#Ágý]/ùÙ[‹çç¸|÷þÆä….Ö¬gºš½ìm’OüîýÏb]HÀÜ@?ùÄÁ#l–8¶(ÉÌð÷¹¸ï“…á!rYº'»Õˆ†Á3Ÿ~øÛçÛÆa&‡B¡g—„wW›à«ÉTÓdªyØ²Ó ‹‡Lcn Ï ÓKkj-×bªékk)((Œç¸ÜÏÊ­mn/{#]1)‡+¦ I¿‰þþà ×iÆ Ô…{»À­”t‰Èw‹=ã¼_Ë•aPṲ̂¶©yfr¸ñ!„zvIx$çnz6_ ¨ªoî[3ØÝùüž‹§N~Fãr<úúŽ#?î8òcnGL²1(¦L6ÑÓÛ ÜÞ(åa¼Uw˜·;ÜNÉ8·¨ñ`QÄ䳟o½ÿý×·íûæõÞ¶Ïb)H‡Ág%“RŽ!„Ð8$ᑜ۩“VmÀð=+:¹ÝÐÞÅ} k ­åȲÈ/¯¨æWsÒhTý÷ÕS|=ëš[b“Ò´Ô™~Nö~Nöï~sävjƳUgb¤’Éçj|@!ôœÀ‡>* ¥½ýy(l˜—›‚‚‚˜SÑH Ñäæ<Å×39¯àõÿî'–¸8~ýÎúÕ/Ìxæ&98> „BãÖc“œµóg¿òÂŒ7ö|ý 'Pº/7® rwöYùÆwïoô9·äå{³JÊDSn\2ùŒ)Äëá! D.AŽÚôa}K+ñ:ØÝ%ÄÃÕPG‹×Û—QTzèÜÅ‚Š**9¶0Ô?·{GSۣȷþ#º<ÄÝå‹Mko&¥¾wà(„{»¯Œš®¯­©¡¦ÊëíËçTœˆ¾q?+—H¼tzøæ¥ ÷Ÿ9òJŒp É'Ö65Ï~{+•lP,¦xâ3)“Mur»ZÛ†|KL]æ4µ‰ÑªÙ‘vf†:Ú :½¥½=!3÷Ø¥«ƒŽ¢l\2ŸÈÃÀÀ@kGguCÓá Ñ)ùE¢ifønY¾ˆSW¿rç^±Qõ€[ÉÈS‹€…áÁóBƒ, ¸=¼Äì¼£\æÔ5oɰ.¨³37ýöýt:íµÏ¿Ê+¯pdYÀÕûI Y¹µMÍV&F2̃‰žN¸·‡·ƒ­™¾¶:SIQ‘Ë멨k¶ñµI(ÒaP8¢ÎðÑÑP¯oiKÉ/²Ò¥)¦øRPÌ$éø€B=‹›ä_ršLµ'Ói2ÕZÛ; .-³¼¶üœL ôžL™Y\z.6ÎPG;Ð͹´º6­àŸ/lÑ53=N]}FQ©™^»³‡õòmÿ%~PS×P\YÍ63±37{¸€ÈUªJaWT¢IDATJýýE%=¼^Cmo;/{ÛM_º—‘-“ÀQ,¦xâ3)“M½üÉÞ²šº!ßS¤y M`ga6Ýß§“ÛSZÎØf&sCÃ}ÜߨóM^y‘òø¶-ÎÖ¬ªúÆ»éY}|®¦º¹Maðc¯Îbªª8[³„Ÿ»uwȲh1ÕÜl¬8µõÃ]¸ÿdyjQŸ½þ2íÔ‚b õ¾!.þw ³¸L¶uAÛÔøÿÞ}sƒ±ù«CÄ&Ú:;ÀÁÒâ‘JaV54 ÿ•>óÂ_ŠšÖßß_^[Ÿ]RÆëëÓPSUdL ØõHE: À‘6{رۻºR е˜jóBƒ%¾˜âKA%“TÆ„BèY4h’Ó ºêÄ¿K§‡UT'倞–±§uêZ,ñîîõ«‡üâŒMNMNvwtsN+(úüÄ™!7|úú­/OŸ#^ôò²¹¡ËgD —x›Iil3“@7çÇvIÝ]øÁ•¿!¾t/ñÒ½Dá»Q~¿úÒ‹ÓÂdµKJ±˜â‰Ï¤L6!$fFL]æb&/܉ÿêÌyâõ†Es_ŠšöÙÚUóßÛŽ, gkVzañêϾ_„êÆ&¢ÉU ?õr£ÓébNE2|´¨¨@¿éþ>œÊeÛ>'–¼°õ•å;_]9÷Ýí²­ *ìÌM¿yg=ƒNë‹ÿˆy˜ºvþ칓ÊkëN_¿Ÿ¼¶Ri¢âáߣ‰2ÌÃÇGOFÇ? 
ë‘Št|qZ˜‡;¯Œóﻉ%®lÖ±­[„ dRLñ¥ Í$Åñ!„z=ö«aMc3èkk€­¹éæ¥ _Ÿ?‹xKWCCô§Vé _»tˆóX¨ˆIJ…¿ïLðv°ÕÕÔHÈÌî#Ññzx½Fã¹2žV&¥© |óë…ÜR޹¡~€‹#ôöõ€š²2é·~wüàoßûæÈ“§S …y¹À­äô%µ ,öÿ½_Ä%¤›èº:ɶ.HMrv8üÁ¦þ×>ÿRôü«–öŽ ÿ;Ðü¨ýíeÿŠ9°çüî3|÷œü…¨ÙæA´aKÙõŠTT ;â`š ‹ùlr!„ÐØxìHNIu-ëê@°»38[[€¡¶ƒA¯¬o¥LÔ45÷ñùÄ䊊²šºÒêZ{Ks=ÝêÆ&˜æçŠü¨©£¡¾djh¨—›¡Žvo¿¾¥•A§k¨©Ž«èÃLŽ´.$s/#ÛÑÊÂ…ÍJÈÊ-®ª¹ÿ`f ßo÷]¸“p%áa>§rÈOµ´ws01|íê›[s†¿Mó ä£E9²,ø|ÁÃÜÑ…‰Ù¹vl6+~˜ ›du!Þ$g‡%ÓÂj›š×ï=@ÄS”•‰‘–:óAv^UCS›3,žš”[@LUdÞõDEº~3“>>?µ`Ø»6ˤ˜ÏÄ(‡B=ƒï®VßÜj¤« a^îé…Åî¶ì™¾uÍ-P1j“àõö©(M¤ž>æaê«ó¢¦Oò&öz§úz=êìŠýûçymuæ‘6›ê—×Ô]½ŸÔÛ×g¨£mýø•ÍOݸÍäHëBĉ‘êªím;|"½°déôðe‘Ë"#2‹Jž»˜œW8ÒÕÎ ðÀ`ˆ¹I—˜rТ zSÛ£A ›µSEy,ëÂXO‡A§«()iª©šäxÙÛ|´jÙ‡)úX²bæÔ7ÏûrãZá9u²j²ízƒ%No}¢.‘²˜ÏD›D!„ž–Á“œ¢Êj7+S}]{Kó7ÿw`í‚ÙáÞqi™PRõØ•ÜâÏ!¡=z¢ã×Ì9#ÀçØ¥«Ó'y3UUN]½)|wñ”PsCýè{¶9!\s`ÏDÅ¿®<æöô€’¢¢”Ù¦˜¤™›Hʤ˜#ͤ“ ?ƒpþö½ó·ï¹²Y³‚&Í ø¿->=vJôz*þ:MÜÍ£‡M -ŠÏhª ¾kˆŽ†:tˆ„z êâü­{­o,|á»ÿlÚüÕ!уK/N §ÑhÂ+pàÇË7Ül¬&{ºyز…¬”I{²ë‘JÌ0(´†ºƒËà@IQLê¥xúCB!4Ößɧ€SÉTUyqZXË£ö„¬ÜëR\]جþþ~áý—í]]0ÜOþ\.¨©(R¾«›³óXÆFî6Ö ÃC~»'|×D_n§¦÷ñ¦¶v ròzGW7(OT”y1I396‘¤‚4#ͤ£T >?'³¸l×ñÓ+vì¦ÑhëÍô®¶:sÕìÈoáVëïâØÖÑ)æ·p1 ž¡5\òÊ+ º¯£Ý "@vIù×ʼnèÿù¿ïi4…/6®Í’®¦:ïjM ®TWU‘m¤ìzT5Ü0˜W^A§Ó'{¸RY³dŤ^ ñc5‡!„zZOr +ª`^hеÄd¸–˜Ì Ó£ý*êŸ«ÖØú†» ™¸›Ÿ“ÃèeýìÍ;°yéB;vBfŽèí§k› ÐÕy¸Ïæ•qÁdOW[sSñ[!n«èæ,ób’frÌ"IŠ4¤ Ì õ…¯çûû:Ù·¶wOse³D?êìúƒ3~òÚÊu çìÞ°ÆÏÉþÉM„{»+MT$Ž:‰4Á³Ò¢†‹Ão±qðÖ’ùÂ%s'¸Û²«êEo6uA¸™œöÎþï úž ¯²M‰…Äáµóg “hkMõó(®ª‘y¤ézâ%~<{Þ^¶PÌùcR“b)HÇj*$5B!ô´ þB%&9Š&§L4´¶%fç¸:å”–J›œ¶fîÌ¥‘vìÞ>¾µ©ñ{¿Î)ýëzîú–6â’ž˜{ +ªè4šË|íîýÂÒ‹KË*¯©s´²Ñ'0Àï·ã„‡Ì tµ±*¬¨ú5ÔTÕ”•û|"As{Ç/7n/Œ8¾mK~ye{×@[sÈ­ÜLJ]ýÂŒe‘®l«þþ~“4“cIR¤y MâázçÛ}E•Õ*JJ¶æ¦`×ñŸ‰·Â¼Üö¾ùZ}skym]S[;SUÙÓΆÁ _¸?(&zºÄ S}½'o°&͹jÏV‹.Ññ\§ûûÜ>´/·Œ£­Îd›™p{z¶>.š1¨ ¡„¬ÜüøÉë/µyݬÍÀÉ+7"|<^yaÆdO×|N¥ª’’Ÿ³½Š’Òo7㈫wdž‰»ž˜@‘ƒ—î%ú9;DúûœúäƒâÊê¶ŽN¿oÍ/«PS,éXM…Ä¡F!„ž:蛈þßÞÅ]žWVñß׈%]Ý=Óý}.Æ%d?>Ïiíè,©®±21²25VWUÉ)-¿“–ÙÖÑ)LŸ‘­§¥i¬«Ã26TWUÍ*)»“šÙÖÙecfæížY\š˜'L¼rÖtMá‡K×F”{^_ßdO×ÜRÎÿývQtygw÷ͤTšelèȲ°21bª*WÖ7&åæÇ¥eiîgçuóx&zº,C3C}…NÕ­”ŒAoíè,¯­³65²a1I3O%“Rn‚õº ̓øêN+(n~Ôamj¤ÉTK/,ùäØO¢·¨R}mM C–¹®†Fiuíñ?¯ [ hãôv°-ª¬úüÄé'˲}õ¿ùÁÇGOWXÒÏJ‹‡Øäô¶ŽN3=; ³ F\Z懇ŽVT‹V÷¨ÖÅ“-ª¸ªÆ@[ËÛÁÖÎÂôúƒ”¶Î®›IiL+Ck+c=²šº.]=|!Z†íÁÏÉÞÝÖúVJzQeµ]J H‡ÁØäôú–V=M +c3=ƒQ\U}3)Mؤ,&õ„t¬&%¾ë!„Bã8ùbÐè™à»óµ•?]‰ùJäù-2àâøõ;ëo£¨ÔÌ@/ÈÝÙÃÎzù¶ÿV64R©,3“¨ ¿NnwYM·§ÇÌ@o²§›Ÿ“Ê»‰KÆIó—–Y^[~N¦z’59ñ¢Ò6yäƒÍvìö®®Ô‚b-¦Ú¼Ð ê¡¡µ 4™jO¾¥ÉTkmï ™Ô¦œº†âÊj¶™‰¹©è¼(ØÃn§dȤMʪÑÀÀ€ ¿z„Bèy3x’s?+7½°ØÝ–ý寵_9Ç©kéuuv­[5ÁØwêìéë·$H0ÅÇsaDH^gã›Û;ÀÁÒü臛7.Y@ìP:²,œ­Yé…Å«?ûbÈ<&ˆMNMNvwtsN+(z*—hÇgd¿¹x^€«œ:+\èÂfµutæ”r(®äÒ½ÄK÷…ÿFú}üêK/N £¸CI1§¯ßúòô9âõG//›¸|F‘˜´²îÄuæ<ñzËò-ž¶"jêŽ#?RÉék±Ä‹ÝëW¹sOÚ¢HEšÒb¾8-ÌÃŽWÆù÷ŽÝÄG\Ù¬c[·P¬Ç†ÖVÐÕP'þ]:=¼¨¢:)¯ô´4ˆ}wÒ8H_›¤n&¥±ÍLÝœ›ä¸»ð‚+÷“dÒ&eÕh€/@Ÿ=B!ô¼âa «?ûâJÂà wçßþ»ýë·ßpqÑO ¨¨økÌáv7I,™ ;¿ÿ‰Ø›€¼òŠ«÷“ÔT”] ·¯Ô”•‡Ëi‚ñ ¤º¶¼¦ÎÜPŸ8E\¬YªÊÊŸ8a‰ºèø=¼^ #Ùfu€øIŽ]º Ž, Š•õ¤Ñ7ÀÍÆJVÙ#mQÒŠ´˜Q~ œ<@fñžîRÓØ úÚZ`knºyéÂ×çÏ"ÞÒÕШjh›Ú$“” áÞîÂ%Þ¶ºš ™9cÜ&©àöô@Ww „Bè93ô-¤·~wüÈ…Ë ÂƒgMúúõ÷³r·}w\ô*1ˆ×+÷JœÀÙŠÕÇçÏ ö]È263=(®ª¹ÿ`f ßo÷]¸“p%áa>§R41i‚qâjbÒëóg‡{»Wàx;ØÂHÎU õ%SCC½Ü u´{ûøõ-­ :]CMuôò\ÓÔÜÇç»ãT*ëI ­m¼Þ^-&SVY"mQÒŠ´˜6f&}|~j„··.©®c]vwgkK0ÔÖb0è•õ 0:Õ&©²šºÒêZ{Ks=ÝêÆ&˜æçŠºû69œÖöNhí蟃/B!„Fϰ­¨oøòô¹/OŸûlíËÓ'ùìX³â­/RY£ºª*ˆ½eiƒ/Nò-%%âŶÃ'Ò K–N_±,2"³¨ô๋Éyÿ\êMš`<¸’ððµy³føû“œIÎ0’I޶:óÈ›Í õËkê®ÞOêíë3ÔѶ61ílózûT”&R¯¬'õñt:MVù!mQÒŠ´˜t:½UºÛôÕ7·éj@˜—;qÊèÌߺæ¢3ŽMmRó0õÕyQÓ'yG¦úz=êìŠMNºmrHÄ…FMmíãª×#„Bh 0HS|xèkã@7gS}ݪ†&â %EÅáÒwr»@[]½¾¥M²|¾ ›Ç [÷ŽøŒ¿}ïüí{®lÖ¬ IsBþoˆO½€4¤Å”‰êÆæ‡9ù~ÎŽ,‹Ü2Ž«U^‡¸ŠÅSBÍ õ£ï=Ø~ä„paÌ='Œ¨ýýbe‘¢’‡ ›i Hó@ZL@ 5Ôm¨+ª¬v³±2Õ×µ·4óÖ.˜îí—– %UµTâ }mRŸ¸fîÌ>Ç.]>É›©ªrêêM™·I™4Øwêìxûi!„Bc€Ò¯éµM- ­Î„¿s†=q[Ûy¡'È+¯`ªªø:ÚQÉ[fqÙ®ã§WìØM£ÑÖ/š+A‚!‘S¨£«”'*J–àìÍ8Xáí1Á¸ñ0•zå™èëÀíÔt)KÑÁ倚Š$—0¨²¤ÌC{W5«°¢ æ…]KL€k‰É :=*Яâ‰;ék“¢³7ïü{gUÙ=ðÃÌ€ìÛ0ÃÀ 
þïÈ"«¨(jn¹§f¶hõ¶½ÕÛffe‘YYþ´¬L³²4R\‘@}ß÷}“†~\&”¹—™AQÏ÷Ãw.gÎ=÷<ç<ó<÷Y.¼¼f¹§½MJnôöÓÊŠIÒÄ¡r››Fžÿ*údôöû¬VFADaÆ6úWF„®šÓÐÚ^ÑÐ(bè¹ÙXÑh´ƒ%:z¿]LX~hÛ«ÅÕu=}ýƆúÒjš[·øqÛæu[—/Z?vMsKO_¿Ž¦† »%zO^E©ÀÙ«én6VFÌüîíWê[ÛÛ:h4¾1;)'ç¡£0kºû®ÿ<ÝÒÑUÝÔÜÞÝ££¥áeoË`Ðc.$6 Bz›.¥gn~dÞÚÈp7«‘‘kžé³»¾”ÞšTàÈ™¸×Ö¯ôwuº^TÚØÞA½ðþHH^²84ÐÍÖª´¶^,ÑÓÖÒÖÐOè.Z:»‰E q{?)­­§ÓhŽ–ü-Ñ{¨ìdMZX¡bC|FÖ“‹ç¯‰ ÷´·–x’4¢HEjémžJJõsqŒô÷ùùý7Ë뺽Ì[ûAS„è䨩ª3*[»ºSó‹Üœ *«ÇHŽçÅK“"—³òª›¬Ì@úžJŒIÒÄ¡r›!ž®Àç°‰é XÝ#‚ ÈÃØ\éÏõ­m¢A–žƒ9ߎÏÕTWÏ«¨Ú{<æ·¸D‰ÌÕü"¡HÄeYr9f¶ŠŠJIMýß×s®ß|+_y}ã¥ô,:ÆÔÓ53f™ÒiôìÒŠÔüÂÎä܂ªm ›enb¬®¦ÖÔÞ™œ[P\}s“4Paê›sŒ-ùFzz• M‡NŸÿáôy‰‘¤`ÎaGúûUÕÜñ ¤·IÐ%è­nj¶æ™XóLuµ4 *«³r»¥v¢#(¨ªY¬©>íë“§&4µ¦W(¼”žIS¡Yšrœ,Í­¸&:Zu-mé…Å’ŒR¼‹äœ|–¾©ÓÒ”£«¥•WQ•˜™ÛÝÛgkÆåí‘[^)½ÂÆsi4•NÝt¦ì¢¢A¶ Òž¬hh´âšXÝæIÙEÅQ¤6Æd|FvKgK_ÏŠkjfÌb0åõ —ҳƸz«¤üpì¬ëAä¡Bœ}Ñ ÷œ#Û_71bF<÷ºAAA„†.¸ç¬Ÿ?ÛÑÒüÈÙ‹è AAQºà^ñÅK[û„æc ~j^ááXìä ‚ ‚ ˆÀNÎ=ÃÂĘe ×ÚÕýÝŸg¾>y‚ ‚ ‚ J×ä ‚ ‚ ò@krAAy ÀN‚ ‚ ‚ ØÉAAAä7€ÇæEÌ ð屇†Û:Ò K¾:Cú­oþ÷âtG;ÉÇÇwìÊ«¨š2©üúÁ[ :}ù;ÐFµ"Lw°~îÉÚæÖM|:Ñïb5ˆ ‚Ü+öNÎK«—­ ŠD…¥***\63*ÈJ'çj^aSG'Lw°51bÊ!€ŒoÌfÐé•MJÑfcÆmj︷6 ˆr‘/ª ûù¾ú:Úú:Úžv6Y¥åÒŒÕ ‚ r¯¸s'ç‘€%¡<6KcšZ[÷œÒÊã—ó+«¼û_Ú'®~{g㛇b/ÑÏm¾ã7©"M°‡Ëç/m=ù£Ã¿>Ì6<0¼²fùê¹aðè;ª›‰“Ól¿yã¥ß/]þøÇ±Î8¼/³¸ì©>WV$[Ã¿é ƒý­Ì5ÕÕ;oô¤–ˆ‰mhkÇÔ»+ÙùQA3šÛ;'ÚÃ¥Vƒ„׋JŸþø Ì/A„”;tr¢ŸÛîã%‰2‹ËFGÇ6šè[ÝÔüàur¦;Ø2ôäÜÒÆIx8Š 3‘þ>IÙùîÎóü}ö8EœŠ€A§fêê@ÿ€ˆø¨x$[ƒ›µã©}%5õb±ØÁ‚ä7ÃÕqÕ[t z±ìÆ™3cÓóèAäþbl'ÇÑ‚îãUÛÜúÄŸJ~ï¹,fCÛØ И6 Ú»o` ˆ ñp5ÐÕ9tú¼¦ú´H©N΀hTc«âLÿÀ(£ "ÕP×ÚöäÎϤ‡ìý'ûy¾¿œÇâCA€±­ KS\ÉΓ~¢)ݼX37ìå5Ë÷üzòÈÙ8ÉÉŒÃûšÚ;¾òŽä —Å\37ÜÏÙÃ4TQ.AoaeÍk{¿¥.0ÇoúŠˆ™æüQ-©©;r&.1+WÚÔW- ñtã0 FGG»½ ­íbb¯—QxqÕÒuó"±ea!ËÂBˆãŽ=sÿó?ê·©Ä’Ü—¿Ø9+OrrÓÂÈ­Ëí?ñ×÷£¨gihÐÒYA&:Ö-è­njIÍ/<{‘Š'm͸ë"#ìø\#}=-Í>á@viù?bKjë¥/!»,¸,f˜·§·£™1ÛPWG]M­_4PÛܺqÇ.B ÌÛccÔ\¶¡¾ž¶–hp¨¸¦öpìÅ«y…’bºcY@ÔKoµtvQŒ‡-KÎ ðaêé¶tv_/.PY(nÃ{O®Ÿîhg £ÝÐÖñËùKD;[(}ÿ×٘İæšlZionÆa2èôΞž”܃§ÎE\¼$4ÈÂĸ@”š_ôÝŸgjš[•åjR ʊɨ ¿Öή첊¸´Ì×Ö¯t·µÊ)«¡H ÆØM :ÜÉ!­‚H¡¢aÌò÷¿¯gû8ÙëiiM(ldWb¤…¥”¨V$»IÞʶ¤!G1eWæ‚ 2QÆvrˆf‹•…"Jgzº}¸e“ú4µž¾¾ÒÚzA?SO×ËÁ–ºÀÿÖ¯ZÒÛ/¼^\Ê Ó½lw¿øÌ"Ú‹phÛ«.Ö–õ-mW²ó††ÅFúºö|3šÊ?'Ù¹å•'â/s˜†î.• MY%7M{ûîšë“s ¦;Úù:9H7(½ìm 5¿ˆ¢’žy<ÒßG(å”U ‹Å<6ËÇÉžÇ6’trd{ÒÖŒä×Û/¬jlî03fÍôr÷sv\¿=Z² ™´,–Î Þ5gdd¤º©%¿¢J44¤§­¥ÆP•©¥®>2:’SV1 ä0 ½í§;ؽôùþ¤œ|e’Þ© ß¾ù²§½MO__fI¹Žö’Р •…â6DÍèôÆ¥e:[Y¼½iÝÐðð¥ô,;ë·7­«jlÎ)«´77›ëïÓÛ/,¨¬‹m̸‹CÃ|<žýä«¢êZâ>ó8!“YRÎÔÓàâéúü§{sË«”âjR J‰Itsùór $dæ¼¶~e˜·'ÑÉ!Ö ß>]Éé •RÉ¡!ÌÛÒ ‹•XË‘–âQ­`v“†ýå¬Üê¦fðsvä³îèÒ#…´2'ŒàÏ6‚ B…±œ«y…Ù¥åv6Ÿ¿¸å‹_OH?E¦ˆ©sçÖMª ÆîŸ½ð·>^ËÃCŠªj^ül_G-øß½õò‹«–?ÞN–æ.Ö–Ù¥å›?üìŽ6 ÄgdÇgd{¸º»d•”Ý“…æÉ9ùÿY¹$ÀÍ~>.9éjcÙ-è-¨¬¡¢až¿O¤¿OeCÓ–è=7zˆ“‡÷Q÷$ALbò¿ž$Ž_]÷èÊÙ³ÖGÍÞþíÔ5À{߉M¾vG;O%¥žJJ•|Œ ô{ï© «çÌ"šA¤eAjÃê9³<ímŠªjÛM|ÅÍÆòà;¯R/ Åm€ëÅ¥ï~û£5×ä·ïœºruç¡£ÎVæ‡ß}]ÒÄãêçW,Þ5çÃ-›–¾¾pË\Ÿ’šºµÛ>" xç‰u;žÚ¸øµw•âjR ŠÇ$„y{¨OS»œ• ­]Ýeµõ¡^îŸ=ñO¥C§môüŠÅyåU?»Dœô÷+¥ ¢®á¥ÕËôµµ}íu45w9F}耴#uµâQ­xv“†ýÏ·&ïE?·y¼NÅÒºZ°X CÃÃ÷ÿÏ.‚ r7¸ÃË@7øÙÙ”´ —ß?~÷ËWž pu𯕳C§©©‹Kï·ŸT`ÕœPØñýOÄ/7Uמ»š®­©èæ ƒCC ­¡1ž ¤SІ¦êÆf>‡MÌ®WkK- ´ª“—‡…ÀÞã1’ÎD=y;Ä»­ÕD5ŒZ)›|m@4hnbLQžÔ†¨@?nFK†>”u?T44€®–¦áu~u,¦°²†Ïa)¶lV0ì¹Õ€?/§d•”óŒYJwõxI÷ñì ¯ÝúJJ^!—mdÇçûˆq›9~Þá>^›E­‘AŸP)Uu k#ã‚üXú‚þþÁA%Ör¤®V<ªÏîÉ`¢ÙM½®&–lõ ïÞ`;‚ r_sç-¤ßùæÐ·1g–…/šñ埻šW¸í›C÷"~AÏ^M“[ÀÅÊrhxxQ°¿ôIKS03f@y}ã™äkóý¿Þ“˜r6%­¸¦NZ˜T`Šp.5ý™¥ ü=ˆÕÞŽv0‘yAÎÖ###Ò3‹&êÉÛiíê èèÈ­áv˜zº«f‡†Nwç0 ‡†[:»tºž6Õõ¤6Øšq‡†‡3KÊaÒ˜¨TTþ9¦Óiã©MÊÉw²2wµ±LÉ+t²4§–H ¤æzÚÛ¸ÚX&çÜW+“èær-ÿŸNQJnÁ†¨9!ž®¥µõ èã6aÞÉ9ùî.®NÄD5b$Gñ*ˆºï [ üœžZõÎë´4Ô)n<@Z‰Ý…¨V<»•‚‚!G½®îêé€.` Öä‚ ÈdÜ—Ö¶´~~ôÄçGO|¸åñ¹3|¶?¹þ…ÏöQÑH<Ö±e©± ™xÉÆ4ÕÕ‰ƒmg—V¬™¶62|mdxnYå¾eý³0—T`*p6%íé% æùû Ê.Ž0‘%ƒN—½5OÞÎаXÒ.—Oƒ4†º:ß¾ù2ŸÃ®nl>w5}phˆÃ4´æšL FÉl Óé]“¼Ežâ~¸#­]]’Œ`0îPšÄŽ&¥1IÅ]­xLú8Úkkjéë½öØ â F€ w—ïþ< ݽ½ª —eä`Áö“/™zº¾^Ä„®ÎRª  ihéìúëÊÕëÅeÇv¾ýÌÒ;9¤•Ø]ˆjų[q”rëêºÖ6hïî™Ru8‚ 2eaJ¼µÿk®i » mTßÚNÌPWSO¾·_†ºº-Ýò ‹…"Ѭ­ÿ•mØÉ„¤“ In6– ‚f, ø¿WŸÿààÏÒ³ÃId@z›J¡¡­#­ ØÏÅÑÉÒ¼°ªÆÍÖª¨ª¦µ«›ºÙRôä¤jXÊç°c“®½ûíaÉɸ½ŸLSS••Û±Xl 
£­xq(bƒ4õ­m›¿Ä3u"#†‡ÅúÚcï‚©§ ‚~¡]=©1äán¶VnÿžådiNt z5ÕÕü=}ý× Š3²×Í KË„[VAr”,©††¶ö²ºkK36‹hLˆ´S<³H£ZñÜ$ { ãMTJÈåºz÷Ïǧڃ*AdÊBé‘^S{'êêÀ­i2¦\{Œ. ”[ ¨ºVGKÓ×ÉžŠm¹åU;]¿=šF£=·b±w„ô6% 4¦©É'püÒeX6+(ÜÛS•Á¸˜–I½ðª›µ55œ­Ì•âÉIÒÀe@Bf6‰ûû@ûNC¤6U×Òéô™žn &ƒ"6H³øÕwwýtœŠ¤“=”ÔÖ—`0èc.áïêùÕJt5)ŠÄd ›s}k›÷†­ÒŸÿò;F›à 7Grèá>žW²ó >#KWK+ÐÝ:oôP©‚äƒT‡i¢¡!*ÚH+1Å3‹4ªÏMÒ°—ÐÓךêÓä9Òz(ÔÕR3ÊêAA(0¶“cl¨ïhÁ—>cÇçy9؈ÅbbÙkQUX,žéå&YI<†cq ÃbñÒYÁ›FÊ-ïn^ïak=žÝn6–Òoôö‰Å#jR/$ émJ r­492sšÚ;"ý}‡@ÜD”®]€—V/“œÓá¡âIÙ(®¡±­Ý\d‹[Öú9;ÊaÉø+ðÊÚå'£D(Âç°%Ç‹‚ý}ºzñÙð{üexaÕR‰Àâ™v6õ-m7ª¢èjRäŽI¦¡…)§à¶.±ƒŸ³´w÷p˜†ÎV ™9PÝÔRÕØ4Û×K,·ßè¡R‘Bª©«3f䙥 ŒôõÊë(ŽY‘VbŠgiT++&e„½„¶®pû~ÔCNF5H±®Þ´0òüWÑ'£·‚ ‚P`ìÉʈÐõQsZÛ+EƒC,=7+v𯳄@Gà·‹ k"Ãm{µ¸º®§¯ßØP_ZCMsëö?nÛ¼nëòEëçÏ®inééë×ÑÔ°áq·DïÉ«¨"8{5ÝÍÆêш™ß½ýJ}k[c[¦Â7f'åäï¯[ÐûÛÅB@¶'m͸³¼=rË+¥W–o\0—FSùáÔy*ÀÏÙÁÃÎúïëÙw4¾W(¼”žIS¡Yšrœ,Í­¸&:Zu-mé…Åcö…KÎÉgè›1-M9ºZZyU‰™¹Ý½}TlˆÏÈnéìbéëYqMÍŒY £¼¾áRzÖD_š.· O-‰ªjlº}Ðã©%Q5M-®]'\URÞqC`Í3Ñ×ÑÎ.­xÿàO)Rï†ÏÈîôš³ìÍÍTŒËY¹oí?XZûWwµl Æä†¨9Ö<ÓNkºÕc‘0ËÛƒÏaÇgdÑh*sü¼“só‰qHôõ/ ®lh"ÞJ¥ ’ ©š MMU•Ç6²ãó8LÃîÞ¾¸´ÌmM(ûH+1RW+ÕŠg7iØK*±Š†F+®‰Õ¿+1êÙ-£¤RWÀ°XìíhŸUR~8öþr#‚ ¤¨€³/zážsdûë&F̈ç^SPߘ}ò“íTÞ¬‡ÜeæøîxzãOgã¾zÎÓ‚ ‚ wºàž³~þlGKó#g/*®ê‰Eó ­ ½ŠL‘˜DA¹û0Ð÷Š/^ÚÚ7 4ç;XðSó ‰—‘OˆÙ¾^ÛŸ\_V×Ð-èSSeXóL™zº ­íbbѽÈ=‰IAA©vrî&Æ,½Ö®îïþ<óõÉÓrhèìd–”[sMìø¼ÑÑÑÆöŽ3É×öüöú¹W1‰ ‚ 2À59‚ ‚ ‚½W”Åcó"æøòØFƒCÃmé…%_Áйßùõƒ·túò7vÜÍ‹*ÕßüïÅéŽv’ïØ•WQ…åˆ ‚ r;“5’3?ÀW_GÛÍÖÊÓÎf<¾1ÛÊÔD w—V/{aÕR3cÖõ¢²üŠj uµ¨ ?Œ›3î45U¥«¼¨¾šWx:)õtRjS{Çd;Gö] èIA™âük$gºƒí7o¼ôû¥Ëÿøë¹ŒÃû2‹ËžúèsŠz¯dçGÍhnïÌ*-¿£@°‡Ëç/m=ù£Ã¿Ê§áî°2"´O(\ýöÎÆÉoYŽÇ+k–¯ž¾±£ª±y¢…õHHÀ’Ð@›¥1M­­ûFNiåñK‰ù•ÕÔ ­aŽßô…ÁþÎVæšêê7zÒ KÄÄ6´µ?„é4©Q}(öqýÜf#æ=¼ ä¡ò¤âÈýU^׋JŸþø Œ^Aûur„¢A`Ðéc„˜º:Ð? ¢®7!3gƦç±Lq Š3ÝÁ–Á 'ç–ÞÃDúû$eçº;Ïó÷ÙwâÔ„ +ú¹Íá>^B‘(³¸ltxl£ù¾ÕMÍÔÛ(²5˜±Y;žÚØ70PRS/‹,øQA~3\W½õA— l F5‚PGñ AAî ÿêä ˆ@•1v¡q¦`àaóŽÆ´iÐÞ}ãÚâáj «sèôyMõi‘R*…åhÁ÷ñªmn}âƒO%].‹ÙÐFµÏFª¡®µíÉŸI/Ùÿú >Nöó|9 † ÷/ŠW ‚ r¯3’#cìBƒRƒaÞ£æ² õõ´µDƒCÅ5µ‡c/^Í+$þKº<úÅUK×Í‹ Ž—……, ‘ü+ꥷZ:»H5Øšq×EFØñ¹Fúz:Zš}ÂìÒòÄ–ÔÖK‹- Z:+È„C§Óº½ÕM-©ù…‡c/RñËxFvÜè™ûŸÿIÄ–‡/ ²01î¥æ}÷癚æVɹ,f˜·§·£™1ÛPWG]M­_4PÛܺqÇ.ê%ä×ÚÙ•]V—–ùÚú•î¶V9e• ËÒ”W²ó¤U&Ô@¡¢aLéü}=ÛÇÉ^OKkBÈe1×Ì ÷svà0 UT KÐ[XYóÚÞo©{rŽßô3Ìù£0ZRSwäL\bV®ô%¶,]8/À‡©§ÛÒÙ}½¸ôvdh 9Å£š4³(ò⪥!žn¦Áèèh— ·¡µý@Lìõâ2EÂ^rTÂ^6Ö\“M #íÍÍ8LCÞÙÓ“’[xðÔ9éÁR*Å-Ã*— âjÙˆl =I´÷Òô_37ìå5Ë÷üzòÈÙ8ÉÉŒÃûšÚ;¾òÅì¦"@ê(Ò°W$/Aû‘urš::áN3 ˆÁ^¡ø¨¥®>2:’SV1 ä0 ½í§;ؽôùþ¤œ|¸œ•[ÝÔ ~ÎŽB‘(§¬bX,æ±Y>Nö<¶ÅNÎxFööÿ3œõá3Ïõ÷éíf–”3õtçø†xº>ÿéÞÜò›×¥³‚7DÍ©njɯ¨ éik©1&¶Ø=ÐÍåÏË)™óÚú•aÞžD'‡JaM+ ¹ãC aÞVXLý+3=Ý>ܲI}šZO__im½ ¿Ÿ©§ëå`+ õäÿÖ¯ZÒÛ/¼^\Ê Ó½lw¿øÌŠIL!¾}óeO{›ž¾¾Ì’rí%¡Acl­4äjÒ̢¡m¯ºX[Ö·´]ÉÎéëÚóÍh*T÷!½ *a/{s3âë•ÕÃb±wqh`˜Ç³Ÿ|UT]K±¸eÛ@央&­@dk âIÒ ½ç(^PÉnRRG‘†=żŒ<¼-Aˆ;l!M´›yl£çW,Î+¯úéÜ%⌠¿Ÿ8•”z*)U"è÷ÞSVÏ™Eüºÿ|k’Rôs›ïؘ‹ÏÈŽÏÈöp twÉ*)»}I.©‚˜Ää/~=I¿ºîÑ•³g­š½ýÛ`ž¿O¤¿OeCÓ–è=7z™ŒÃû¨û…ÔȨ@¿¹þ>%5uk·}Dœy$$à'Öíxjãâ×Þ•–|ï»#±É×ä+ž0oõij—³r µ«»¬¶>ÔËýó£'(ÖÕ¼ÂìÒr;›Ï_Üòů'¨?n—@]ÃK«—ékkû:Ûëhjî:rŒú#RS#æÎ­›TŒÝ??záo’ãy2ÂÇkyxHQUÍ‹Ÿíë耣ÿ»·^~qÕ2¢´zÎ,O{›¢ªšÇ¶G_q³±<øÎ«Ô5†œR¢Zvf‘âdiîbm™]Z¾ùÃÏä‹7%†½l¤=ùüŠÅ¢æ|¸eÓÒ×·S)nŠ6Ⱦ„lWS©@dk õ$Å»·(^f7©©£HÞz^ ‹Å04<ž}Bᵂ›£")y…\¶‘ŸG½°6øÙÙ”´ —ß?~÷ËWž puš¨ 5¬ òcè úû©ë_9;tššÚ±¸DÙ=ž\5'v|ÿÑ€¢êÚsWÓµ55ˆ²ˆ ôéÎá˜aR ¤!7L4³‡†@[CcòL¢öÔùêXLae ŸÃZã·6Œw‰ñ\-G2ÑÂ’#äî V ¤ÙM*@ê(Ò°§žÄRÆ>áC·úAädìHŽ OH „y{$ç亻¸:sŸ$#9L=ÝU³CC§»s˜†ƒCÃ-] :]O[ëÞFkW·hpÐ@G‡øèlm122r9+oò®èdi><,N+,‘>™š_èioãjc™œ[ ”«º¹\ËÿgÞWJnÁ†¨9!ž®¥µõ ÞùæÐ·1g–…/šñ埻šW¸í›CÚúŒŠï [ üœžZõÎë´4Ô)n<@ôÎ^M“ÛK.V–CÃË‚ý¥OZšš€™1 l͸CÃÙ%årk 9¥ `f•×7žI¾6?Ð/ñëÝ1‰)gSÒŠkêî‹°OÊÉw²2wµ±L¡°I>Æ\B¶«©T –!w¯P¤!ÍnRRG‘†=õ¼èêé€.`JùA‘±œîÞ^UƒË2r°à?ûÉ—L=Ý_/bÒHg uu¾}óe>‡]ÝØ|îjúàЇihͽ÷o»Óé7¦túdo‰Æ`ÜáÄs_Må®Ä‡‚ È=dìsèîÞ^U=ÜÇóJvÄgdéjiº»@çಠ!3›TuO_hªO»ã‰ùTÚ2ŸþÊÖ ƒêÆfmM g+óÉs\Qu-ƒA÷u²—>éïêùÕJ¹D ›s}k›÷†­ÒŸÿò;F›àK¥°îHS{'êÊ?ÏŠT‡i¢¡!*Úˆ”—„*R:ZšcÊbŒNŸéé&·Š(ÕÔ3‹Xp¥1MV+<·¼jç¡£ë·GÓh´çV,VÖ]LRØû8Ù@I-¥ÉuòÙ } RW“V  K¶')†œ¡®Î¦…‘áÞž“' 
wúƒ2V"‘f7©À„r“4ìI.¤f”Õ5‚ rÿ3¶“ÓÞÝÃa:[Y$dæ@uSKUcÓl_/±XÜ~£ÛÚ ÐÍ…Tu[× o-±ªŸ³£ÜdpáÚuxiõ2É¥wx~¿ /¬Z*9³xf€‡M}Kõ e`Â4´0åÜÖh+¨¬?g*…el¨ïhÁ—þºŸçå`#‹)nøKª©«3f䙥 ŒôõÊëZ»º©\âX\°X¼tV𦅑òùêX\¼»y½‡­õNÄ_€WÖ.oB©Š(ÕÔ3‹p,Ñ›ƒ›¥ôǽ}bñˆƒ¡¬»PVØó9lÉñ¢`_g‡®A|F¶SOÆ%H]MZP,,ž¤rï?½qëòEÑÏ?IdýdÈþEU5b±x¦—›d7”‰f7RG‘†=żش0òüWÑ'£·‚ rÿ3¶–oíìÒ×Ñ JñéÙO<2¯¹£“øøGBò²°Å¡n¶V¥µõbñˆž¶–¶†Æxì¶›ñYO.ž¿&2ÜÓÞfphØšgúì®/‰6:´tv›“Æíý¤´¶žN£9Zò·Dï‘jÁ˜Ø_/ByIMË@Ÿx«‰M¾àê4×ß'aÿîªC]3nÿÀÀ¶‡”¢ßÏÅ «ÆÞlvYÅð°ØÏÅ‘Ja­Œ]5§¡µ½¢¡Q48Ä2Ðs³±¢Ñhÿ:KÑ R ‚gl]¶¨¢¡©­«›N£YóLYúB‘è£ÃG)^¢¦¹uû·m^·uù¢õóg×4·ôôõëhjØð¸[¢÷ÜþÆÌÛ9{5ÝÍÆêш™ß½ýJ}k[c[¦Â7f'åäïÄÓ-ñëÝeu šêêv|žX,Þyè妞ŒKºš´¡XX2EU5w|JŸ‘Ý-è53fÙ››©2—³rßÚ°´öŸ©~ÎvÖ_Ï–cþƨ9Ö<ÓNkºÕc‘0ËÛƒÏaÇgdÑh*² «¾µm@4È2Ðs0çÛñ¹šêêyU{Çü—HÑ R 4ššª*mdÇçq˜†Ý½}qi™ÛšÐ-—×7^JÏ¢ÓhL=]3c–‰‘!FÏ.­HÍ/$vP õdrnAaU¶†Í271VWSkjïLÎ-(®®“VKgK_ÏŠkjfÌb0åõ —Ò³$ïó‘­JÈ)Õʬê¦fkž‰õm™¥*lC}sޱ£%ßHO¯²¡éÐéó?œ>?Ñð“q¤a/“Y%å7Ö<}íìÒŠ÷þ$½¯iq˶ôT\-»¡^X2»|‘¶¦†“•ù#3°èAäacìHÎó+‡xº²ôõÕÕÔ„"QeCSBfΑ³q÷×]½±aÕl¿éºZZ0,w zKkëÏ]M?“’†E~Ÿ²uÙÂù~¦á¢WÞil︛—®hhzsß÷`lhðª¥è¨û†֮n3c64µw>Ìñ€ ‚ 'c;9.V–¦&ıCÓÝÎÚÝÎzIhÐóŸîmhk¿_îÊ„Côp€A§éëéë¸9Gú=»ë+,õû;¾‡ix¯®~áÚu°æšLýFí½uÔÔ±á…Ýû ¨miŒG÷Q< ‚ ÈÃɸÓÕþuöÈ™‹ÍÀç°?ÚúÄýx{{Å|òãoÇã…"ø¹8®™†¥Ž <µ-­_ùór ºAABÆÝx`߉S°ç·?ŽôŽ¥©‰“•y»KRN>ñßM #çÎðæ±DCCyåU?ž¹x½¸Œø—¯“ý#!V\]=m-ÑàPIMÝ¡Ø Wó ¥õ¯ˆ˜¹0h†×dX,®jlŽILŽIT~säPìâ £¨4úù'`¦§Û/çãÀÝÖÊßÕiºƒŸÃÖÑÔPSUí Û:ž:w1-“¢‘Äì>&“FSéôV56Ÿü;éRFuÙ¼±a•»­µ‘¾ž¶††P$*­­?~ér\z&EW'ûÅ45µ®Áìç_'Θ±YìzÒ Š·~ò¥.=ýÙÄCú?ür2!‰ÔQ Úpû"ï¿v¿/9–^ð-#&)Âe­Œ˜éçâÈa¨2݂ޒšúÏ\¤¾FŸÔÒx;/¨; ^XµtçÖ'ètZy]ã¡Óç³r•’›Ôm} ¹YQßôÄ#‘Žæ,=ÑàP\Z¦Ž–†Ÿ³#ƒN¿˜výÃ~¡²+•:Jvê!‚ 2Å!ß]íèù¿ß|| „xºœŸÞûŸƒŸøï45µ@w—7çÿzâï+àæ<×ßGòuUcº£§½Í¿üærVqò«ÿ>çïêtS€‹µ¥‹µ¥“¥ùÎCG'é>%íH#}=â`ï«ÏkL›&-£¥¡aËç©2$º’¯³ ؆ÅÕu’k‘ â`Á·1ãÇ: ÍéŽvÓí>ùñ·c—©¸:«¤|†«“®Ž5פ¢¡ \¬-áÔü"9Üøßµ=œß/]–ôpd;Jé6ÜÙ1I…9~Ó·=ñ˜ô DyýYY6ÆÃ]Ë kž)qàjcùé O¿úåÉÖywÁÒKÈÈMKSÎÜ7c^cÚ´åá!™%¡AUÍ×(•:Jvê!‚ 2Å!ïäHžnÚñyðƆÕDK®¾µíÜÕt#=½E!þ4íÕu+®—V7µH¾x>5½¢¾iºƒ­Ÿ‹#F{zÉ¢±ù‘yDçZ~Ñ¥ô,Mõi+"fš²Œ–Î >”š[^5÷i{«½BÌ[“0::ÚÐÖÞÜÑ9 ÒÒP·01.®®£bäü_¢ÁZXY“˜<2:Âe9YšTUšI¨ðGBRbfn—@ ¸X[, ÌÓÒÏÕ)y…3\ÀÏÅ‘è`¸ÚXŽ)Vêø9;¬œ IÙùÿø+q’ÔQ ÚPÑÐôúWßwíhi>ÚÕÓKü·¥³kB19v|ÞŽ§72èt(¯k¸œ•ÛÓׯ­©áhÁOÎ- âRHãAÁ¼ â(é¤Î-«t³µòwuRQQyrñ|¢“sl ~‰;榥)†‡Å;¾?òؼ[>¯©½cÿ‰S[–-41bÎpqüå|ohX|&%ÍŠk²qÁ\¦ž®Äk® ©µã%õÔCAdj2—ŽŽ‚—½šª*\Í+$ZrpñZæ–e‹@ê ý*êôõh4𱡮–¦¾Ž6˜›'ìß=F’Ë6RîíeÞ'ýQ(ý|.žô[¶f\R#S󋇆ÔTU×F†¸9MI?Ÿš!½©¼ìmVÎu0çèjˆi´›E˜sØ5Í­²]ÝÒÙÕÐÖQXYãde>ÝÑNrkœS0QO>¿b±¦ºz}KÛÆ»&ä(%Ú0ž‹äˆÉÛ”Øòõp¨Ø ;¨xr2¸Ë¹©ÜÛô÷ñΟž¾~˜¦ªª¸ˆ§‰¦‚ ‚ S ò—rY7[Ý‚^MM⸽ûŸ7ˆ×¶ÜüÉ—ìÚ<†^¡8И¦¦¯}óy6ƒN×ÖÔüINNÞ­W×>óÑke@ÅÈŠ†¦ÿîù†Ø}ÎÒÔdëòE1»ÞûøÙÍ%¤¤Ìðýæ—Â}¼¸l#MuuC=]ý[£***¤®&.¦]U#*Ð/ÄÓ•h«%çæOÔ{j U`èyÙÛLÈQJ´áŽÈ“cK\G zúú&ÏÙñp¯òâ.çæd\‚§ÑÑQeùA¾ÔCAdJA>’äîBVÕKÀPWG"`abLÜè%o#ö é…%[¢÷Löí}sò´ ¿¿[ÐWR[WÕØLñ[LÉ+\ðòÛÁ.aÞž³¦{hkjDøz}¯ÿÊî¦( ›'ÍSQQ‹ßÜ÷}|F6¼ýøÚÅ¡òÀÙ”´g—?Â`ÐÍ(¯k€Ñàß×s&êÉÿûýÏõógèê|öâ– ïE³):J)6HZ±cZ™ŠÇ$¡„¥¦FÚ#’4¤å³AF<(1/ÆsÔÝÌÍñlPnú÷ôõWÔ7v zïøßyý@=õÆ‹AAî9$#9flÖcó#ˆãKéYY¥åƒCC0ÃÅQ"³ hqSVAz½ÂªÑà xÚÛH Ü‘76¬Nýþ«ØÏ>t£6éèv¾ýó̯Î¥¦SïáLÔÈ+Ùùï}w$tË+EU5àngÍû÷¬Rñ064€¦ö¢™·íš@…ö=W²óÀÛÑ.Â× äB©ki{kÿÁ‘‘mMÝ/<3!G)ņÁAâ€Ãü׺Åc²JÊ%!'CŒ˜&F†rÛpÇx˜PÈÉç(冽|6(ñŸ‘½ò­žþø‹;þ··ÿæàŒ)‹9QÍSo¼xPn%† ‚ ˆŒ;’óô’(]-ÍH_b¿äœ|âm!&¦<1SGKóØÎwâÒ2¹,£ù¾0,Ÿü›Òòú˜Ä”•³Ctúþ×_ˆÏÈ.«kÐTŸfiÊÑÖÐn¯8Yš/ c¦Á–e‹î°τŒ\;7ìñ…‘I9ù%5uý"m–>ñ]ဈŠ)ͦ3cöË«—¥– óØ,9îå÷øË³¼=TTT.¤^—Ï'i…%GÎÄmX0Ç”³ã© Û¦^šŠÛ Y¾òÖãkϦ¤‰GF,M9MíûNœR<&>w)ÜÇ“N§/ v´à_+(ºÑÛ§©®ÎcÅ¥gJvÜjë¾ÑÞ}ÃH_ÏÅÚò½'7T44Úñy¹e•Ç.%’Ú@=©ˆ£”•›ŠØ ¬KÒÖ}£[Ы¯£íleñîæÇjšZìø¼ìÒ *Û£QL=ñ0*1Ay˜·“óäâ(ÉqnYå ŸÝ\Á}ä7'+sg+ +®ÉSKnÊŒŒŒDÿø+•å.°ë§cV\'{:>Ûoúl¿éÄù¾[Óâ ††‡%ÇÃbñ]ö ©‘öæfú:Ú ‚fHØÄ&]ëèP åçó—Þz|-¬‰ _.÷½\+(.¯k ^úÑ-è¥þ–žÛùêxÌ,o>‡=?Ð/!3'>#›bi*nÃÙ”´u‘ Ïa?½tÁMµùEJ‰É‚ªš‡Ž¾±q5ƒNw²2w²2—ü«KÐ+éäÀ±¸Ä­Ë@TŸt ’Ú@=©ˆ£”•›ŠØ ¬KPáX\"Q’ÝÞDCCÊM½ñâa*Tb‚ ò03¶“SPYÍa2õttzO__imýùÔŒSI©Ò2ÞûdÓÂÈHÛh`p0·¬êÐéóÙÔæl‰Þ3?Àw~€¯½¹™Ž¦f¯PØÒÙ%™2DPV×p8öÂò°à–Îî½Çbî¾kdùó¹K‚~¡·£—e¤Ê ÷ *êÏ]M—ìL*@Ê 
[binary image data omitted]
cccl-2.5.0/.devcontainer/img/nsight.png
[binary PNG image data omitted]
cccl-2.5.0/.devcontainer/img/open_in_container_manual.png
[binary PNG image data omitted]
->O …‡§©©‰Ë ÂB‚ÛW-”–ï“ÍA½Ðÿøùx—Íœ$,(€züÆ/)=“c†ºzúÝg¯ï>{M¼ÞõëRAþùSÆþ}úRQiy&FDX¨ª¦ŽË µuõ!a!Á‚’ÒÞoêE€þg¢§C"ìt5ÔVÏ™&,$ˆ"“H‹§M°37昇ÂÃccjÔ·‰QS’ÏÌÍã2ƒÁ(,)SS”ï“ÍA,ô?q±NžÎ5ÒÑÔTSV‘—E‰‹Šêhšv2 ˆ„X_>Ù+!&*'%—’Î}¶äŒ,}mõ>Ù"´Ñß5Žñà;÷åæÌ t*iÕE9.³””:Z™‰‰WÒª{¹EˆE€þGŒ]Æ9±¨¤žÞP\^ª®­«¤Uç–tœ-·°¨SÂÃÃ#.*²iÉœÏÎI¬€aXøîðÉÚúZê* rR⢼ަ榊ªêÜ¢ĴÌÈøä†ÆÆfg!@‚ñEП @‚Xý bô§ÿâºË Ù½rIEND®B`‚cccl-2.5.0/.devcontainer/img/run.png000066400000000000000000000031141463375617100172630ustar00rootroot00000000000000‰PNG  IHDR·Þ•%iCCPkCGColorSpaceGenericRGB8U]hU>›¹³+$΃Ԧ¦’þ5”´lRÑ„Úèþe³mÜ,“l´AÉìÝi&3ãü¤i)>AÁ¨à“àÿ[Á'!j«í‹-¢´P¢ƒ(øÐúG¡Ò ë¹3³»“¸k½ËÜùæœï~çÞsîÞ ¸,[–Þ%,®-åÓâ³ÇæÄÄ:tÁ}Ð }Ð-+Ž•*•&ã¿Úíï ÆÞ×ö·÷ÿgë®PGˆÝ…ج8Ê"âeþŲ]€AûÈ ×bø Ä;lœ âõWžð²Ï™‘2ˆ_E,(ªŒþÄÛˆç#öZsðÛŽ<5¨­)"ËEÉ6«šN#Ó½ƒû¶EÝkÄÛƒO³0}߸ö—*r–ᇟUäÜtˆ¯.i³Åÿe¹i ñ#]»¼…r ñ>ÄcU{¼èt©ª7ÑÀ+§Ô™g߃xuÁ<ÊÆîDüµ1_œ u~Rœ æàâ*-°z÷#°Mi*ˆËWh6Çòˆø¸æf}î-gi:×Ð9¥fŠA,î‹ãòV§>ÄW©ž—Bý_-·Æ%=†^œ tÈ0uüõúvW™â’9 Œ%/VµñBÈWµ'¤_¶tâÜÈMÛ“ÊŸ¿ŸåP““í\>ĘÉ@Á„yì0`D i|[`£§ èh¡è¥h¡øÕàìßÂ)ùþ·TjþÈëèÑ0B¦ÿ#ðЪÂïhU#¼ ~yh«uÐ fp#Ô1I/I’ƒø"“ä0!£ ’'ÉSdŒdÑ:J5Ç–"sdó¹ÑÔy#RŸ7‹¼‹èwAÆþgd˜à´ÏÅJŸ7ØÏØÏkÊ•×n^:}nW‹»FVŸ»Ösét$gj-tÈÚÔrÏÿÂ_ç×°_ç7Z þ~ëÛV·5ë4ÌV }ºo[ÄGó=Nd>¥-Ula³£¢Y5VúË}¹x»g[üä÷É?’kÉ÷’&ãÞä>áÎsŸrŸq߀È]à.r_r_qsŸGjÔyï4k± æi—QÜŸBZØ-<(d…=ÂÃÂdKO膄 a/zv7«]»ǰod«}¬€©sìn¬³Öá?TF–'|¦ãï3Nnã„#I?"…m»z„íõ¦v~K=Ú¯Æsñl<b|_|4>?Âpƒß‹¾QìñÔré²ËâŒi´µšêŠÃÉäãb ¯2* åÀ (ëºè»Ѧµ—hå°{28ÂoIþýÛy¥esŸ8ü';÷Z¶9à¬ÐûhË6€gã½ï¬>¦xöRx'Äbß8ÕƒÃÁWOÏ«ëõ[xn%ÞØ|½^ÿûýz}óÔ߸ ÿÿ%x ÅcÖËleXIfMM*>F(‡iN™Ü Ûyö  ,j§Ä pHYs  ÝÌfLblIDATH cäIÙÿŸN€‰Nö€­!h™Œ0CC¨*ƒ/;Åî"h™‘"CŠ­8éfS Š,$hÈô½7>0„M½ÆP À°(K‡A”L_eÈÂS·ß3Ø6žfxðæ'Ãi /ýÈð%Ñ–,üúë/CÝªÛ ¡P_.ÈÔfáaIH² fâi /퀾|øæ0.M|Mˆ‹K²,Y òeýjˆ/ë€q93M æœ4Ù–ÁLüøùÃÇo¿„¹ ' L©4 #CŠ›Cµ§Cí¦ ‹ö=&hY–©Jò2LOTå?›æ³À¸ûFÐ"’,cafdHw—c¨âšõ÷|Êð„’•hËÔ¤€¾IPcøøóƒUÓY†Ço‰ó ²—‰²ÌJY€aO¹ CÕº{ Ë‘æ’,»øàús¯&oÈðäÝd½$³Gn}FrXáÑ@q ‚Çl )ºZ Èj§_ÖA¯IEND®B`‚cccl-2.5.0/.devcontainer/launch.sh000077500000000000000000000102141463375617100170050ustar00rootroot00000000000000#!/usr/bin/env bash set -euo pipefail # Ensure the script is being executed in the cccl/ root cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/.."; print_help() { echo "Usage: $0 [-c|--cuda ] [-H|--host ] [-d|--docker]" echo "Launch a development container. If no CUDA version or Host compiler are specified," echo "the top-level devcontainer in .devcontainer/devcontainer.json will be used." echo "" echo "Options:" echo " -c, --cuda Specify the CUDA version. E.g., 12.2" echo " -H, --host Specify the host compiler. E.g., gcc12" echo " -d, --docker Launch the development environment in Docker directly without using VSCode." echo " -h, --help Display this help message and exit." } parse_options() { local OPTIONS=c:H:dh local LONG_OPTIONS=cuda:,host:,docker,help local PARSED_OPTIONS=$(getopt -n "$0" -o "${OPTIONS}" --long "${LONG_OPTIONS}" -- "$@") if [[ $? 
-ne 0 ]]; then exit 1 fi eval set -- "${PARSED_OPTIONS}" while true; do case "$1" in -c|--cuda) cuda_version="$2" shift 2 ;; -H|--host) host_compiler="$2" shift 2 ;; -d|--docker) docker_mode=true shift ;; -h|--help) print_help exit 0 ;; --) shift break ;; *) echo "Invalid option: $1" print_help exit 1 ;; esac done } launch_docker() { DOCKER_IMAGE=$(grep "image" "${path}/devcontainer.json" | sed 's/.*: "\(.*\)",/\1/') echo "Found image: ${DOCKER_IMAGE}" docker pull ${DOCKER_IMAGE} docker run \ -it --rm \ --user coder \ --workdir /home/coder/cccl \ --mount type=bind,src="$(pwd)",dst='/home/coder/cccl' \ ${DOCKER_IMAGE} \ /bin/bash } launch_vscode() { # Since Visual Studio Code allows only one instance per `devcontainer.json`, # this code prepares a unique temporary directory structure for each launch of a devcontainer. # By doing so, it ensures that multiple instances of the same environment can be run # simultaneously. The script replicates the `devcontainer.json` from the desired CUDA # and compiler environment into this temporary directory, adjusting paths to ensure the # correct workspace is loaded. A special URL is then generated to instruct VSCode to # launch the development container using this temporary configuration. local workspace="$(basename "$(pwd)")" local tmpdir="$(mktemp -d)/${workspace}" mkdir -p "${tmpdir}" mkdir -p "${tmpdir}/.devcontainer" cp -arL "${path}/devcontainer.json" "${tmpdir}/.devcontainer" sed -i 's@\\${localWorkspaceFolder}@$(pwd)@g' "${tmpdir}/.devcontainer/devcontainer.json" local path="${tmpdir}" local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')" local url="vscode://vscode-remote/dev-container+${hash}/home/coder/cccl" local launch="" if type open >/dev/null 2>&1; then launch="open" elif type xdg-open >/dev/null 2>&1; then launch="xdg-open" fi if [ -n "${launch}" ]; then echo "Launching VSCode Dev Container URL: ${url}" code --new-window "${tmpdir}" exec "${launch}" "${url}" >/dev/null 2>&1 fi } main() { parse_options "$@" # If no CTK/Host compiler are provided, just use the default environment if [[ -z ${cuda_version:-} ]] && [[ -z ${host_compiler:-} ]]; then path=".devcontainer" else path=".devcontainer/cuda${cuda_version}-${host_compiler}" if [[ ! -f "${path}/devcontainer.json" ]]; then echo "Unknown CUDA [${cuda_version}] compiler [${host_compiler}] combination" echo "Requested devcontainer ${path}/devcontainer.json does not exist" exit 1 fi fi if ${docker_mode:-'false'}; then launch_docker else launch_vscode fi } main "$@" cccl-2.5.0/.devcontainer/make_devcontainers.sh000077500000000000000000000132331463375617100214000ustar00rootroot00000000000000#!/bin/bash # This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of # CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the # .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version. 
# GitHub docs on using multiple devcontainer.json files: # https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson set -euo pipefail # Ensure the script is being executed in its containing directory cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; function usage { echo "Usage: $0 [--clean] [-h/--help] [-v/--verbose]" echo " --clean Remove stale devcontainer subdirectories" echo " -h, --help Display this help message" echo " -v, --verbose Enable verbose mode (set -x)" exit 1 } # Function to update the devcontainer.json file with the provided parameters update_devcontainer() { local input_file="$1" local output_file="$2" local name="$3" local cuda_version="$4" local compiler_name="$5" local compiler_exe="$6" local compiler_version="$7" local os="$8" local devcontainer_version="$9" local IMAGE_ROOT="rapidsai/devcontainers:${devcontainer_version}-cpp-" local image="${IMAGE_ROOT}${compiler_name}${compiler_version}-cuda${cuda_version}-${os}" jq --arg image "$image" --arg name "$name" \ --arg cuda_version "$cuda_version" --arg compiler_name "$compiler_name" \ --arg compiler_exe "$compiler_exe" --arg compiler_version "$compiler_version" --arg os "$os" \ '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name | .containerEnv.CCCL_BUILD_INFIX = $name | .containerEnv.CCCL_CUDA_VERSION = $cuda_version | .containerEnv.CCCL_HOST_COMPILER = $compiler_name | .containerEnv.CCCL_HOST_COMPILER_VERSION = $compiler_version '\ "$input_file" > "$output_file" } make_name() { local cuda_version="$1" local compiler_name="$2" local compiler_version="$3" echo "cuda$cuda_version-$compiler_name$compiler_version" } CLEAN=false VERBOSE=false while [[ $# -gt 0 ]]; do case "$1" in --clean) CLEAN=true ;; -h|--help) usage ;; -v|--verbose) VERBOSE=true ;; *) usage ;; esac shift done MATRIX_FILE="../ci/matrix.yaml" COMPUTE_MATRIX="../.github/actions/workflow-build/build-workflow.py" # Enable verbose mode if requested if [ "$VERBOSE" = true ]; then set -x cat ${MATRIX_FILE} fi # Read matrix.yaml and convert it to json matrix_json=$(python3 ${COMPUTE_MATRIX} ${MATRIX_FILE} --devcontainer-info) if [ "$VERBOSE" = true ]; then echo "$matrix_json" fi # Get the devcontainer image version and define image tag root readonly DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version') # Get unique combinations of cuda version, compiler name/version, and Ubuntu version readonly combinations=$(echo "$matrix_json" | jq -c '.combinations[]') # Update the base devcontainer with the default values # The root devcontainer.json file is used as the default container as well as a template for all # other devcontainer.json files by replacing the `image:` field with the appropriate image name readonly base_devcontainer_file="./devcontainer.json" readonly NEWEST_GCC_CUDA_ENTRY=$(echo "$combinations" | jq -rs '[.[] | select(.compiler_name == "gcc")] | sort_by((.cuda | tonumber), (.compiler_version | tonumber)) | .[-1]') readonly DEFAULT_CUDA=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.cuda') readonly DEFAULT_COMPILER_NAME=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_name') readonly DEFAULT_COMPILER_EXE=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_exe') readonly DEFAULT_COMPILER_VERSION=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_version') readonly DEFAULT_OS=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.os') readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" 
"$DEFAULT_COMPILER_VERSION") update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEFAULT_OS" "$DEVCONTAINER_VERSION" mv "./temp_devcontainer.json" ${base_devcontainer_file} # Create an array to keep track of valid subdirectory names valid_subdirs=() # The img folder should not be removed: valid_subdirs+=("img") # For each unique combination for combination in $combinations; do cuda_version=$(echo "$combination" | jq -r '.cuda') compiler_name=$(echo "$combination" | jq -r '.compiler_name') compiler_exe=$(echo "$combination" | jq -r '.compiler_exe') compiler_version=$(echo "$combination" | jq -r '.compiler_version') os=$(echo "$combination" | jq -r '.os') name=$(make_name "$cuda_version" "$compiler_name" "$compiler_version") mkdir -p "$name" new_devcontainer_file="$name/devcontainer.json" update_devcontainer "$base_devcontainer_file" "$new_devcontainer_file" "$name" "$cuda_version" "$compiler_name" "$compiler_exe" "$compiler_version" "$os" "$DEVCONTAINER_VERSION" echo "Created $new_devcontainer_file" # Add the subdirectory name to the valid_subdirs array valid_subdirs+=("$name") done # Clean up stale subdirectories and devcontainer.json files if [ "$CLEAN" = true ]; then for subdir in ./*; do if [ -d "$subdir" ] && [[ ! " ${valid_subdirs[@]} " =~ " ${subdir#./} " ]]; then echo "Removing stale subdirectory: $subdir" rm -r "$subdir" fi done fi cccl-2.5.0/.devcontainer/verify_devcontainer.sh000077500000000000000000000053761463375617100216150ustar00rootroot00000000000000#!/bin/bash function usage { echo "Usage: $0" echo echo "This script is intended to be run within one of CCCL's Dev Containers." echo "It verifies that the expected environment variables and binary versions match what is expected." } check_envvars() { for var_name in "$@"; do if [[ -z "${!var_name:-}" ]]; then echo "::error:: ${var_name} variable is not set." exit 1 else echo "$var_name=${!var_name}" fi done } check_host_compiler_version() { local version_output=$($CXX --version) if [[ "$CXX" == "g++" ]]; then local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 4 | cut -d '.' -f 1) local expected_compiler="gcc" elif [[ "$CXX" == "clang++" ]]; then if [[ $version_output =~ clang\ version\ ([0-9]+) ]]; then actual_version=${BASH_REMATCH[1]} else echo "::error:: Unable to determine clang version." exit 1 fi expected_compiler="llvm" elif [[ "$CXX" == "icpc" ]]; then local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 3 ) # The icpc compiler version of oneAPI release 2023.2.0 is 2021.10.0 if [[ "$actual_version" == "2021.10.0" ]]; then actual_version="2023.2.0" fi expected_compiler="oneapi" else echo "::error:: Unexpected CXX value ($CXX)." exit 1 fi if [[ "$expected_compiler" != "${CCCL_HOST_COMPILER}" || "$actual_version" != "$CCCL_HOST_COMPILER_VERSION" ]]; then echo "::error:: CXX ($CXX) version ($actual_version) does not match the expected compiler (${CCCL_HOST_COMPILER}) and version (${CCCL_HOST_COMPILER_VERSION})." exit 1 else echo "Detected host compiler: $CXX version $actual_version" fi } check_cuda_version() { local cuda_version_output=$(nvcc --version) if [[ $cuda_version_output =~ release\ ([0-9]+\.[0-9]+) ]]; then local actual_cuda_version=${BASH_REMATCH[1]} else echo "::error:: Unable to determine CUDA version from nvcc." 
exit 1 fi if [[ "$actual_cuda_version" != "$CCCL_CUDA_VERSION" ]]; then echo "::error:: CUDA version ($actual_cuda_version) does not match the expected CUDA version ($CCCL_CUDA_VERSION)." exit 1 else echo "Detected CUDA version: $actual_cuda_version" fi } main() { if [[ "$1" == "-h" || "$1" == "--help" ]]; then usage exit 0 fi set -euo pipefail check_envvars DEVCONTAINER_NAME CXX CUDAHOSTCXX CCCL_BUILD_INFIX CCCL_HOST_COMPILER CCCL_CUDA_VERSION CCCL_HOST_COMPILER_VERSION check_host_compiler_version check_cuda_version echo "Dev Container successfully verified!" } main "$@" cccl-2.5.0/.git-blame-ignore-revs000066400000000000000000000012601463375617100165360ustar00rootroot00000000000000# Exclude these commits from git-blame and similar tools. # # To use this file, run the following command from the repo root: # # ``` # $ git config blame.ignoreRevsFile .git-blame-ignore-revs # ``` # # Include a brief comment with each commit added, for example: # # ``` # 8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100 # ``` # # Only add commits that are pure formatting changes (e.g. clang-format version changes, etc). # Apply formatting to thrust (#1616) 165a06aa2c50b98d50fa56d027f0674da9d382e4 # Apply formatting to libcudacxx (#1610) baee3f502d4fd5febc7f602428592b2bec5d2d3b # Apply clang-format to cub (#1602) eefcca862226672eae8eacb244f39f2e7b4bf7c4 cccl-2.5.0/.github/000077500000000000000000000000001463375617100137775ustar00rootroot00000000000000cccl-2.5.0/.github/CODEOWNERS000066400000000000000000000012321463375617100153700ustar00rootroot00000000000000# general codeowners for all files # (Order matters. This needs to be at the top) * @nvidia/cccl-codeowners # Libraries thrust/ @nvidia/cccl-thrust-codeowners @nvidia/cccl-codeowners cub/ @nvidia/cccl-cub-codeowners @nvidia/cccl-codeowners libcudacxx/ @nvidia/cccl-libcudacxx-codeowners @nvidia/cccl-codeowners # Infrastructure .github/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners ci/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners .devcontainer/ @nvidia/cccl-infra-codeowners @nvidia/cccl-codeowners # cmake **/CMakeLists.txt @nvidia/cccl-cmake-codeowners @nvidia/cccl-codeowners **/cmake/ @nvidia/cccl-cmake-codeowners @nvidia/cccl-codeowners cccl-2.5.0/.github/ISSUE_TEMPLATE/000077500000000000000000000000001463375617100161625ustar00rootroot00000000000000cccl-2.5.0/.github/ISSUE_TEMPLATE/bug_report.yml000066400000000000000000000110561463375617100210600ustar00rootroot00000000000000name: Bug Report description: Create a report to help us improve title: '[BUG]: ' labels: ['bug'] body: - type: checkboxes id: check-duplicates attributes: label: Is this a duplicate? description: Check for duplicate issues. options: - label: I confirmed there appear to be no [duplicate issues](https://github.com/NVIDIA/cccl/issues) for this bug and that I agree to the [Code of Conduct](CODE_OF_CONDUCT.md) required: true - type: dropdown id: bug-type attributes: label: Type of Bug description: What kind of bug are you running into? multiple: false options: - Silent Failure - Runtime Error - Compile-time Error - Performance - Something else validations: required: true - type: dropdown id: component attributes: label: Component description: Which CCCL component does this apply to? multiple: false options: - Thrust - CUB - libcu++ - Not sure validations: required: true - type: textarea id: description attributes: label: Describe the bug description: A clear and concise description of what problem you are running into. 
placeholder: "Attempting to use structured bindings with `cuda::std::tuple` fails to compile." validations: required: true - type: textarea id: reproduction attributes: label: How to Reproduce description: Steps used to reproduce the bug. placeholder: | 0. See godbolt link below for exact reproducer 1. Construct a `cuda::std::tuple` 2. Use structured bindings to destructure the elements of the tuple. 3. Compilation fails with the error message: ``` (5): error: cannot bind to non-public member "cuda::std::__4::tuple<_Tp...>::__base_ [with _Tp=]" auto [a,b] = t; ``` validations: required: true - type: textarea id: expected-behavior attributes: label: Expected behavior description: A clear and concise description of what you expected to happen. placeholder: "Using structured bindings with `cuda::std::tuple` should successfully compile and destructure the elements of the tuple." validations: required: true - type: input id: reproduction-link attributes: label: Reproduction link description: If applicable, please provide a Compiler Explorer (godbolt) link to help explain your problem. placeholder: https://godbolt.org/z/dT5nMcf7W validations: required: false - type: markdown attributes: value: '# System information' - type: input id: operating-system attributes: label: Operating System description: If applicable, the OS version where this bug occurs. placeholder: Ubuntu Linux 20.04 validations: required: false - type: textarea id: nvidia-smi-output attributes: label: nvidia-smi output description: If applicable, the output from running the `nvidia-smi` command. placeholder: | +-----------------------------------------------------------------------------+ | NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:41:00.0 On | N/A | | 0% 25C P8 8W / 320W | 491MiB / 10015MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ validations: required: false - type: textarea id: nvcc-version attributes: label: NVCC version description: If applicable, the version of nvcc you're using. placeholder: | nvcc --version nvcc: NVIDIA (R) Cuda compiler driver Copyright (c) 2005-2021 NVIDIA Corporation Built on Thu_Nov_18_09:45:30_PST_2021 Cuda compilation tools, release 11.5, V11.5.119 Build cuda_11.5.r11.5/compiler.30672275_0 validations: required: false cccl-2.5.0/.github/ISSUE_TEMPLATE/config.yml000066400000000000000000000005111463375617100201470ustar00rootroot00000000000000blank_issues_enabled: true contact_links: - name: Question url: https://github.com/NVIDIA/cccl/discussions about: Check out our Discussions page to ask and answer questions. - name: CUDA C++ Core Libraries Discord url: https://discord.gg/nvidiadeveloper about: Come chat about using and contributing to CCCL! cccl-2.5.0/.github/ISSUE_TEMPLATE/doc_request.yml000066400000000000000000000025221463375617100212230ustar00rootroot00000000000000name: Documentation Request description: Suggest an idea to improve CCCL title: '[DOC]: ' labels: ['doc'] body: - type: checkboxes id: check-duplicates attributes: label: Is this a duplicate? description: Check for duplicate issues. 
options: - label: I confirmed there appear to be no [duplicate issues](https://github.com/NVIDIA/cccl/issues) for this bug and that I agree to the [Code of Conduct](CODE_OF_CONDUCT.md) required: true - type: dropdown id: new_or_correction attributes: label: Is this for new documentation, or an update to existing docs? options: - New - Update validations: required: true - type: textarea id: problem attributes: label: Describe the incorrect/future/missing documentation placeholder: "Example: A code snippet mentions function foo(args) but I cannot find any documentation on it." validations: required: true - type: textarea id: search_locs attributes: label: If this is a correction, please provide a link to the incorrect documentation. If this is a new documentation request, please link to where you have looked. placeholder: | https://docs.nvidia.com/cuda/thrust/ https://docs.nvidia.com/cuda/cub/ https://docs.nvidia.com/cuda/cuda-c-std/index.html cccl-2.5.0/.github/ISSUE_TEMPLATE/feature_request.yml000066400000000000000000000044411463375617100221130ustar00rootroot00000000000000name: Feature Request description: Suggest an idea to improve CCCL title: '[FEA]: ' labels: ['feature request'] body: - type: checkboxes id: check-duplicates attributes: label: Is this a duplicate? description: Check for duplicate issues. options: - label: I confirmed there appear to be no [duplicate issues](https://github.com/NVIDIA/cccl/issues) for this request and that I agree to the [Code of Conduct](CODE_OF_CONDUCT.md) - type: dropdown id: area attributes: label: Area description: What area does this request apply to? multiple: false options: - Thrust - CUB - libcu++ - General CCCL - Infrastructure - Not sure validations: required: true - type: textarea id: description attributes: label: Is your feature request related to a problem? Please describe. description: A clear and concise description of what the problem is, e.g., "I would like to be able to..." placeholder: I would like to be able to use the equivalent of `std::variant` in both host and device code. validations: required: true - type: textarea id: proposed-solution attributes: label: Describe the solution you'd like description: A clear and concise description of what you want to happen. placeholder: | Provide the header `` that implements a heterogeneous `cuda::std::variant` type. validations: required: true - type: textarea id: alternatives attributes: label: Describe alternatives you've considered description: If applicable, please add a clear and concise description of any alternative solutions or features you've considered. placeholder: The alternatives to a `variant` are unappealing. They usually involve using a raw `union` which is not type safe and has none of the convenient machinery like `std::visit`. validations: required: false - type: textarea id: additional-context attributes: label: Additional context description: Add any other context about the request here. placeholder: This would be a helpful vocabulary type that could replace a lot of custom and error prone code. validations: required: false cccl-2.5.0/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000007471463375617100176100ustar00rootroot00000000000000## Description closes ## Checklist - [ ] New or existing tests cover these changes. - [ ] The documentation is up to date with these changes. 
cccl-2.5.0/.github/actions/000077500000000000000000000000001463375617100154375ustar00rootroot00000000000000cccl-2.5.0/.github/actions/workflow-build/000077500000000000000000000000001463375617100204065ustar00rootroot00000000000000cccl-2.5.0/.github/actions/workflow-build/action.yml000066400000000000000000000074411463375617100224140ustar00rootroot00000000000000name: "CCCL Build Workflow" description: "Parses a matrix definition and exports a set of dispatchable build/test/etc jobs." inputs: workflows: description: "Space separated list of workflows in matrix file to run" required: true allow_override: description: "If true, the requested `workflows` will be ignored when a non-empty 'override' workflow exists in the matrix file." default: "false" required: false skip_tests: description: "Skip running tests" default: "false" required: false inspect_changes_script: description: "If defined, run this script to determine which projects/deps need to be tested." default: "" required: false inspect_changes_base_sha: description: "If defined, use this base ref for inspect-changes script." default: "" required: false matrix_file: description: "Path to the matrix file in the consumer repository." default: "ci/matrix.yaml" required: false matrix_parser: description: "Path to the matrix parser script (default if blank: build-workflow.py from action dir)" default: "" required: false outputs: workflow: description: "The dispatchable workflow matrix" value: ${{ steps.build-workflow.outputs.workflow }} workflow_keys: description: "The keys of the parsed workflow" value: ${{ steps.build-workflow.outputs.workflow_keys }} runs: using: "composite" steps: - name: Inspect changes if: ${{ inputs.inspect_changes_script != '' && inputs.inspect_changes_base_sha != '' }} id: inspect-changes shell: bash --noprofile --norc -euo pipefail {0} env: base_ref: ${{ inputs.inspect_changes_base_sha }} run: | echo "Running inspect-changes script..." ${{ inputs.inspect_changes_script }} ${base_ref} ${GITHUB_SHA} echo "Exporting summary..." mkdir workflow cp ${GITHUB_STEP_SUMMARY} workflow/changes.md - name: Parse matrix file into a workflow id: build-workflow shell: bash --noprofile --norc -euo pipefail {0} env: allow_override: ${{ inputs.allow_override == 'true' && '--allow-override' || ''}} skip_tests: ${{ inputs.skip_tests == 'true' && '--skip-tests' || ''}} dirty_projects_flag: ${{ steps.inspect-changes.outputs.dirty_projects != '' && '--dirty-projects' || ''}} dirty_projects: ${{ steps.inspect-changes.outputs.dirty_projects }} matrix_parser: ${{ inputs.matrix_parser && inputs.matrix_parser || '${GITHUB_ACTION_PATH}/build-workflow.py' }} run: | echo "Parsing matrix file into a workflow..." ${{ env.matrix_parser }} ${{ inputs.matrix_file }} \ --workflows ${{ inputs.workflows }} \ ${{ env.allow_override }} \ ${{ env.skip_tests }} \ ${{ env.dirty_projects_flag }} ${{ env.dirty_projects }} echo "::group::Workflow" cat workflow/workflow.json echo "::endgroup::" echo "::group::Runners" cat workflow/runner_summary.json | jq -r '"# \(.heading)\n\n\(.body)"' | tee -a "${GITHUB_STEP_SUMMARY}" echo "::endgroup::" echo "::group::Job List" cat workflow/job_list.txt echo "::endgroup::" echo "Setting outputs..." 
echo "::group::GHA Output: WORKFLOW" printf "WORKFLOW=%s\n" "$(cat workflow/workflow.json | jq -c '.')" | tee -a "${GITHUB_OUTPUT}" echo "::endgroup::" echo "::group::GHA Output: WORKFLOW_KEYS" printf "WORKFLOW_KEYS=%s\n" "$(cat workflow/workflow_keys.json | jq -c '.')" | tee -a "${GITHUB_OUTPUT}" echo "::endgroup::" - name: Upload artifacts uses: actions/upload-artifact@v3 with: name: workflow path: workflow/ cccl-2.5.0/.github/actions/workflow-build/build-workflow.py000077500000000000000000001134421463375617100237370ustar00rootroot00000000000000#!/usr/bin/env python3 """ Concepts: - matrix_job: an entry of a workflow matrix, converted from matrix.yaml["workflow"][id] into a JSON object. Example: { "jobs": [ "test" ], "ctk": "11.1", "gpu": "t4", "sm": "75-real", "cxx": { "name": "llvm", "version": "9", "exe": "clang++" }, "std": [ 17 ], "project": [ "libcudacxx", "cub", "thrust" ], "os": "ubuntu18.04" } Matrix jobs are read from the matrix.yaml file and converted into a JSON object and passed to matrix_job_to_dispatch_group, where the matrix job is turned into one or more dispatch groups consisting of potentially many jobs. - dispatch_group_json: A json object used in conjunction with the ci-dispatch-groups.yml GHA workflow. Example: { "": { "standalone": [ {}, ... ] "two_stage": [ {}, ] } } - two_stage_json: A json object that represents bulk-synchronous producer/consumer jobs, used with ci-dispatch-two-stage.yml. Example: { "id": "", # Used as a compact unique name for the GHA dispatch workflows. "producers": [ {}, ... ], "consumers": [ {}, ... ] } - job_json: A json object that represents a single job in a workflow. Used with ci-dispatch-job.yml. Example: { "id": "", # Used as a compact unique name for the GHA dispatch workflows. "name": "...", "runner": "...", "image": "...", "command": "..." }, } """ import argparse import base64 import copy import functools import json import os import re import struct import sys import yaml matrix_yaml = None # Decorators to cache static results of functions: # static_result: function has no args, same result each invocation. # memoize_result: result depends on args. def static_result(func): return functools.lru_cache(maxsize=1)(func) def memoize_result(func): return functools.lru_cache(maxsize=None)(func) def generate_guids(): """ Simple compact global unique ID generator. Produces up to 65535 unique IDs between 1-3 characters in length. Throws an exception once exhausted. 
""" i = 0 while True: # Generates a base64 hash of an incrementing 16-bit integer: hash = base64.b64encode(struct.pack(">H", i)).decode('ascii') # Strips off up-to 2 leading 'A' characters and a single trailing '=' characters, if they exist: guid = re.sub(r'^A{0,2}', '', hash).removesuffix("=") yield guid i += 1 if i >= 65535: raise Exception("GUID generator exhausted.") guid_generator = generate_guids() def write_json_file(filename, json_object): with open(filename, 'w') as f: json.dump(json_object, f, indent=2) def write_text_file(filename, text): with open(filename, 'w') as f: print(text, file=f) def error_message_with_matrix_job(matrix_job, message): return f"{matrix_job['origin']['workflow_location']}: {message}\n Input: {matrix_job['origin']['original_matrix_job']}" @static_result def get_all_matrix_job_tags_sorted(): required_tags = set(matrix_yaml['required_tags']) defaulted_tags = set(matrix_yaml['defaulted_tags']) optional_tags = set(matrix_yaml['optional_tags']) all_tags = required_tags | defaulted_tags | optional_tags # Sorted using a highly subjective opinion on importance: # Always first, information dense: sorted_important_tags = ['project', 'jobs', 'cudacxx', 'cxx', 'ctk', 'gpu', 'std', 'sm', 'cpu'] # Always last, derived: sorted_noise_tags = ['os', 'origin'] # In between? sorted_tags = set(sorted_important_tags + sorted_noise_tags) sorted_meh_tags = sorted(list(all_tags - sorted_tags)) return sorted_important_tags + sorted_meh_tags + sorted_noise_tags def lookup_os(ctk, host_compiler): key = f'ctk{ctk}-{host_compiler["name"]}{host_compiler["version"]}' if not key in matrix_yaml['default_os_lookup']: raise Exception(f"Missing matrix.yaml `default_os_lookup` entry for key `{key}`") return matrix_yaml['default_os_lookup'][key] def lookup_supported_stds(device_compiler=None, host_compiler=None, project=None): stds = set(matrix_yaml['all_stds']) if device_compiler: key = f"{device_compiler['name']}{device_compiler['version']}" if not key in matrix_yaml['lookup_cudacxx_supported_stds']: raise Exception(f"Missing matrix.yaml 'lookup_cudacxx_supported_stds' entry for key '{key}'") stds = stds & set(matrix_yaml['lookup_cudacxx_supported_stds'][key]) if host_compiler: key = f"{host_compiler['name']}{host_compiler['version']}" if not key in matrix_yaml['lookup_cxx_supported_stds']: raise Exception(f"Missing matrix.yaml 'lookup_cxx_supported_stds' entry for key '{key}'") stds = stds & set(matrix_yaml['lookup_cxx_supported_stds'][key]) if project: key = project if not key in matrix_yaml['lookup_project_supported_stds']: raise Exception(f"Missing matrix.yaml 'lookup_project_supported_stds' entry for key '{key}'") stds = stds & set(matrix_yaml['lookup_project_supported_stds'][key]) return sorted(list(stds)) @memoize_result def lookup_job_invoke_spec(job_type): if job_type in matrix_yaml['job_invoke']: return matrix_yaml['job_invoke'][job_type] return {'prefix': job_type} def get_formatted_project_name(project_name): if project_name in matrix_yaml['formatted_project_names']: return matrix_yaml['formatted_project_names'][project_name] return project_name def get_formatted_host_compiler_name(host_compiler): config_name = host_compiler['name'] if config_name in matrix_yaml['formatted_cxx_names']: return matrix_yaml['formatted_cxx_names'][config_name] return config_name def get_formatted_job_type(job_type): if job_type in matrix_yaml['formatted_jobs']: return matrix_yaml['formatted_jobs'][job_type] # Return with first letter capitalized: return job_type.capitalize() def 
is_windows(matrix_job): return matrix_job['os'].startswith('windows') def generate_dispatch_group_name(matrix_job): project_name = get_formatted_project_name(matrix_job['project']) ctk = matrix_job['ctk'] device_compiler = matrix_job['cudacxx'] host_compiler_name = get_formatted_host_compiler_name(matrix_job['cxx']) compiler_info = "" if device_compiler['name'] == 'nvcc': compiler_info = f"nvcc {host_compiler_name}" elif device_compiler['name'] == 'llvm': compiler_info = f"clang-cuda" else: compiler_info = f"{device_compiler['name']}-{device_compiler['version']} {host_compiler_name}" return f"{project_name} {compiler_info} CTK{ctk}" def generate_dispatch_job_name(matrix_job, job_type): std_str = ("C++" + str(matrix_job['std']) + " ") if 'std' in matrix_job else '' cpu_str = matrix_job['cpu'] gpu_str = (', ' + matrix_job['gpu'].upper()) if job_type in matrix_yaml['gpu_required_jobs'] else "" cuda_compile_arch = (" sm{" + matrix_job['sm'] + "}") if 'sm' in matrix_job else "" cmake_options = (' ' + matrix_job['cmake_options']) if 'cmake_options' in matrix_job else "" host_compiler_name = get_formatted_host_compiler_name(matrix_job['cxx']) host_compiler_info = f"{host_compiler_name}{matrix_job['cxx']['version']}" config_tag = f"{std_str}{host_compiler_info}" formatted_job_type = get_formatted_job_type(job_type) extra_info = f":{cuda_compile_arch}{cmake_options}" if cuda_compile_arch or cmake_options else "" return f"[{config_tag}] {formatted_job_type}({cpu_str}{gpu_str}){extra_info}" def generate_dispatch_job_runner(matrix_job, job_type): runner_os = "windows" if is_windows(matrix_job) else "linux" cpu = matrix_job['cpu'] if not job_type in matrix_yaml['gpu_required_jobs']: return f"{runner_os}-{cpu}-cpu16" gpu = matrix_job['gpu'] suffix = "-testing" if gpu in matrix_yaml['testing_pool_gpus'] else "" return f"{runner_os}-{cpu}-gpu-{gpu}-latest-1{suffix}" def generate_dispatch_job_image(matrix_job, job_type): devcontainer_version = matrix_yaml['devcontainer_version'] ctk = matrix_job['ctk'] image_os = matrix_job['os'] host_compiler = matrix_job['cxx']['name'] + matrix_job['cxx']['version'] if is_windows(matrix_job): return f"rapidsai/devcontainers:{devcontainer_version}-cuda{ctk}-{host_compiler}-{image_os}" return f"rapidsai/devcontainers:{devcontainer_version}-cpp-{host_compiler}-cuda{ctk}-{image_os}" def generate_dispatch_job_command(matrix_job, job_type): script_path = "./ci/windows" if is_windows(matrix_job) else "./ci" script_ext = ".ps1" if is_windows(matrix_job) else ".sh" job_invoke_spec = lookup_job_invoke_spec(job_type) job_prefix = job_invoke_spec['prefix'] job_args = job_invoke_spec['args'] if 'args' in job_invoke_spec else "" project = matrix_job['project'] script_name = f"{script_path}/{job_prefix}_{project}{script_ext}" std_str = str(matrix_job['std']) if 'std' in matrix_job else '' device_compiler_name = matrix_job['cudacxx']['name'] device_compiler_exe = matrix_job['cudacxx']['exe'] cuda_compile_arch = matrix_job['sm'] if 'sm' in matrix_job else '' cmake_options = matrix_job['cmake_options'] if 'cmake_options' in matrix_job else '' command = f"\"{script_name}\"" if job_args: command += f" {job_args}" if std_str: command += f" -std \"{std_str}\"" if cuda_compile_arch: command += f" -arch \"{cuda_compile_arch}\"" if device_compiler_name != 'nvcc': command += f" -cuda \"{device_compiler_exe}\"" if cmake_options: command += f" -cmake-options \"{cmake_options}\"" return command def generate_dispatch_job_origin(matrix_job, job_type): origin = matrix_job['origin'].copy() matrix_job = 
matrix_job.copy() del matrix_job['origin'] matrix_job['jobs'] = get_formatted_job_type(job_type) if 'cxx' in matrix_job: host_compiler = matrix_job['cxx'] formatted_name = get_formatted_host_compiler_name(host_compiler) matrix_job['cxx_name'] = formatted_name matrix_job['cxx_full'] = formatted_name + host_compiler['version'] del matrix_job['cxx'] if 'cudacxx' in matrix_job: device_compiler = matrix_job['cudacxx'] formatted_name = 'clang-cuda' if device_compiler['name'] == 'llvm' else device_compiler['name'] matrix_job['cudacxx_name'] = formatted_name matrix_job['cudacxx_full'] = formatted_name + device_compiler['version'] del matrix_job['cudacxx'] origin['matrix_job'] = matrix_job return origin def generate_dispatch_job_json(matrix_job, job_type): return { 'name': generate_dispatch_job_name(matrix_job, job_type), 'runner': generate_dispatch_job_runner(matrix_job, job_type), 'image': generate_dispatch_job_image(matrix_job, job_type), 'command': generate_dispatch_job_command(matrix_job, job_type), 'origin': generate_dispatch_job_origin(matrix_job, job_type) } # Create a single build producer, and a separate consumer for each test_job_type: def generate_dispatch_build_and_test_json(matrix_job, build_job_type, test_job_types): build_json = generate_dispatch_job_json(matrix_job, build_job_type) test_json = [] for test_job_type in test_job_types: test_json.append(generate_dispatch_job_json(matrix_job, test_job_type)) return { "producers": [build_json], "consumers": test_json } def generate_dispatch_group_jobs(matrix_job): dispatch_group_jobs = { "standalone": [], "two_stage": [] } # The jobs tag is left unexploded to optimize scheduling here. job_types = set(matrix_job['jobs']) # Identify jobs that require a build job to run first: build_required = set(matrix_yaml['build_required_jobs']) & job_types if build_required and not 'build' in job_types: raise Exception(error_message_with_matrix_job( matrix_job, f"Internal error: Missing 'build' job type required by other jobs ({build_required}).")) if build_required: dispatch_group_jobs['two_stage'].append( generate_dispatch_build_and_test_json(matrix_job, "build", list(build_required))) job_types -= {'build'} job_types -= build_required # Remaining jobs are assumed to be standalone (e.g. nvrtc): for job_type in job_types: dispatch_group_jobs['standalone'].append(generate_dispatch_job_json(matrix_job, job_type)) return dispatch_group_jobs def matrix_job_to_dispatch_group(matrix_job, group_prefix=""): return {group_prefix + generate_dispatch_group_name(matrix_job): generate_dispatch_group_jobs(matrix_job)} def merge_dispatch_groups(accum_dispatch_groups, new_dispatch_groups): for group_name, group_json in new_dispatch_groups.items(): if group_name not in accum_dispatch_groups: accum_dispatch_groups[group_name] = group_json else: # iterate standalone and two_stage: for key, value in group_json.items(): accum_dispatch_groups[group_name][key] += value def compare_dispatch_jobs(job1, job2): "Compare two dispatch job specs for equality. Considers only name/runner/image/command." # Ignores the 'origin' key, which may vary between identical job specifications. return (job1['name'] == job2['name'] and job1['runner'] == job2['runner'] and job1['image'] == job2['image'] and job1['command'] == job2['command']) def dispatch_job_in_container(job, container): "Check if a dispatch job is in a container, using compare_dispatch_jobs." 
for job2 in container: if compare_dispatch_jobs(job, job2): return True return False def remove_dispatch_job_from_container(job, container): "Remove a dispatch job from a container, using compare_dispatch_jobs." for i, job2 in enumerate(container): if compare_dispatch_jobs(job, job2): del container[i] return True return False def finalize_workflow_dispatch_groups(workflow_dispatch_groups_orig): workflow_dispatch_groups = copy.deepcopy(workflow_dispatch_groups_orig) # Check to see if any .two_stage.producers arrays have more than 1 job, which is not supported. # See ci-dispatch-two-stage.yml for details. for group_name, group_json in workflow_dispatch_groups.items(): if 'two_stage' in group_json: for two_stage_json in group_json['two_stage']: num_producers = len(two_stage_json['producers']) if num_producers > 1: producer_names = "" for job in two_stage_json['producers']: producer_names += f" - {job['name']}\n" error_message = f"ci-dispatch-two-stage.yml currently only supports a single producer. " error_message += f"Found {num_producers} producers in '{group_name}':\n{producer_names}" print(f"::error file=ci/matrix.yaml::{error_message}", file=sys.stderr) raise Exception(error_message) # Merge consumers for any two_stage arrays that have the same producer(s). Print a warning. for group_name, group_json in workflow_dispatch_groups.items(): if not 'two_stage' in group_json: continue two_stage_json = group_json['two_stage'] merged_producers = [] merged_consumers = [] for two_stage in two_stage_json: producers = two_stage['producers'] consumers = two_stage['consumers'] # Make sure this gets updated if we add support for multiple producers: assert (len(producers) == 1) producer = producers[0] if dispatch_job_in_container(producer, merged_producers): producer_index = merged_producers.index(producers) matching_consumers = merged_consumers[producer_index] producer_name = producer['name'] print(f"::notice file=ci/matrix.yaml::Merging consumers for duplicate producer '{producer_name}' in '{group_name}'", file=sys.stderr) consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers]) print(f"::notice file=ci/matrix.yaml::Original consumers: {consumer_names}", file=sys.stderr) consumer_names = ", ".join([consumer['name'] for consumer in consumers]) print(f"::notice file=ci/matrix.yaml::Duplicate consumers: {consumer_names}", file=sys.stderr) # Merge if unique: for consumer in consumers: if not dispatch_job_in_container(consumer, matching_consumers): matching_consumers.append(consumer) consumer_names = ", ".join([consumer['name'] for consumer in matching_consumers]) print(f"::notice file=ci/matrix.yaml::Merged consumers: {consumer_names}", file=sys.stderr) else: merged_producers.append(producer) merged_consumers.append(consumers) # Update with the merged lists: two_stage_json = [] for producer, consumers in zip(merged_producers, merged_consumers): two_stage_json.append({'producers': [producer], 'consumers': consumers}) group_json['two_stage'] = two_stage_json # Check for any duplicate jobs in standalone arrays. Warn and remove duplicates. 
for group_name, group_json in workflow_dispatch_groups.items(): standalone_jobs = group_json['standalone'] if 'standalone' in group_json else [] unique_standalone_jobs = [] for job_json in standalone_jobs: if dispatch_job_in_container(job_json, unique_standalone_jobs): print(f"::notice file=ci/matrix.yaml::Removing duplicate standalone job '{job_json['name']}' in '{group_name}'", file=sys.stderr) else: unique_standalone_jobs.append(job_json) # If any producer/consumer jobs exist in standalone arrays, warn and remove the standalones. two_stage_jobs = group_json['two_stage'] if 'two_stage' in group_json else [] for two_stage_job in two_stage_jobs: for producer in two_stage_job['producers']: if remove_dispatch_job_from_container(producer, unique_standalone_jobs): print(f"::notice file=ci/matrix.yaml::Removing standalone job '{producer['name']}' " + f"as it appears as a producer in '{group_name}'", file=sys.stderr) for consumer in two_stage_job['consumers']: if remove_dispatch_job_from_container(producer, unique_standalone_jobs): print(f"::notice file=ci/matrix.yaml::Removing standalone job '{consumer['name']}' " + f"as it appears as a consumer in '{group_name}'", file=sys.stderr) standalone_jobs = list(unique_standalone_jobs) group_json['standalone'] = standalone_jobs # If any producer or consumer job appears more than once, warn and leave as-is. all_two_stage_jobs = [] duplicate_jobs = {} for two_stage_job in two_stage_jobs: for job in two_stage_job['producers'] + two_stage_job['consumers']: if dispatch_job_in_container(job, all_two_stage_jobs): duplicate_jobs[job['name']] = duplicate_jobs.get(job['name'], 1) + 1 else: all_two_stage_jobs.append(job) for job_name, count in duplicate_jobs.items(): print(f"::warning file=ci/matrix.yaml::" + f"Job '{job_name}' appears {count} times in '{group_name}'.", f"Cannot remove duplicate while resolving dependencies. This job WILL execute {count} times.", file=sys.stderr) # Remove all named values that contain an empty list of jobs: for group_name, group_json in workflow_dispatch_groups.items(): if not group_json['standalone'] and not group_json['two_stage']: del workflow_dispatch_groups[group_name] elif not group_json['standalone']: del group_json['standalone'] elif not group_json['two_stage']: del group_json['two_stage'] # Natural sort impl (handles embedded numbers in strings, case insensitive) def natural_sort_key(key): return [(int(text) if text.isdigit() else text.lower()) for text in re.split('(\d+)', key)] # Sort the dispatch groups by name: workflow_dispatch_groups = dict(sorted(workflow_dispatch_groups.items(), key=lambda x: natural_sort_key(x[0]))) # Sort the jobs within each dispatch group: for group_name, group_json in workflow_dispatch_groups.items(): if 'standalone' in group_json: group_json['standalone'] = sorted(group_json['standalone'], key=lambda x: natural_sort_key(x['name'])) if 'two_stage' in group_json: group_json['two_stage'] = sorted( group_json['two_stage'], key=lambda x: natural_sort_key(x['producers'][0]['name'])) # Assign unique IDs in appropriate locations. # These are used to give "hidden" dispatch jobs a short, unique name, # otherwise GHA generates a long, cluttered name. 
for group_name, group_json in workflow_dispatch_groups.items(): if 'standalone' in group_json: for job_json in group_json['standalone']: job_json['id'] = next(guid_generator) if 'two_stage' in group_json: for two_stage_json in group_json['two_stage']: two_stage_json['id'] = next(guid_generator) for job_json in two_stage_json['producers'] + two_stage_json['consumers']: job_json['id'] = next(guid_generator) return workflow_dispatch_groups def find_workflow_line_number(workflow_name): regex = re.compile(f"^( )*{workflow_name}:", re.IGNORECASE) line_number = 0 with open(matrix_yaml['filename'], 'r') as f: for line in f: line_number += 1 if regex.match(line): return line_number raise Exception( f"Workflow '{workflow_name}' not found in {matrix_yaml['filename]']} (could not match regex: {regex})") def get_matrix_job_origin(matrix_job, workflow_name, workflow_location): filename = matrix_yaml['filename'] original_matrix_job = json.dumps(matrix_job, indent=None, separators=(', ', ': ')) original_matrix_job = original_matrix_job.replace('"', '') return { 'filename': filename, 'workflow_name': workflow_name, 'workflow_location': workflow_location, 'original_matrix_job': original_matrix_job } def remove_skip_test_jobs(matrix_jobs): '''Remove jobs defined in `matrix_file.skip_test_jobs`.''' new_matrix_jobs = [] for matrix_job in matrix_jobs: jobs = matrix_job['jobs'] new_jobs = set() for job in jobs: if not job in matrix_yaml['skip_test_jobs']: new_jobs.add(job) if new_jobs: new_matrix_job = copy.deepcopy(matrix_job) new_matrix_job['jobs'] = list(new_jobs) new_matrix_jobs.append(new_matrix_job) return new_matrix_jobs @static_result def get_excluded_matrix_jobs(): return parse_workflow_matrix_jobs(None, 'exclude') def apply_matrix_job_exclusion(matrix_job, exclusion): # Excluded tags to remove from unexploded tag categories: { tag: [exluded_value1, excluded_value2] } update_dict = {} for tag, excluded_values in exclusion.items(): # Not excluded if a specified tag isn't even present: if not tag in matrix_job: return matrix_job # print(f"tag: {tag}, excluded_values: {excluded_values}") # Some tags are left unexploded (e.g. 'jobs') to optimize scheduling, # so the values can be either a list or a single value. # Standardize to a list for comparison: if type(excluded_values) != list: excluded_values = [excluded_values] matrix_values = matrix_job[tag] if type(matrix_values) != list: matrix_values = [matrix_values] # Identify excluded values that are present in the matrix job for this tag: matched_tag_values = [value for value in matrix_values if value in excluded_values] # Not excluded if no values match for a tag: if not matched_tag_values: return matrix_job # If there is only a partial match to the matrix values, record the matches in the update_dict. # If the match is complete, do nothing. if len(matched_tag_values) < len(matrix_values): update_dict[tag] = matched_tag_values # If we get here, the matrix job matches and should be updated or excluded entirely. # If all tag matches are complete, then update_dict will be empty and the job should be excluded entirely if not update_dict: return None # If update_dict is populated, remove the matched values from the matrix job and return it. 
new_matrix_job = copy.deepcopy(matrix_job) for tag, values in update_dict.items(): for value in values: new_matrix_job[tag].remove(value) return new_matrix_job def remove_excluded_jobs(matrix_jobs): '''Remove jobs that match all tags in any of the exclusion matrix jobs.''' excluded = get_excluded_matrix_jobs() filtered_matrix_jobs = [] for matrix_job_orig in matrix_jobs: matrix_job = copy.deepcopy(matrix_job_orig) for exclusion in excluded: matrix_job = apply_matrix_job_exclusion(matrix_job, exclusion) if not matrix_job: break if matrix_job: filtered_matrix_jobs.append(matrix_job) return filtered_matrix_jobs def validate_required_tags(matrix_job): for tag in matrix_yaml['required_tags']: if tag not in matrix_job: raise Exception(error_message_with_matrix_job(matrix_job, f"Missing required tag '{tag}'")) all_tags = get_all_matrix_job_tags_sorted() for tag in matrix_job: if tag not in all_tags: raise Exception(error_message_with_matrix_job(matrix_job, f"Unknown tag '{tag}'")) if 'gpu' in matrix_job and matrix_job['gpu'] not in matrix_yaml['gpus']: raise Exception(error_message_with_matrix_job(matrix_job, f"Unknown gpu '{matrix_job['gpu']}'")) def set_default_tags(matrix_job): generic_defaults = set(matrix_yaml['defaulted_tags']) generic_defaults -= set(['os']) # handled specially. for tag in generic_defaults: if tag not in matrix_job: matrix_job[tag] = matrix_yaml['default_'+tag] def set_derived_tags(matrix_job): if 'os' not in matrix_job: matrix_job['os'] = lookup_os(matrix_job['ctk'], matrix_job['cxx']) # Expand nvcc device compiler shortcut: if matrix_job['cudacxx'] == 'nvcc': matrix_job['cudacxx'] = {'name': 'nvcc', 'version': matrix_job['ctk'], 'exe': 'nvcc'} if 'sm' in matrix_job and matrix_job['sm'] == 'gpu': if not 'gpu' in matrix_job: raise Exception(error_message_with_matrix_job(matrix_job, f"\"sm: 'gpu'\" requires tag 'gpu'.")) if not matrix_job['gpu'] in matrix_yaml['gpu_sm']: raise Exception(error_message_with_matrix_job(matrix_job, f"Missing matrix.yaml 'gpu_sm' entry for gpu '{matrix_job['gpu']}'")) matrix_job['sm'] = matrix_yaml['gpu_sm'][matrix_job['gpu']] if 'std' in matrix_job and matrix_job['std'] == 'all': host_compiler = matrix_job['cxx'] if 'cxx' in matrix_job else None device_compiler = matrix_job['cudacxx'] if 'cudacxx' in matrix_job else None project = matrix_job['project'] if 'project' in matrix_job else None matrix_job['std'] = lookup_supported_stds(device_compiler, host_compiler, project) if matrix_job['project'] in matrix_yaml['project_expanded_tests'] and 'test' in matrix_job['jobs']: matrix_job['jobs'].remove('test') matrix_job['jobs'] += matrix_yaml['project_expanded_tests'][matrix_job['project']] if (not 'build' in matrix_job['jobs'] and any([job in matrix_job['jobs'] for job in matrix_yaml['build_required_jobs']])): matrix_job['jobs'].append('build') def next_explode_tag(matrix_job): for tag in matrix_job: if not tag in matrix_yaml['non_exploded_tags'] and isinstance(matrix_job[tag], list): return tag return None def explode_tags(matrix_job, explode_tag=None): if not explode_tag: explode_tag = next_explode_tag(matrix_job) if not explode_tag: return [matrix_job] result = [] for value in matrix_job[explode_tag]: new_job = copy.deepcopy(matrix_job) new_job[explode_tag] = value result.extend(explode_tags(new_job)) return result def preprocess_matrix_jobs(matrix_jobs, explode_only=False): result = [] if explode_only: for matrix_job in matrix_jobs: result.extend(explode_tags(matrix_job)) else: for matrix_job in matrix_jobs: validate_required_tags(matrix_job) 
set_default_tags(matrix_job) for job in explode_tags(matrix_job): set_derived_tags(job) # The derived tags may need to be exploded again: result.extend(explode_tags(job)) return result def parse_workflow_matrix_jobs(args, workflow_name): # Special handling for exclusion matrix: don't validate, add default, etc. Only explode. is_exclusion_matrix = (workflow_name == 'exclude') if not workflow_name in matrix_yaml['workflows']: if (is_exclusion_matrix): # Valid, no exclusions if not defined return [] raise Exception(f"Workflow '{workflow_name}' not found in matrix file '{matrix_yaml['filename']}'") matrix_jobs = matrix_yaml['workflows'][workflow_name] if not matrix_jobs or len(matrix_jobs) == 0: return [] workflow_line_number = find_workflow_line_number(workflow_name) # Tag with the original matrix info, location, etc. for error messages and post-processing. # Do this first so the original tags / order /idx match the inpt object exactly. if not is_exclusion_matrix: for idx, matrix_job in enumerate(matrix_jobs): workflow_location = f"{matrix_yaml['filename']}:{workflow_line_number} (job {idx + 1})" matrix_job['origin'] = get_matrix_job_origin(matrix_job, workflow_name, workflow_location) # Fill in default values, explode lists. matrix_jobs = preprocess_matrix_jobs(matrix_jobs, explode_only=is_exclusion_matrix) if args: if args.skip_tests: matrix_jobs = remove_skip_test_jobs(matrix_jobs) if args.dirty_projects: matrix_jobs = [job for job in matrix_jobs if job['project'] in args.dirty_projects] # Don't remove excluded jobs if we're currently parsing them: if not is_exclusion_matrix: matrix_jobs = remove_excluded_jobs(matrix_jobs) # Sort the tags by, *ahem*, "importance": sorted_tags = get_all_matrix_job_tags_sorted() matrix_jobs = [{tag: matrix_job[tag] for tag in sorted_tags if tag in matrix_job} for matrix_job in matrix_jobs] return matrix_jobs def parse_workflow_dispatch_groups(args, workflow_name): # Add origin information to each matrix job, explode, filter, add defaults, etc. # The resulting matrix_jobs list is a complete and standardized list of jobs for the dispatch_group builder. matrix_jobs = parse_workflow_matrix_jobs(args, workflow_name) # If we're printing multiple workflows, add a prefix to the group name to differentiate them. 
group_prefix = f"[{workflow_name}] " if len(args.workflows) > 1 else "" # Convert the matrix jobs into a dispatch group object: workflow_dispatch_groups = {} for matrix_job in matrix_jobs: matrix_job_dispatch_group = matrix_job_to_dispatch_group(matrix_job, group_prefix) merge_dispatch_groups(workflow_dispatch_groups, matrix_job_dispatch_group) return workflow_dispatch_groups def write_outputs(final_workflow): job_list = [] runner_counts = {} id_to_full_job_name = {} total_jobs = 0 def process_job_array(group_name, array_name, parent_json): nonlocal job_list nonlocal runner_counts nonlocal total_jobs job_array = parent_json[array_name] if array_name in parent_json else [] for job_json in job_array: total_jobs += 1 job_list.append(f"{total_jobs:4} id: {job_json['id']:<4} {array_name:13} {job_json['name']}") id_to_full_job_name[job_json['id']] = f"{group_name} {job_json['name']}" runner = job_json['runner'] runner_counts[runner] = runner_counts.get(runner, 0) + 1 for group_name, group_json in final_workflow.items(): job_list.append(f"{'':4} {group_name}:") process_job_array(group_name, 'standalone', group_json) if 'two_stage' in group_json: for two_stage_json in group_json['two_stage']: process_job_array(group_name, 'producers', two_stage_json) process_job_array(group_name, 'consumers', two_stage_json) # Sort by descending counts: runner_counts = {k: v for k, v in sorted(runner_counts.items(), key=lambda item: item[1], reverse=True)} runner_heading = f"🏃 Runner counts (total jobs: {total_jobs})" runner_counts_table = f"| {'#':^4} | Runner\n" runner_counts_table += "|------|------\n" for runner, count in runner_counts.items(): runner_counts_table += f"| {count:4} | `{runner}`\n" runner_json = {"heading": runner_heading, "body": runner_counts_table} os.makedirs("workflow", exist_ok=True) write_json_file("workflow/workflow.json", final_workflow) write_json_file("workflow/workflow_keys.json", list(final_workflow.keys())) write_json_file("workflow/job_ids.json", id_to_full_job_name) write_text_file("workflow/job_list.txt", "\n".join(job_list)) write_json_file("workflow/runner_summary.json", runner_json) def write_override_matrix(override_matrix): os.makedirs("workflow", exist_ok=True) write_json_file("workflow/override.json", override_matrix) def print_gha_workflow(args): workflow_names = args.workflows if args.allow_override and 'override' in matrix_yaml['workflows']: override_matrix = matrix_yaml['workflows']['override'] if override_matrix and len(override_matrix) > 0: print(f"::notice::Using 'override' workflow instead of '{workflow_names}'") workflow_names = ['override'] write_override_matrix(override_matrix) final_workflow = {} for workflow_name in workflow_names: workflow_dispatch_groups = parse_workflow_dispatch_groups(args, workflow_name) merge_dispatch_groups(final_workflow, workflow_dispatch_groups) final_workflow = finalize_workflow_dispatch_groups(final_workflow) write_outputs(final_workflow) def print_devcontainer_info(args): devcontainer_version = matrix_yaml['devcontainer_version'] matrix_jobs = [] # Remove the `exclude` and `override` entries: ignored_matrix_keys = ['exclude', 'override'] workflow_names = [key for key in matrix_yaml['workflows'].keys() if key not in ignored_matrix_keys] for workflow_name in workflow_names: matrix_jobs.extend(parse_workflow_matrix_jobs(args, workflow_name)) # Remove all but the following keys from the matrix jobs: keep_keys = ['ctk', 'cxx', 'os'] combinations = [{key: job[key] for key in keep_keys} for job in matrix_jobs] # Remove duplicates
and filter out windows jobs: unique_combinations = [] for combo in combinations: if not is_windows(combo) and combo not in unique_combinations: unique_combinations.append(combo) for combo in unique_combinations: combo['compiler_name'] = combo['cxx']['name'] combo['compiler_version'] = combo['cxx']['version'] combo['compiler_exe'] = combo['cxx']['exe'] del combo['cxx'] combo['cuda'] = combo['ctk'] del combo['ctk'] devcontainer_json = {'devcontainer_version': devcontainer_version, 'combinations': unique_combinations} # Pretty print the devcontainer json to stdout: print(json.dumps(devcontainer_json, indent=2)) def main(): parser = argparse.ArgumentParser(description='Compute matrix for workflow') parser.add_argument('matrix_file', help='Path to the matrix YAML file') parser_mode_group = parser.add_argument_group('Output Mode', "Must specify one of these options.") parser_mode = parser_mode_group.add_mutually_exclusive_group(required=True) parser_mode.add_argument('--workflows', nargs='+', help='Print GHA workflow with jobs from [pull_request, nightly, weekly, etc]') parser_mode.add_argument('--devcontainer-info', action='store_true', help='Print devcontainer info instead of GHA workflows.') parser.add_argument('--dirty-projects', nargs='*', help='Filter jobs to only these projects') parser.add_argument('--skip-tests', action='store_true', help='Remove jobs defined in `matrix_file.skip_test_jobs`.') parser.add_argument('--allow-override', action='store_true', help='If a non-empty "override" workflow exists, it will be used instead of those in --workflows.') args = parser.parse_args() # Check if the matrix file exists if not os.path.isfile(args.matrix_file): print(f"Error: Matrix file '{args.matrix_file}' does not exist.") sys.exit(1) with open(args.matrix_file, 'r') as f: global matrix_yaml matrix_yaml = yaml.safe_load(f) matrix_yaml['filename'] = args.matrix_file if args.workflows: print_gha_workflow(args) elif args.devcontainer_info: print_devcontainer_info(args) else: parser.print_usage() sys.exit(1) if __name__ == '__main__': main() cccl-2.5.0/.github/actions/workflow-results/000077500000000000000000000000001463375617100210105ustar00rootroot00000000000000cccl-2.5.0/.github/actions/workflow-results/action.yml000066400000000000000000000127441463375617100230200ustar00rootroot00000000000000name: "CCCL Workflow Sentinal" description: "Check the results of the dispatched jobs and comment on the PR." inputs: github_token: description: "The GitHub token to use for commenting on the PR. No comment will be made if not provided." required: false pr_number: description: "The PR number to comment on, if applicable. No comment will be made if not provided." required: false outputs: success: description: "Whether any jobs failed." value: ${{ steps.check-dispatch.outputs.success }} runs: using: "composite" steps: - name: Download workflow artifacts uses: actions/download-artifact@v3 with: name: workflow path: workflow/ - name: Download job artifacts continue-on-error: true # This may fail if no jobs succeed. The checks below will catch this. 
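      # Note: the 'jobs' artifact is expected to hold the per-job result files consumed by the steps below (e.g. jobs/<id>/success, jobs/<id>/sccache_stats.json).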
uses: actions/download-artifact@v3 with: name: jobs path: jobs - name: Clean up job artifacts continue-on-error: true shell: bash --noprofile --norc -euo pipefail {0} run: | # Fix artifacts written on windows: echo "::group::Fixing line endings in job artifacts" sudo apt-get update sudo apt-get install -y dos2unix find jobs -type f -exec dos2unix -v {} \; echo "::endgroup::" echo "::group::Job artifacts" tree jobs echo "::endgroup::" - name: Fetch workflow job info if: ${{ inputs.github_token != ''}} continue-on-error: true uses: actions/github-script@v4 with: github-token: ${{ inputs.github_token }} script: | const fs = require('fs'); const owner = context.repo.owner; const repo = context.repo.repo; const runId = context.runId; github.paginate( 'GET /repos/{owner}/{repo}/actions/runs/{run_id}/jobs', { owner: owner, repo: repo, run_id: runId } ) .then(jobs => { console.log('::group::Jobs JSON'); console.log(JSON.stringify(jobs, null, 2)); console.log('::endgroup::'); fs.mkdirSync("results", { recursive: true }); fs.writeFileSync('results/jobs.json', JSON.stringify(jobs, null, 2)); console.log(`Fetched ${jobs.length} jobs and saved to results/jobs.json`); }) .catch(error => { console.error(error); }); - name: Parse job times continue-on-error: true shell: bash --noprofile --norc -euo pipefail {0} run: | echo "Parsing job times..." python3 "${GITHUB_ACTION_PATH}/parse-job-times.py" workflow/workflow.json results/jobs.json - name: Prepare execution summary continue-on-error: true shell: bash --noprofile --norc -euo pipefail {0} run: | echo "Generating execution summary..." python3 "${GITHUB_ACTION_PATH}/prepare-execution-summary.py" workflow/workflow.json results/job_times.json - name: Prepare final summary id: final-summary continue-on-error: true shell: bash --noprofile --norc -euo pipefail {0} run: | echo "::group::Final Summary" python3 "${GITHUB_ACTION_PATH}/final-summary.py" | tee final_summary.md echo "::endgroup::" # This allows multiline strings and special characters to be passed through the GHA outputs: url_encode_string() { python3 -c "import sys; from urllib.parse import quote; print(quote(sys.stdin.read()))" } echo "::group::GHA Output: SUMMARY" printf "SUMMARY=%s\n" "$(cat final_summary.md | url_encode_string)" | tee -a "${GITHUB_OUTPUT}" echo "::endgroup::" cp final_summary.md ${GITHUB_STEP_SUMMARY} - name: Comment on PR if: ${{ !cancelled() && inputs.pr_number != '' && inputs.github_token != ''}} continue-on-error: true env: PR_NUMBER: ${{ fromJSON(inputs.pr_number) }} COMMENT_BODY: ${{ steps.final-summary.outputs.SUMMARY }} uses: actions/github-script@v4 with: github-token: ${{ inputs.github_token }} script: | const pr_number = process.env.PR_NUMBER; const owner = context.repo.owner; const repo = context.repo.repo; // Decode URL-encoded string for proper display in comments const commentBody = decodeURIComponent(process.env.COMMENT_BODY); console.log('::group::Commenting on PR #' + pr_number + ' with the following message:') console.log(commentBody); console.log('::endgroup::'); github.issues.createComment({ owner: owner, repo: repo, issue_number: pr_number, body: commentBody }); - name: Check for job success id: check-dispatch shell: bash --noprofile --norc -euo pipefail {0} run: | echo "::group::Checking for success artifacts" "${GITHUB_ACTION_PATH}/verify-job-success.py" workflow/job_ids.json result=$? 
echo "::endgroup::" if [[ $result -ne 0 ]]; then echo "success=false" >> "${GITHUB_OUTPUT}" exit 1 fi if [ -f workflow/override.json ]; then echo "::notice::Workflow matrix was overridden. Failing jobs." echo "Override matrix:" cat workflow/override.json | jq -c '.' echo "success=false" >> "${GITHUB_OUTPUT}" exit 1 fi echo "success=true" >> "${GITHUB_OUTPUT}" cccl-2.5.0/.github/actions/workflow-results/final-summary.py000077500000000000000000000034571463375617100241620ustar00rootroot00000000000000#!/usr/bin/env python3 import json import os import re import sys def read_file(filepath): with open(filepath, 'r') as f: return f.read().rstrip("\n ") def print_text_file(filepath): if os.path.exists(filepath): print(read_file(filepath) + "\n\n") def print_json_summary(summary, heading_level): print(f"
{summary['heading']}\n") print(summary["body"] + "\n") print("
\n") def print_summary_file(filepath, heading_level): if os.path.exists(filepath): with open(filepath, 'r') as f: print_json_summary(json.load(f), heading_level) def print_json_file(filepath, heading): if os.path.exists(filepath): json_data = json.load(open(filepath)) print(f"

{heading}

\n") print('```json') print(json.dumps(json_data, indent=2)) print('```') print("
\n") def main(): # Parse project summaries and sort them by the number of failed jobs: projects = [] project_file_regex = "[0-9]+_.+_summary.json" for filename in sorted(os.listdir("execution/projects")): match = re.match(project_file_regex, filename) if match: with open(f"execution/projects/{filename}", 'r') as f: projects.append(json.load(f)) print(f"
{read_file('execution/heading.txt')}\n") print("
    ") for project in projects: print("
  • ") print_json_summary(project, 3) print("
\n") print_json_file('workflow/override.json', '🛠️ Override Matrix') print_text_file('workflow/changes.md') print_summary_file("workflow/runner_summary.json", 2) print("
") if __name__ == '__main__': main() cccl-2.5.0/.github/actions/workflow-results/parse-job-times.py000077500000000000000000000112001463375617100243600ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import datetime import json import os import sys def get_jobs_json(jobs_file): # Return the contents of jobs.json with open(jobs_file) as f: result = json.load(f) return result def get_workflow_json(workflow_file): # Return the contents of ~/cccl/.local/tmp/workflow.json with open(workflow_file) as f: return json.load(f) def write_json(filepath, json_object): with open(filepath, 'w') as f: json.dump(json_object, f, indent=4) def generate_job_id_map(workflow): '''Map full job name to job id''' job_id_map = {} for group_name, group_json in workflow.items(): standalone = group_json['standalone'] if 'standalone' in group_json else [] for job in standalone: name = f"{group_name} / s.{job['id']} / {job['name']}" job_id_map[name] = job['id'] two_stage = group_json['two_stage'] if 'two_stage' in group_json else [] for pc in two_stage: producers = pc['producers'] for job in producers: name = f"{group_name} / t.{pc['id']} / p.{job['id']} / {job['name']}" job_id_map[name] = job['id'] consumers = pc['consumers'] for job in consumers: name = f"{group_name} / t.{pc['id']} / c.{job['id']} / {job['name']}" job_id_map[name] = job['id'] return job_id_map def main(): # Accept two command line arguments: parser = argparse.ArgumentParser(description='Parse job times') parser.add_argument('workflow', type=str, help='Path to workflow.json') parser.add_argument('jobs', type=str, help='Path to jobs.json') args = parser.parse_args() jobs = get_jobs_json(args.jobs) workflow = get_workflow_json(args.workflow) # Converts full github job names into job ids: job_id_map = generate_job_id_map(workflow) # Map of id -> { } result = {} unknown_jobs = [job for job in jobs if job['name'] not in job_id_map] jobs = [job for job in jobs if job['name'] in job_id_map] # Process jobs: for job in jobs: name = job['name'] id = job_id_map[name] # Job times are 2024-05-09T06:52:20Z started_at = job['started_at'] started_time = datetime.datetime.strptime(started_at, "%Y-%m-%dT%H:%M:%SZ") started_time_epoch_secs = started_time.timestamp() completed_at = job['completed_at'] completed_time = datetime.datetime.strptime(completed_at, "%Y-%m-%dT%H:%M:%SZ") completed_time_epoch_secs = completed_time.timestamp() job_seconds = (completed_time - started_time).total_seconds() job_duration = str(datetime.timedelta(seconds=job_seconds)) result[id] = {} result[id]['name'] = name result[id]['started_at'] = started_at result[id]['completed_at'] = completed_at result[id]['started_epoch_secs'] = started_time_epoch_secs result[id]['completed_epoch_secs'] = completed_time_epoch_secs result[id]['job_duration'] = job_duration result[id]['job_seconds'] = job_seconds # Find the "Run command" step and record its duration: command_seconds = 0 for step in job['steps']: if step['name'].lower() == "run command": step_started_at = step['started_at'] step_started_time = datetime.datetime.strptime(step_started_at, "%Y-%m-%dT%H:%M:%SZ") step_completed_at = step['completed_at'] step_completed_time = datetime.datetime.strptime(step_completed_at, "%Y-%m-%dT%H:%M:%SZ") command_seconds = (step_completed_time - step_started_time).total_seconds() break command_duration = str(datetime.timedelta(seconds=command_seconds)) result[id]['command_seconds'] = command_seconds result[id]['command_duration'] = command_duration os.makedirs("results", exist_ok=True) 
write_json("results/job_times.json", result) print("::group::Unmapped jobs") print("\n".join([job['name'] for job in unknown_jobs])) print("::endgroup::") print("::group::Job times") print(f"{'Job':^10} {'Command':^10} {'Overhead':^10} Name") print(f"{'-'*10} {'-'*10} {'-'*10} {'-'*10}") for id, stats in result.items(): job_seconds = stats['job_seconds'] command_seconds = stats['command_seconds'] overhead = (job_seconds - command_seconds) * 100 / command_seconds print(f"{stats['job_duration']:10} {stats['command_duration']:10} {overhead:10.0f} {stats['name']}") print("::endgroup::") if __name__ == "__main__": main() cccl-2.5.0/.github/actions/workflow-results/prepare-execution-summary.py000077500000000000000000000262711463375617100265270ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import functools import json import os import re import sys def job_succeeded(job): # The job was successful if the success file exists: return os.path.exists(f'jobs/{job["id"]}/success') def natural_sort_key(key): # Natural sort impl (handles embedded numbers in strings, case insensitive) return [(int(text) if text.isdigit() else text.lower()) for text in re.split('(\d+)', key)] # Print the prepared text summary to the file at the given path def write_text(filepath, summary): with open(filepath, 'w') as f: print(summary, file=f) # Print the prepared JSON object to the file at the given path def write_json(filepath, json_object): with open(filepath, 'w') as f: json.dump(json_object, f, indent=4) def extract_jobs(workflow): jobs = [] for group_name, group in workflow.items(): if "standalone" in group: jobs += group["standalone"] if "two_stage" in group: for two_stage in group["two_stage"]: jobs += two_stage["producers"] jobs += two_stage["consumers"] return jobs @functools.lru_cache(maxsize=None) def get_sccache_stats(job_id): sccache_file = f'jobs/{job_id}/sccache_stats.json' if os.path.exists(sccache_file): with open(sccache_file) as f: return json.load(f) return None def update_summary_entry(entry, job, job_times=None): if 'passed' not in entry: entry['passed'] = 0 if 'failed' not in entry: entry['failed'] = 0 if job_succeeded(job): entry['passed'] += 1 else: entry['failed'] += 1 if job_times: time_info = job_times[job["id"]] job_time = time_info["job_seconds"] command_time = time_info["command_seconds"] if not 'job_time' in entry: entry['job_time'] = 0 if not 'command_time' in entry: entry['command_time'] = 0 if not 'max_job_time' in entry: entry['max_job_time'] = 0 entry['job_time'] += job_time entry['command_time'] += command_time entry['max_job_time'] = max(entry['max_job_time'], job_time) sccache_stats = get_sccache_stats(job["id"]) if sccache_stats: sccache_stats = sccache_stats['stats'] requests = sccache_stats.get('compile_requests', 0) hits = 0 if 'cache_hits' in sccache_stats: cache_hits = sccache_stats['cache_hits'] if 'counts' in cache_hits: counts = cache_hits['counts'] for lang, lang_hits in counts.items(): hits += lang_hits if 'sccache' not in entry: entry['sccache'] = {'requests': requests, 'hits': hits} else: entry['sccache']['requests'] += requests entry['sccache']['hits'] += hits return entry def build_summary(jobs, job_times=None): summary = {'projects': {}} projects = summary['projects'] for job in jobs: update_summary_entry(summary, job, job_times) matrix_job = job["origin"]["matrix_job"] project = matrix_job["project"] if not project in projects: projects[project] = {'tags': {}} tags = projects[project]['tags'] update_summary_entry(projects[project], job, job_times) 
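        # Below, the job is also tallied once per tag and once per tag value, building the projects -> tags -> values tree that get_project_summary_body() renders ('project' itself is skipped as a tag).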
for tag in matrix_job.keys(): if tag == 'project': continue if not tag in tags: tags[tag] = {'values': {}} values = tags[tag]['values'] update_summary_entry(tags[tag], job, job_times) value = str(matrix_job[tag]) if not value in values: values[value] = {} update_summary_entry(values[value], job, job_times) # Natural sort the value strings within each tag: for project, project_summary in projects.items(): for tag, tag_summary in project_summary['tags'].items(): tag_summary['values'] = dict(sorted(tag_summary['values'].items(), key=lambda item: natural_sort_key(item[0]))) # Sort the tags within each project so that: # - "Likely culprits" come first. These are tags that have multiple values, but only one has failures. # - Tags with multiple values and mixed pass/fail results come next. # - Tags with all failing values come next. # - Tags with no failures are last. def rank_tag(tag_summary): tag_failures = tag_summary['failed'] num_values = len(tag_summary['values']) num_failing_values = sum(1 for value_summary in tag_summary['values'].values() if value_summary['failed'] > 0) if num_values > 1: if num_failing_values == 1: return 0 elif num_failing_values > 0 and num_failing_values < num_values: return 1 elif tag_failures > 0: return 2 return 3 for project, project_summary in projects.items(): project_summary['tags'] = dict(sorted(project_summary['tags'].items(), key=lambda item: (rank_tag(item[1]), item[0]))) return summary def get_walltime(job_times): "Return the walltime for all jobs in seconds." start = None end = None for job_id, job_time in job_times.items(): job_start_timestamp = job_time['started_epoch_secs'] job_end_timestamp = job_time['completed_epoch_secs'] if not start or job_start_timestamp < start: start = job_start_timestamp if not end or job_end_timestamp > end: end = job_end_timestamp return end - start def format_seconds(seconds): days, remainder = divmod(seconds, 86400) hours, remainder = divmod(remainder, 3600) minutes, seconds = divmod(remainder, 60) days = int(days) hours = int(hours) minutes = int(minutes) seconds = int(seconds) if (days > 0): return f'{days}d {hours:02}h' elif (hours > 0): return f'{hours}h {minutes:02}m' else: return f'{minutes}m {seconds:02}s' def get_summary_stats(summary): passed = summary['passed'] failed = summary['failed'] total = passed + failed percent = int(100 * passed / total) if total > 0 else 0 pass_string = f'Pass: {percent:>3}%/{total}' stats = f'{pass_string:<14}' if 'job_time' in summary and total > 0 and summary['job_time'] > 0: job_time = summary['job_time'] max_job_time = summary['max_job_time'] total_job_duration = format_seconds(job_time) avg_job_duration = format_seconds(job_time / total) max_job_duration = format_seconds(max_job_time) stats += f' | Total: {total_job_duration:>7} | Avg: {avg_job_duration:>7} | Max: {max_job_duration:>7}' if 'sccache' in summary: sccache = summary['sccache'] requests = sccache["requests"] hits = sccache["hits"] hit_percent = int(100 * hits / requests) if requests > 0 else 0 hit_string = f'Hits: {hit_percent:>3}%/{requests}' stats += f' | {hit_string:<17}' return stats def get_summary_heading(summary, walltime): passed = summary['passed'] failed = summary['failed'] if summary['passed'] == 0: flag = '🟥' elif summary['failed'] > 0: flag = '🟨' else: flag = '🟩' return f'{flag} CI finished in {walltime}: {get_summary_stats(summary)}' def get_project_heading(project, project_summary): if project_summary['passed'] == 0: flag = '🟥' elif project_summary['failed'] > 0: flag = '🟨' else: flag = '🟩' return 
f'{flag} {project}: {get_summary_stats(project_summary)}' def get_tag_line(tag, tag_summary): passed = tag_summary['passed'] failed = tag_summary['failed'] values = tag_summary['values'] # Find the value with a failure rate that matches the tag's failure rate: suspicious = None if len(values) > 1 and failed > 0: for value, value_summary in values.items(): if value_summary['failed'] == failed: suspicious = value_summary suspicious['name'] = value break # Did any jobs with this value pass? likely_culprit = suspicious if suspicious and suspicious['passed'] == 0 else None note = '' if likely_culprit: flag = '🚨' note = f': {likely_culprit["name"]} {flag}' elif suspicious: flag = '🔍' note = f': {suspicious["name"]} {flag}' elif passed == 0: flag = '🟥' elif failed > 0: flag = '🟨' else: flag = '🟩' return f'{flag} {tag}{note}' def get_value_line(value, value_summary, tag_summary): passed = value_summary['passed'] failed = value_summary['failed'] total = passed + failed parent_size = len(tag_summary['values']) parent_failed = tag_summary['failed'] is_suspicious = failed > 0 and failed == parent_failed and parent_size > 1 is_likely_culprit = is_suspicious and passed == 0 if is_likely_culprit: flag = '🔥' elif is_suspicious: flag = '🔍' elif passed == 0: flag = '🟥' elif failed > 0: flag = '🟨' else: flag = '🟩' left_aligned = f"{flag} {value}" return f' {left_aligned:<20} {get_summary_stats(value_summary)}' def get_project_summary_body(project, project_summary): body = ['```'] for tag, tag_summary in project_summary['tags'].items(): body.append(get_tag_line(tag, tag_summary)) for value, value_summary in tag_summary['values'].items(): body.append(get_value_line(value, value_summary, tag_summary)) body.append('```') return "\n".join(body) def write_project_summary(idx, project, project_summary): heading = get_project_heading(project, project_summary) body = get_project_summary_body(project, project_summary) summary = {'heading': heading, 'body': body} write_json(f'execution/projects/{idx:03}_{project}_summary.json', summary) def write_workflow_summary(workflow, job_times=None): summary = build_summary(extract_jobs(workflow), job_times) walltime = format_seconds(get_walltime(job_times)) if job_times else '[unknown]' os.makedirs('execution/projects', exist_ok=True) write_text('execution/heading.txt', get_summary_heading(summary, walltime)) # Sort summary projects so that projects with failures come first, and ties # are broken by the total number of jobs: def sort_project_key(project_summary): failed = project_summary[1]['failed'] total = project_summary[1]['passed'] + failed return (-failed, -total) for i, (project, project_summary) in enumerate(sorted(summary['projects'].items(), key=sort_project_key)): write_project_summary(i, project, project_summary) def main(): parser = argparse.ArgumentParser() parser.add_argument('workflow', type=argparse.FileType('r')) parser.add_argument('job_times', type=argparse.FileType('r')) args = parser.parse_args() workflow = json.load(args.workflow) # The timing file is not required.
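    # If the timing file is missing or unreadable we fall back to None; timing columns are then omitted from the stats and the walltime is reported as '[unknown]'.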
try: job_times = json.load(args.job_times) except: job_times = None write_workflow_summary(workflow, job_times) if __name__ == '__main__': main() cccl-2.5.0/.github/actions/workflow-results/verify-job-success.py000077500000000000000000000013351463375617100251110ustar00rootroot00000000000000#!/usr/bin/env python3 import argparse import json import os import sys def main(): parser = argparse.ArgumentParser() parser.add_argument("job_id_map", type=argparse.FileType('r')) args = parser.parse_args() job_id_map = json.load(args.job_id_map) # For each job id, verify that the success artifact exists success = True for job_id, job_name in job_id_map.items(): success_file = f'jobs/{job_id}/success' print(f'Verifying job with id "{job_id}": "{job_name}"') if not os.path.exists(success_file): print(f'Failed: Artifact "{success_file}" not found') success = False if not success: sys.exit(1) if __name__ == '__main__': main() cccl-2.5.0/.github/copy-pr-bot.yaml000066400000000000000000000002541463375617100170370ustar00rootroot00000000000000# Configuration file for `copy-pr-bot` GitHub App # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ enabled: true additional_trustees: - ahendriksen - gonzalobg cccl-2.5.0/.github/problem-matchers/000077500000000000000000000000001463375617100172435ustar00rootroot00000000000000cccl-2.5.0/.github/problem-matchers/problem-matcher.json000066400000000000000000000003601463375617100232160ustar00rootroot00000000000000{ "problemMatcher": [ { "owner": "nvcc", "pattern": [ { "regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$", "severity": 4, "message": 5 } ] } ] } cccl-2.5.0/.github/workflows/000077500000000000000000000000001463375617100160345ustar00rootroot00000000000000cccl-2.5.0/.github/workflows/backport-prs.yml000066400000000000000000000017411463375617100211710ustar00rootroot00000000000000name: Backport merged pull request on: pull_request_target: types: [closed] issue_comment: types: [created] permissions: contents: write # so it can comment pull-requests: write # so it can create pull requests jobs: backport: name: Backport pull request runs-on: ubuntu-latest # Only run when pull request is merged # or when a comment containing `/backport` is created by a maintainer if: > ( github.event_name == 'pull_request_target' && github.event.pull_request.merged ) || ( github.event_name == 'issue_comment' && github.event.issue.pull_request && contains(fromJSON('["MEMBER", "COLLABORATOR", "OWNER"]'), github.event.comment.author_association) && contains(github.event.comment.body, '/backport') ) steps: - uses: actions/checkout@v3 - name: Create backport pull requests uses: korthout/backport-action@v1 with: merge_commits: 'skip' cccl-2.5.0/.github/workflows/build-docs.yml000066400000000000000000000045021463375617100206050ustar00rootroot00000000000000name: Deploy CCCL pages on: # Runs on pushes targeting the default branch push: branches: ["main"] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages permissions: contents: read pages: write id-token: write # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
concurrency: group: "pages" cancel-in-progress: false jobs: # Build job build: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - name: Setup Pages uses: actions/configure-pages@v3 # Build helper image for Thrust/CUB - name: Build helper image run: | bash ./docs/make_env.bash "cccl:docs" # Build top level docs for CCCL - name: Build landing page uses: actions/jekyll-build-pages@v1 with: source: ./docs/jekyll destination: ./_site # CUB - name: Build CUB docs run: | bash ./docs/build_docs.bash "cccl:docs" /cccl/cub/docs/gen_docs.bash sudo mkdir -p ./_site/cub sudo cp -rf ./cub/docs/_build/docs/CUB/latest/* ./_site/cub # Libcudacxx - name: Build libcudacxx docs uses: actions/jekyll-build-pages@v1 with: source: ./libcudacxx/docs destination: ./_site/libcudacxx # Thrust - name: Build Thrust markdown in Docker run: bash ./docs/build_docs.bash "cccl:docs" /cccl/thrust/docs/build_docs_locally.bash - name: Build Thrust docs uses: actions/jekyll-build-pages@v1 with: source: ./thrust/build_docs/github_pages destination: ./_site/thrust # Upload build artifacts - name: Upload artifact uses: actions/upload-pages-artifact@v2 # Deployment job deploy: environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest needs: build steps: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v2 cccl-2.5.0/.github/workflows/ci-workflow-nightly.yml000066400000000000000000000070421463375617100225010ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This is the main workflow that runs on every PR and push to main name: nightly defaults: run: shell: bash --noprofile --norc -euo pipefail {0} on: schedule: - cron: '0 7 * * *' # 7AM UTC, 12AM PST, 3AM EST concurrency: group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }} jobs: build-workflow: name: Build workflow from matrix runs-on: ubuntu-latest permissions: contents: read outputs: workflow: ${{ steps.build-workflow.outputs.workflow }} workflow_keys: ${{ steps.build-workflow.outputs.workflow_keys }} steps: - name: Checkout repo uses: actions/checkout@v3 - name: Build workflow id: build-workflow uses: ./.github/actions/workflow-build with: workflows: nightly run-workflow: name: ${{ matrix.name }} needs: build-workflow permissions: id-token: write contents: read strategy: fail-fast: false matrix: name: ${{ fromJSON(needs.build-workflow.outputs.workflow_keys) }} uses: ./.github/workflows/workflow-dispatch.yml with: name: ${{ matrix.name }} jobs: ${{ toJSON(fromJSON(needs.build-workflow.outputs.workflow)[matrix.name]) }} # This job acts as a sentry and will fail if any leaf job in the workflow tree fails, as # run-workflow always succeeds. Use this job when checking for successful matrix workflow job completion. 
verify-workflow: name: Verify and summarize workflow results if: ${{ always() && !cancelled() }} needs: - build-workflow - run-workflow permissions: contents: read runs-on: ubuntu-latest steps: - name: Checkout repo uses: actions/checkout@v3 - name: Check workflow success id: check-workflow uses: ./.github/actions/workflow-results # Check all other job statuses. This job gates branch protection checks. ci: name: CI # !! Important: This job is used for branch protection checks. # !! Need to use always() instead of !cancelled() because skipped jobs count as success # !! for Github branch protection checks. Yes, really: by default, branch protections # !! can be bypassed by cancelling CI. See NVIDIA/cccl#605. if: ${{ always() }} needs: - verify-workflow runs-on: ubuntu-latest steps: - name: Check results run: | status="success" check_result() { name=$1 expected=$2 result=$3 echo "Checking if $name job result ('$result') is '$expected'..." if [[ "$result" != "$expected" ]]; then echo "$name job failed" status="failed" fi } check_result "verify-workflow" "success" "${{needs.verify-workflow.result}}" if [[ "$status" != "success" ]]; then exit 1 fi cccl-2.5.0/.github/workflows/ci-workflow-pull-request.yml000066400000000000000000000117161463375617100234700ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# This is the main workflow that runs on every PR and push to main name: pull_request defaults: run: shell: bash --noprofile --norc -euo pipefail {0} on: push: branches: - "pull-request/[0-9]+" concurrency: group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }} cancel-in-progress: true jobs: build-workflow: name: Build workflow from matrix runs-on: ubuntu-latest permissions: contents: read pull-requests: read outputs: workflow: ${{ steps.build-workflow.outputs.workflow }} workflow_keys: ${{ steps.build-workflow.outputs.workflow_keys }} steps: - name: Checkout repo uses: actions/checkout@v3 - name: Lookup PR info id: get-pr-info uses: nv-gha-runners/get-pr-info@main - name: Build workflow id: build-workflow uses: ./.github/actions/workflow-build env: pr_worflow: ${{ !contains(github.event.head_commit.message, '[workflow:!pull_request]') && 'pull_request' || '' }} nightly_workflow: ${{ contains(github.event.head_commit.message, '[workflow:nightly]') && 'nightly' || '' }} with: allow_override: "true" skip_tests: ${{ toJSON(contains(github.event.head_commit.message, '[skip-tests]')) }} inspect_changes_script: ${{ toJSON(!contains(github.event.head_commit.message, '[all-projects]') && 'ci/inspect_changes.sh' || '') }} inspect_changes_base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }} workflows: >- ${{ env.pr_worflow }} ${{ env.nightly_workflow }} run-workflow: name: ${{ matrix.name }} needs: build-workflow permissions: id-token: write contents: read strategy: fail-fast: false matrix: name: ${{ fromJSON(needs.build-workflow.outputs.workflow_keys) }} uses: ./.github/workflows/workflow-dispatch.yml with: name: ${{ matrix.name }} jobs: ${{ toJSON(fromJSON(needs.build-workflow.outputs.workflow)[matrix.name]) }} # This job acts as a sentry and will fail if any leaf job in the workflow tree fails, as # run-workflow always succeeds. Use this job when checking for successful matrix workflow job completion. verify-workflow: name: Verify and summarize workflow results if: ${{ always() && !cancelled() }} needs: - build-workflow - run-workflow permissions: contents: read pull-requests: write # Posts a comment back to the PR. runs-on: ubuntu-latest steps: - name: Checkout repo uses: actions/checkout@v3 - name: Get Base Branch from PR id: get-pr-info uses: nv-gha-runners/get-pr-info@main - name: Check workflow success id: check-workflow uses: ./.github/actions/workflow-results with: github_token: ${{ secrets.GITHUB_TOKEN }} pr_number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }} verify-devcontainers: name: Verify Dev Containers permissions: id-token: write contents: read uses: ./.github/workflows/verify-devcontainers.yml # Check all other job statuses. This job gates branch protection checks. ci: name: CI # !! Important: This job is used for branch protection checks. # !! Need to use always() instead of !cancelled() because skipped jobs count as success # !! for Github branch protection checks. Yes, really: by default, branch protections # !! can be bypassed by cancelling CI. See NVIDIA/cccl#605. if: ${{ always() }} needs: - verify-workflow - verify-devcontainers runs-on: ubuntu-latest steps: - name: Check results run: | status="success" check_result() { name=$1 expected=$2 result=$3 echo "Checking if $name job result ('$result') is '$expected'..." 
if [[ "$result" != "$expected" ]]; then echo "$name job failed" status="failed" fi } check_result "verify-workflow" "success" "${{needs.verify-workflow.result}}" check_result "verify-devcontainers" "success" "${{needs.verify-devcontainers.result}}" if [[ "$status" != "success" ]]; then exit 1 fi cccl-2.5.0/.github/workflows/project_automation_set_in_progress.yml000066400000000000000000000347741463375617100257710ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: Set PR and Linked Issues to In Progress on: pull_request_target: # Run this action when a PR is opened or edited # Issues do not have a graphQL connection to linked PRs so we can't use that event types: [opened, converted_to_draft] pull_request_review: # Run this action when a PR is reviewed types: [submitted] env: ORG: ${{ github.event.repository.owner.login }} PR_NUMBER: ${{ github.event.pull_request.number }} REPO: ${{ github.event.repository.name }} PR_GLOBAL_ID: ${{ github.event.pull_request.node_id}} # The environment vars below are hard-coded from external queries to save time + complexity here # Note: PVT means Project V2, not "Private" # PVT = Project V2, PVTSSF = Project V2 Single Select Field, PVTIF = Project V2 Iteration Field PROJECT_ID: "PVT_kwDOABpemM4AEhOI" STATUS_FIELD_ID: "PVTSSF_lADOABpemM4AEhOIzgCmnYc" WORKING_SPRINT_FIELD_ID: "PVTIF_lADOABpemM4AEhOIzgJlRho" START_SPRINT_FIELD_ID: "PVTIF_lADOABpemM4AEhOIzgJlRhU" IN_PROGRESS_PROJECT_OPTION_ID: "47fc9ee4" IN_REVIEW_PROJECT_OPTION_ID: "c6b49c6b" jobs: query_and_mutate_project_fields: runs-on: ubuntu-latest permissions: pull-requests: read steps: - name: Check if changes requested from a reviewer id: check_changes_requested if: github.event_name == 'pull_request_review' env: GH_TOKEN: ${{ github.token }} run: | if [ ${{ github.event.review.state }} != 'changes_requested' ]; then echo "Changes not requested, exiting" exit 0 fi continue-on-error: true - name: Generate token id: generate_token if: github.event_name == 'pull_request_target' uses: tibdex/github-app-token@v1.8.0 with: app_id: ${{ secrets.CCCL_AUTH_APP_ID }} private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - name: Wait 1 Second id: sleep if: github.event_name == 'pull_request_target' run: sleep 1 # We sleep here to ensure the pr is added to the project before we query for it - name: Select Status Field Value id: select_status_field_value if: github.event_name == 'pull_request_target' env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # If it's not a draft and it's an opened trigger, the status should be "Ready for Review", otherwise "In Progress" if [ ${{ github.event.pull_request.draft }} == false ] && [ ${{ github.event.action }} == "opened" ]; then echo "PR is not a draft, setting status to 'In Review'" echo "STATUS_OPTION_ID=$IN_REVIEW_PROJECT_OPTION_ID" >> $GITHUB_ENV else echo "PR is a draft, setting status to 'In Progress'" echo 
"STATUS_OPTION_ID=$IN_PROGRESS_PROJECT_OPTION_ID" >> $GITHUB_ENV fi continue-on-error: true - name: Get PR Project ID id: get_pr_id if: github.event_name == 'pull_request_target' env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Query up to 10 projects for the PR gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.PR_NUMBER }}) { ... on PullRequest { id projectItems(first: 10) { edges { node { id project { id } } } } } } } } }' > project_data.json # Filter the json result to only the project-specific ID for the PR # A PR can be in multiple projects so we need to filter by the project ID we want pr_id=$(jq -r '.data.organization.repository.issueOrPullRequest.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' project_data.json) echo "PR_ID=$pr_id" >> $GITHUB_ENV continue-on-error: true - name: Set PR Fields id: set_pr_fields if: github.event_name == 'pull_request_target' env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | gh api graphql -f query=' mutation { updateProjectV2ItemFieldValue( input: { projectId: "${{ env.PROJECT_ID }}" itemId: "${{ env.PR_ID }}" fieldId: "${{ env.STATUS_FIELD_ID }}" value: { singleSelectOptionId: "${{ env.STATUS_OPTION_ID }}" } } ) { projectV2Item { id } } }' # Check if the PR has a start sprint assigned, save the result for the linked issues gh api graphql -f query=' query { node(id: "${{ env.PR_ID }}") { ... on ProjectV2Item { id fieldValueByName(name: "Start Sprint") { ... on ProjectV2ItemFieldIterationValue { id } } } } }' > start_sprint_exists_data.json start_sprint_option_id=$(jq -r '.data.node.fieldValueByName.id' start_sprint_exists_data.json) echo "START_SPRINT_OPTION_ID=$start_sprint_option_id" >> $GITHUB_ENV # If there is no start sprint assigned, assign the current start sprint if [ "$start_sprint_option_id" == 'null' ]; then # Get current start sprint iteration id # The current sprint is always the first iteration in the list gh api graphql -f query=' query MyQuery { node(id: "${{ env.PROJECT_ID }}") { ... on ProjectV2 { id field(name: "Start Sprint") { ... on ProjectV2IterationField { id name configuration { iterations { id } } } } } } }' > start_sprint_option_data.json current_start_sprint_option_id=$(jq -r '.data.node.field.configuration.iterations[0].id' start_sprint_option_data.json) echo "CURRENT_START_SPRINT_OPTION_ID=$current_start_sprint_option_id" >> $GITHUB_ENV # The query below is constructed differently than the ones above due to bash variable syntax + github actions syntax interactions QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$PR_ID\" fieldId: \"$START_SPRINT_FIELD_ID\" value: { iterationId: \"$current_start_sprint_option_id\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" fi # Assign the current working sprint to the PR (faster/simpler to just overwrite even if it is the same) gh api graphql -f query=' query { node(id: "${{ env.PROJECT_ID }}") { ... on ProjectV2 { id field(name: "Working Sprint") { ... 
on ProjectV2IterationField { id name configuration { iterations { id } } } } } } }' > working_sprint_options_data.json current_working_sprint_option_id=$(jq -r '.data.node.field.configuration.iterations[0].id' working_sprint_options_data.json) echo "CURRENT_WORKING_SPRINT_OPTION_ID=$current_working_sprint_option_id" >> $GITHUB_ENV # Set the working sprint to the current working sprint QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$PR_ID\" fieldId: \"$WORKING_SPRINT_FIELD_ID\" value: { iterationId: \"$current_working_sprint_option_id\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" continue-on-error: true - name: Sync Linked Issues id: sync_linked_issues if: github.event_name == 'pull_request_target' env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Find the linked issues to the PR gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.PR_NUMBER }}) { ... on PullRequest { id closingIssuesReferences(first: 10) { edges { node { id projectItems(first: 10) { nodes { id } edges { node { id project { id } } } } } } } } } } } }' > linked_issues.json issue_ids=$(jq -r '.data.organization.repository.issueOrPullRequest.closingIssuesReferences.edges[].node.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' linked_issues.json) # For each linked issue, set the status to "In Progress", the Working Sprint to the current working sprint # If there's no Start Sprint, set that to the current Start Sprint as well for issue_id in $issue_ids; do # Set the status of the linked issues to "In Progress" QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$issue_id\" fieldId: \"$STATUS_FIELD_ID\" value: { singleSelectOptionId: \"$STATUS_OPTION_ID\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" # Set the working sprint of the linked issues to the current working sprint QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$issue_id\" fieldId: \"$WORKING_SPRINT_FIELD_ID\" value: { iterationId: \"$CURRENT_WORKING_SPRINT_OPTION_ID\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" # Set the start sprint of the linked issues to the current start sprint if it's null if [ ${{ env.START_SPRINT_OPTION_ID }} == 'null' ]; then QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$issue_id\" fieldId: \"$START_SPRINT_FIELD_ID\" value: { iterationId: \"$CURRENT_START_SPRINT_OPTION_ID\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" fi done continue-on-error: true cccl-2.5.0/.github/workflows/project_automation_set_in_review.yml000066400000000000000000000150071463375617100254120ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. name: Set PR and Linked Issues to In Review on: pull_request_target: # Run this action when a PR is opened or edited # Issues do not have a graphQL connection to linked PRs so we can't use that event types: [ready_for_review] env: ORG: ${{ github.event.repository.owner.login }} PR_NUMBER: ${{ github.event.pull_request.number }} REPO: ${{ github.event.repository.name }} # The environment vars below are hard-coded from external queries to save time + complexity here # Note: PVT means Project V2, not "Private" # PVT = Project V2, PVTSSF = Project V2 Single Select Field, PVTIF = Project V2 Iteration Field PROJECT_ID: "PVT_kwDOABpemM4AEhOI" STATUS_FIELD_ID: "PVTSSF_lADOABpemM4AEhOIzgCmnYc" IN_REVIEW_PROJECT_OPTION_ID: "c6b49c6b" jobs: query_and_mutate_project_fields: runs-on: ubuntu-latest steps: - name: Generate token id: generate_token uses: tibdex/github-app-token@v1.8.0 with: app_id: ${{ secrets.CCCL_AUTH_APP_ID }} private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - name: Wait 1 Second id: sleep run: sleep 1 - name: Get PR Project ID id: get_pr_id env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Query up to 10 projects for the PR gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.PR_NUMBER }}) { ... on PullRequest { id projectItems(first: 10) { edges { node { id project { id } } } } } } } } }' > project_data.json # Filter the json result to only the project-specific ID for the PR # A PR can be in multiple projects so we need to filter by the project ID we want pr_id=$(jq -r '.data.organization.repository.issueOrPullRequest.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' project_data.json) echo "PR_ID=$pr_id" >> $GITHUB_ENV continue-on-error: true - name: Set PR to In Review id: set_pr_in_review env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Update the PR status to In Review gh api graphql -f query=' mutation { updateProjectV2ItemFieldValue( input: { projectId: "${{ env.PROJECT_ID }}" itemId: "${{ env.PR_ID }}" fieldId: "${{ env.STATUS_FIELD_ID }}" value: { singleSelectOptionId: "${{ env.IN_REVIEW_PROJECT_OPTION_ID }}" } } ) { projectV2Item { id } } }' continue-on-error: true - name: Set Linked Issues to In Review id: update_linked_issues env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.PR_NUMBER }}) { ... 
on PullRequest { id closingIssuesReferences(first: 10) { edges { node { id projectItems(first: 10) { nodes { id } edges { node { id project { id } } } } } } } } } } } }' > linked_issue_data.json issue_ids=$(jq -r '.data.organization.repository.issueOrPullRequest.closingIssuesReferences.edges[].node.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' linked_issue_data.json) # Set Linked Issues to In Review for issue_id in $issue_ids; do # The query below is constructed differently than the others due to bash variable syntax + github actions syntax interactions QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$issue_id\" fieldId: \"$STATUS_FIELD_ID\" value: { singleSelectOptionId: \"$IN_REVIEW_PROJECT_OPTION_ID\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" done continue-on-error: true cccl-2.5.0/.github/workflows/project_automation_set_issue_type.yml000066400000000000000000000114011463375617100256060ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: Set Issue Type Single Select on: issues: # Run this action when an issue is opened types: [opened] env: ISSUE_NODE_ID: ${{ github.event.issue.node_id }} # The environment vars below are hard-coded from external queries to save time + complexity here # Note: PVT means Project V2, not "Private" - although this is a private project # PVT = Project V2, PVTSSF = Project V2 Single Select Field, PVTIF = Project V2 Iteration Field PROJECT_ID: "PVT_kwDOABpemM4AEhOI" ISSUE_TYPE_FIELD_ID: "PVTSSF_lADOABpemM4AEhOIzgCzg-4" BUG_OPTION_ID: "e7e7e23f" FEATURE_OPTION_ID: "f8765953" DOCS_OPTION_ID: "cb6cb7bf" EPIC_OPTION_ID: "1d095615" THEME_OPTION_ID: "22f101c0" jobs: update_issue_type_in_project: runs-on: ubuntu-latest steps: - name: Sleep 1s id: sleep_1s run: sleep 1 # We sleep to ensure the issue is added to the project before we run this action - name: Generate token id: generate_token uses: tibdex/github-app-token@v1.8.0 with: app_id: ${{ secrets.CCCL_AUTH_APP_ID }} private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - name: Get Issue Project ID id: get_issue_id env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Query up to 10 projects for the Issue gh api graphql -f query=' query { node(id: "${{ env.ISSUE_NODE_ID }}") { ... 
on Issue { projectItems(first: 10) { nodes { id project { id } } } } } }' > project_data.json # Filter the json result to only the project-specific ID for the PR # An issue can be in multiple projects so we need to filter by the project ID we want issue_id=$(jq -r '.data.node.projectItems.nodes[] | select(.project.id == "${{ env.PROJECT_ID }}") | .id' project_data.json) echo "ISSUE_PROJECT_ID=$issue_id" >> $GITHUB_ENV - name: Extract Issue Type Text id: extract_issue_type env: ISSUE_TITLE: ${{ github.event.issue.title }} run: | # Extract the text between two brackets in the issue title issue_type=$(echo "$ISSUE_TITLE" | grep -o '\[.*\]' | tr -d '[]') # Set the issue type option ID based on the extracted text if [ "$issue_type" == "BUG" ]; then option_id=${{ env.BUG_OPTION_ID }} elif [ "$issue_type" == "FEA" ]; then option_id=${{ env.FEATURE_OPTION_ID }} elif [ "$issue_type" == "DOC" ]; then option_id=${{ env.DOCS_OPTION_ID }} elif [ "$issue_type" == "EPIC" ]; then option_id=${{ env.EPIC_OPTION_ID }} elif [ "$issue_type" == "THEME" ]; then option_id=${{ env.THEME_OPTION_ID }} else option_id="Undefined" fi echo "TYPE_OPTION_ID=$option_id" >> $GITHUB_ENV - name: Set Issue Type id: set_issue_type env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} if: ${{ env.TYPE_OPTION_ID }} != "Undefined" run: | # Mutation to update the Issue's Issue Type field gh api graphql -f query=' mutation { updateProjectV2ItemFieldValue( input: { projectId: "${{ env.PROJECT_ID }}" itemId: "${{ env.ISSUE_PROJECT_ID }}" fieldId: "${{ env.ISSUE_TYPE_FIELD_ID }}" value: { singleSelectOptionId: "${{ env.TYPE_OPTION_ID }}" } } ) { projectV2Item { id } } }' cccl-2.5.0/.github/workflows/project_automation_set_roadmap.yml000066400000000000000000000151351463375617100250500ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
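# Sets the project's 'Roadmap' single-select field on PRs and issues when they close, using the first option defined for that field (maintained as the upcoming release).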
name: Set PR or Issue Roadmap Value on Close on: pull_request_target: # Run this action when a PR is closed types: [closed] issues: # Run this action when an issue is closed types: [closed] env: ORG: ${{ github.event.repository.owner.login }} PR_NUMBER: ${{ github.event.pull_request.number }} # evaluates to null for issues ISSUE_NUMBER: ${{ github.event.issue.number }} # evaluates to null for PRs REPO: ${{ github.event.repository.name }} # The environment vars below are hard-coded from external queries to save time + complexity here # Note: PVT means Project V2, not "Private" # PVT = Project V2, PVTSSF = Project V2 Single Select Field, PVTIF = Project V2 Iteration Field PROJECT_ID: "PVT_kwDOABpemM4AEhOI" ROADMAP_FIELD_ID: "PVTSSF_lADOABpemM4AEhOIzgC_MXI" jobs: set_roadmap_value: runs-on: ubuntu-latest steps: - name: Generate token id: generate_token uses: tibdex/github-app-token@v1.8.0 with: app_id: ${{ secrets.CCCL_AUTH_APP_ID }} private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - name: Get PR Project ID if: github.event_name == 'pull_request_target' id: get_pr_id env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Query up to 10 projects for the PR gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.PR_NUMBER }}) { ... on PullRequest { id projectItems(first: 10) { edges { node { id project { id } } } } } } } } }' > project_data.json # Filter the json result to only the project-specific ID for the PR # A PR can be in multiple projects so we need to filter by the project ID we want pr_id=$(jq -r '.data.organization.repository.issueOrPullRequest.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' project_data.json) echo "ITEM_ID=$pr_id" >> $GITHUB_ENV continue-on-error: true - name: Get Issue Project ID if: github.event_name == 'issues' id: get_issue_id env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Query up to 10 projects for the Issue gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.ISSUE_NUMBER }}) { ... on Issue { id projectItems(first: 10) { edges { node { id project { id } } } } } } } } }' > project_data.json # Filter the json result to only the project-specific ID for the PR # A PR can be in multiple projects so we need to filter by the project ID we want issue_id=$(jq -r '.data.organization.repository.issueOrPullRequest.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' project_data.json) echo "ITEM_ID=$issue_id" >> $GITHUB_ENV continue-on-error: true - name: Get Current Release id: get_current_release env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Get current roadmap id # We maintain the roadmap as a single select field in the project, with the first value being the upcoming release gh api graphql -f query=' query MyQuery { node(id: "${{ env.PROJECT_ID }}") { ... on ProjectV2 { id field(name: "Roadmap") { ... 
on ProjectV2SingleSelectField { id options { id } } } } } }' > roadmap_option_data.json current_roadmap_option_id=$(jq -r '.data.node.field.options[0].id' roadmap_option_data.json) echo "CURRENT_ROADMAP_OPTION_ID=$current_roadmap_option_id" >> $GITHUB_ENV continue-on-error: true - name: Set Item Roadmap id: set_item_roadmap env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Update the PR status to In Review gh api graphql -f query=' mutation { updateProjectV2ItemFieldValue( input: { projectId: "${{ env.PROJECT_ID }}" itemId: "${{ env.ITEM_ID }}" fieldId: "${{ env.ROADMAP_FIELD_ID }}" value: { singleSelectOptionId: "${{ env.CURRENT_ROADMAP_OPTION_ID }}" } } ) { projectV2Item { id } } }' continue-on-error: true cccl-2.5.0/.github/workflows/project_automation_sync_pr_issues.yml000066400000000000000000000220451463375617100256200ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: Synchronize Linked Issues on: pull_request_target: # Run this action when a PR is opened or edited # Issues do not have a graphQL connection to linked PRs so we can't use that event types: [edited] env: ORG: ${{ github.event.repository.owner.login }} PR_NUMBER: ${{ github.event.pull_request.number }} REPO: ${{ github.event.repository.name }} # The environment vars below are hard-coded from external queries to save time + complexity here # Note: PVT means Project V2, not "Private" PROJECT_ID: "PVT_kwDOABpemM4AEhOI" STATUS_FIELD_ID: "PVTSSF_lADOABpemM4AEhOIzgCmnYc" WORKING_SPRINT_FIELD_ID: "PVTIF_lADOABpemM4AEhOIzgJlRho" START_SPRINT_FIELD_ID: "PVTIF_lADOABpemM4AEhOIzgJlRhU" jobs: query_and_mutate_project_fields: runs-on: ubuntu-latest steps: - name: Generate token id: generate_token uses: tibdex/github-app-token@v1.8.0 with: app_id: ${{ secrets.CCCL_AUTH_APP_ID }} private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - name: Get PR Project ID id: get_pr_id env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Query up to 10 projects for the PR gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.PR_NUMBER }}) { ... 
on PullRequest { id projectItems(first: 10) { edges { node { id project { id } } } } } } } } }' > project_data.json # Filter the json result to only the project-specific ID for the PR # A PR can be in multiple projects so we need to filter by the project ID we want pr_id=$(jq -r '.data.organization.repository.issueOrPullRequest.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' project_data.json) echo "PR_ID=$pr_id" >> $GITHUB_ENV continue-on-error: true - name: Get Status, Start Sprint, and Working Sprint Fields from the PR id: get_pr_status env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | # Query the PR's status from the project gh api graphql -f query=' query { node(id: "${{ env.PR_ID }}") { ... on ProjectV2Item { id fieldValueByName(name: "Status") { ... on ProjectV2ItemFieldSingleSelectValue { optionId } } } } }' > status_field_data.json status_option_id=$(jq -r '.data.node.fieldValueByName.optionId' status_field_data.json) echo "STATUS_OPTION_ID=$status_option_id" >> $GITHUB_ENV # Query the PR's start sprint from the project gh api graphql -f query=' query { node(id: "${{ env.PR_ID }}") { ... on ProjectV2Item { id fieldValueByName(name: "Start Sprint") { ... on ProjectV2ItemFieldIterationValue { iterationId } } } } }' > start_sprint_data.json start_sprint_iteration_id=$(jq -r '.data.node.fieldValueByName.iterationId' start_sprint_data.json) echo "START_SPRINT_ITERATION_ID=$start_sprint_iteration_id" >> $GITHUB_ENV # Query the PR's working sprint from the project gh api graphql -f query=' query { node(id: "${{ env.PR_ID }}") { ... on ProjectV2Item { id fieldValueByName(name: "Working Sprint") { ... on ProjectV2ItemFieldIterationValue { iterationId } } } } }' > working_sprint_data.json working_sprint_iteration_id=$(jq -r '.data.node.fieldValueByName.iterationId' working_sprint_data.json) echo "WORKING_SPRINT_ITERATION_ID=$working_sprint_iteration_id" >> $GITHUB_ENV continue-on-error: true - name: Sync Linked Issues id: update_linked_issues env: GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }} run: | gh api graphql -f query=' query { organization(login: "${{ env.ORG }}") { repository(name: "${{ env.REPO }}") { issueOrPullRequest(number: ${{ env.PR_NUMBER }}) { ... 
on PullRequest { id closingIssuesReferences(first: 10) { edges { node { id projectItems(first: 10) { nodes { id } edges { node { id project { id } } } } } } } } } } } }' > project_data.json issue_ids=$(jq -r '.data.organization.repository.issueOrPullRequest.closingIssuesReferences.edges[].node.projectItems.edges[] | select(.node.project.id == "${{ env.PROJECT_ID }}") | .node.id' project_data.json) for issue_id in $issue_ids; do # The query below is constructed differently than the others due to bash variable syntax + github actions syntax interactions # Update Status QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$issue_id\" fieldId: \"$STATUS_FIELD_ID\" value: { singleSelectOptionId: \"$STATUS_OPTION_ID\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" # Update Start Sprint QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$issue_id\" fieldId: \"$START_SPRINT_FIELD_ID\" value: { iterationId: \"$START_SPRINT_ITERATION_ID\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" # Update Working Sprint QUERY="mutation { updateProjectV2ItemFieldValue( input: { projectId: \"$PROJECT_ID\" itemId: \"$issue_id\" fieldId: \"$WORKING_SPRINT_FIELD_ID\" value: { iterationId: \"$WORKING_SPRINT_ITERATION_ID\" } } ) { projectV2Item { id } } }" gh api graphql --field query="$QUERY" done continue-on-error: true cccl-2.5.0/.github/workflows/triage_rotation.yml000066400000000000000000000050441463375617100217540ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: Assign Issues for Triage on: issues: types: - opened env: GH_TOKEN: ${{ github.token }} jobs: assign_issues: runs-on: ubuntu-latest if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "CONTRIBUTOR", "COLLABORATOR"]'), github.event.issue.author_association)}} steps: - name: Calculate assignee id: calculate_assignee run: | USERS=("allisonvacanti" "gevtushenko" "jrhemstad" "elstehle" "wmaxey" "miscco" "griwes") WEEK=$(date +%U) INDEX=$(( WEEK % 7 )) echo "ASSIGNEE=${USERS[$INDEX]}" >> $GITHUB_ENV - name: Get User ID run: | gh api graphql -f query=' query{ user(login: "${{ env.ASSIGNEE }}") { id } }' > user_info.json echo 'ASSIGNEE_ID='$(jq '.data.user.id' user_info.json) >> $GITHUB_ENV - name: Assign issue run: | gh api graphql -f query=' mutation { addAssigneesToAssignable(input: {assignableId: "${{ github.event.issue.node_id }}", assigneeIds: [ ${{ env.ASSIGNEE_ID }} ] }) { clientMutationId } }' - name: add-triage-label run: | issue_url=${{ github.event.issue.html_url }} gh issue edit ${issue_url} --add-label "Needs Triage" - name: add-comment-to-issue run: | issue_url=${{ github.event.issue.html_url }} author=${{ github.event.issue.user.login }} gh issue comment ${issue_url} --body "Hi @${author}! 
Thanks for submitting this issue - the CCCL team has been notified and we'll get back to you as soon as we can! In the mean time, feel free to add any relevant information to this issue." cccl-2.5.0/.github/workflows/verify-devcontainers.yml000066400000000000000000000070731463375617100227340ustar00rootroot00000000000000name: Verify devcontainers on: workflow_call: defaults: run: shell: bash -euo pipefail {0} permissions: contents: read jobs: get-devcontainer-list: name: Verify devcontainer files are up-to-date outputs: devcontainers: ${{ steps.get-list.outputs.devcontainers }} runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v3 - name: Setup jq and yq run: | sudo apt-get update sudo apt-get install jq -y sudo wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.34.2/yq_linux_amd64 sudo chmod +x /usr/local/bin/yq - name: Run the script to generate devcontainer files run: | ./.devcontainer/make_devcontainers.sh --verbose - name: Check for changes run: | if [[ $(git diff --stat) != '' || $(git status --porcelain | grep '^??') != '' ]]; then git diff --minimal git status --porcelain echo "::error:: Dev Container files are out of date or there are untracked files. Run the .devcontainer/make_devcontainers.sh script and commit the changes." exit 1 else echo "::note::Dev Container files are up-to-date." fi - name: Get list of devcontainer.json paths and names id: get-list run: | devcontainers=$(find .devcontainer/ -name 'devcontainer.json' | while read -r devcontainer; do jq --arg path "$devcontainer" '{path: $path, name: .name}' "$devcontainer" done | jq -s -c .) echo "devcontainers=${devcontainers}" | tee --append "${GITHUB_OUTPUT}" verify-devcontainers: needs: get-devcontainer-list name: ${{matrix.devcontainer.name}} runs-on: ubuntu-latest strategy: fail-fast: false matrix: devcontainer: ${{fromJson(needs.get-devcontainer-list.outputs.devcontainers)}} permissions: id-token: write contents: read steps: - name: Check out the code uses: actions/checkout@v3 # devcontainer/ci doesn't supported nested devcontainer.json files, so we need to copy the devcontainer.json # file to the top level .devcontainer/ directory - name: Copy devcontainer.json to .devcontainer/ run: | src="${{ matrix.devcontainer.path }}" dst=".devcontainer/devcontainer.json" if [[ "$src" != "$dst" ]]; then cp "$src" "$dst" fi # We don't really need sccache configured, but we need the AWS credentials envvars to be set # in order to avoid the devcontainer hanging waiting for GitHub authentication - name: Get AWS credentials for sccache bucket uses: aws-actions/configure-aws-credentials@v2 with: role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA aws-region: us-east-2 role-duration-seconds: 43200 # 12 hours) - name: Set environment variables run: | echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV - name: Run in devcontainer uses: devcontainers/ci@v0.3 with: push: never env: | SCCACHE_REGION=${{ env.SCCACHE_REGION }} AWS_ACCESS_KEY_ID=${{ env.AWS_ACCESS_KEY_ID }} AWS_SESSION_TOKEN=${{ env.AWS_SESSION_TOKEN }} AWS_SECRET_ACCESS_KEY=${{ env.AWS_SECRET_ACCESS_KEY }} runCmd: | .devcontainer/verify_devcontainer.sh cccl-2.5.0/.github/workflows/workflow-dispatch-job.yml000066400000000000000000000211031463375617100227730ustar00rootroot00000000000000name: 
"Workflow/Dispatch/Job" defaults: run: shell: bash --noprofile --norc -euo pipefail {0} on: workflow_call: outputs: success: value: ${{ contains(toJSON(jobs.*.outputs.success), 'true') }} inputs: name: {type: string, required: true} image: {type: string, required: true} runner: {type: string, required: true} command: {type: string, required: true} id: {type: string, required: true} env: {type: string, required: false} permissions: contents: read jobs: linux: name: ${{inputs.name}} if: ${{ startsWith(inputs.runner, 'linux') }} outputs: success: ${{ steps.done.outputs.SUCCESS }} permissions: id-token: write contents: read runs-on: ${{inputs.runner}} container: options: -u root image: ${{inputs.image}} env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} steps: - name: Checkout repo uses: actions/checkout@v3 with: path: ${{github.event.repository.name}} persist-credentials: false - name: Link files to coder user home directory run: | ln -s "$(pwd)/${{github.event.repository.name}}" /home/coder/${{github.event.repository.name}} chown -R coder:coder ${{github.event.repository.name}} chown -R coder:coder /home/coder/${{github.event.repository.name}} - name: Add NVCC problem matcher run: | echo "::add-matcher::${{github.event.repository.name}}/.github/problem-matchers/problem-matcher.json" - name: Get AWS credentials for sccache bucket uses: aws-actions/configure-aws-credentials@v2 with: role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA aws-region: us-east-2 role-duration-seconds: 43200 # 12 hours) - name: Set environment variables run: | echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV - name: Run command # Do not change this step's name, it is checked in parse-job-times.py shell: su coder {0} env: # Dereferencing the command from and env var instead of a GHA input avoids issues with escaping # semicolons and other special characters (e.g. `-arch "60;70;80"`). COMMAND: ${{inputs.command}} run: | set -eo pipefail cd ~/${{github.event.repository.name}} echo -e "\e[1;34mRunning as 'coder' user in $(pwd):\e[0m" echo -e "\e[1;34m${COMMAND}\e[0m" eval "${COMMAND}" || exit_code=$? if [ ! -z "$exit_code" ]; then echo -e "::group::ï¸â— \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m" echo "::error:: To replicate this failure locally, follow the steps below:" echo "1. Clone the repository, and navigate to the correct branch and commit:" echo " git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" echo "" echo "2. Run the failed command inside the same Docker container used by the CI:" echo " docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${COMMAND}" echo "" echo "For additional information, see:" echo " - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md" echo " - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md" exit $exit_code fi - name: Prepare job artifacts id: done run: | echo "SUCCESS=true" | tee -a "${GITHUB_OUTPUT}" result_dir="jobs/${{inputs.id}}" mkdir -p "$result_dir" touch "$result_dir/success" # Finds a matching file in the repo directory and copies it to the results directory. 
find_and_copy() { filename="$1" filepath="$(find ${{github.event.repository.name}} -name "${filename}" -print -quit)" if [[ -z "$filepath" ]]; then echo "${filename} does not exist in repo directory." return 1 fi cp -v "$filepath" "$result_dir" } find_and_copy "sccache_stats.json" || true # Ignore failures echo "::group::Job artifacts" tree "$result_dir" echo "::endgroup::" - name: Upload job artifacts uses: actions/upload-artifact@v3 with: name: jobs path: jobs windows: name: ${{inputs.name}} if: ${{ startsWith(inputs.runner, 'windows') }} outputs: success: ${{ steps.done.outputs.SUCCESS }} permissions: id-token: write contents: read runs-on: ${{inputs.runner}} env: SCCACHE_BUCKET: rapids-sccache-devs SCCACHE_REGION: us-east-2 SCCACHE_IDLE_TIMEOUT: 0 SCCACHE_S3_USE_SSL: true SCCACHE_S3_NO_CREDENTIALS: false steps: - name: Get AWS credentials for sccache bucket uses: aws-actions/configure-aws-credentials@v2 with: role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA aws-region: us-east-2 role-duration-seconds: 43200 # 12 hours - name: Checkout repo uses: actions/checkout@v3 with: path: ${{github.event.repository.name}} persist-credentials: false - name: Fetch ${{ inputs.image }} run: docker pull ${{ inputs.image }} - name: Prepare paths for docker id: paths run: | echo "HOST_REPO=${{ github.workspace }}\${{ github.event.repository.name }}".Replace('\', '/') | Out-File -FilePath $env:GITHUB_OUTPUT -Append echo "MOUNT_REPO=C:/${{ github.event.repository.name }}" | Out-File -FilePath $env:GITHUB_OUTPUT -Append cat $env:GITHUB_OUTPUT shell: powershell - name: Run command # Do not change this step's name, it is checked in parse-job-times.py run: | docker run \ --mount type=bind,source="${{steps.paths.outputs.HOST_REPO}}",target="${{steps.paths.outputs.MOUNT_REPO}}" \ --workdir "${{steps.paths.outputs.MOUNT_REPO}}" \ ${{ inputs.image }} \ powershell -c " [System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}'); [System.Environment]::SetEnvironmentVariable('AWS_SECRET_ACCESS_KEY','${{env.AWS_SECRET_ACCESS_KEY}}'); [System.Environment]::SetEnvironmentVariable('AWS_SESSION_TOKEN','${{env.AWS_SESSION_TOKEN }}'); [System.Environment]::SetEnvironmentVariable('SCCACHE_BUCKET','${{env.SCCACHE_BUCKET}}'); [System.Environment]::SetEnvironmentVariable('SCCACHE_REGION','${{env.SCCACHE_REGION}}'); [System.Environment]::SetEnvironmentVariable('SCCACHE_IDLE_TIMEOUT','${{env.SCCACHE_IDLE_TIMEOUT}}'); [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_USE_SSL','${{env.SCCACHE_S3_USE_SSL}}'); [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_NO_CREDENTIALS','${{env.SCCACHE_S3_NO_CREDENTIALS}}'); git config --global --add safe.directory '${{steps.paths.outputs.MOUNT_REPO}}'; ${{inputs.command}}" - name: Prepare job artifacts id: done run: | echo "SUCCESS=true" | tee -a "${GITHUB_OUTPUT}" result_dir="jobs/${{inputs.id}}" mkdir -p "$result_dir" touch "$result_dir/success" # Finds a matching file in the repo directory and copies it to the results directory. find_and_copy() { filename="$1" filepath="$(find ${{github.event.repository.name}} -name "${filename}" -print -quit)" if [[ -z "$filepath" ]]; then echo "${filename} does not exist in repo directory." return 1 fi cp -v "$filepath" "$result_dir" } find_and_copy "sccache_stats.json" || true # Ignore failures echo "::group::Job artifacts" find "$result_dir" # Tree not available in this image. 
echo "::endgroup::" - name: Upload job artifacts uses: actions/upload-artifact@v3 with: name: jobs path: jobs cccl-2.5.0/.github/workflows/workflow-dispatch-two-stage.yml000066400000000000000000000037431463375617100241450ustar00rootroot00000000000000name: "Workflow/Dispatch/TwoStage" defaults: run: shell: bash --noprofile --norc -euo pipefail {0} on: workflow_call: inputs: producers: {type: string, required: true} consumers: {type: string, required: true} permissions: contents: read jobs: # It is impossible to accumulate output variables across a matrix, # and we cannot rely on the results of the dispatch-job workflow to determine success. # See the note in ci-dispatch-job.yml for more information. # # Since we cannot accumulate results from multiple producers, only support a single producer for now. # This is enforced by compute-matrix.py. producers: # This is an internal dispatch job and the name is not important. # Give the job a short and unique name, otherwise github will bloat the job name with the matrix values. # This keeps the UI from getting cluttered. name: "p.${{ matrix.id }}" permissions: id-token: write contents: read strategy: fail-fast: false matrix: include: ${{fromJSON(inputs.producers)}} uses: ./.github/workflows/workflow-dispatch-job.yml with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} image: ${{ matrix.image }} command: ${{ matrix.command }} id: ${{ matrix.id }} consumers: # This is an internal dispatch job and the name is not important. # Give the job a short and unique name, otherwise github will bloat the job name with the matrix values. # This keeps the UI from getting cluttered. name: "c.${{ matrix.id }}" needs: producers permissions: id-token: write contents: read strategy: fail-fast: false matrix: include: ${{fromJSON(inputs.consumers)}} uses: ./.github/workflows/workflow-dispatch-job.yml with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} image: ${{ matrix.image }} command: ${{ matrix.command }} id: ${{ matrix.id }} cccl-2.5.0/.github/workflows/workflow-dispatch.yml000066400000000000000000000032241463375617100222270ustar00rootroot00000000000000name: "Workflow/Dispatch/Group" defaults: run: shell: bash --noprofile --norc -euo pipefail {0} on: workflow_call: inputs: name: {type: string, required: true} jobs: {type: string, required: true} permissions: contents: read jobs: standlone-jobs: # This is an internal dispatch job and the name is not important. # Give the job a short and unique name, otherwise github will bloat the job name with the matrix values. # This keeps the UI from getting cluttered. name: "s.${{ matrix.id }}" if: ${{ fromJSON(inputs.jobs)['standalone'] != null }} permissions: id-token: write contents: read strategy: fail-fast: false matrix: include: ${{fromJSON(inputs.jobs)['standalone']}} uses: ./.github/workflows/workflow-dispatch-job.yml with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} image: ${{ matrix.image }} command: ${{ matrix.command }} id: ${{ matrix.id }} two-stage-jobs: # This is an internal dispatch job and the name is not important. # Give the job a short and unique name, otherwise github will bloat the job name with the matrix values. # This keeps the UI from getting cluttered. 
name: "t.${{ matrix.id }}" if: ${{ fromJSON(inputs.jobs)['two_stage'] != null }} permissions: id-token: write contents: read strategy: fail-fast: false matrix: include: ${{fromJSON(inputs.jobs)['two_stage']}} uses: ./.github/workflows/workflow-dispatch-two-stage.yml with: producers: ${{ toJSON(matrix.producers) }} consumers: ${{ toJSON(matrix.consumers) }} cccl-2.5.0/.gitignore000066400000000000000000000001521463375617100144250ustar00rootroot00000000000000.idea/ build*/ .cache .aws .config _deps/catch2-src/ .vscode/ compile_commands.json CMakeUserPresets.json cccl-2.5.0/.pre-commit-config.yaml000066400000000000000000000017721463375617100167270ustar00rootroot00000000000000# Copyright (c) 2024, NVIDIA CORPORATION. ci: autofix_commit_msg: | [pre-commit.ci] auto code formatting autofix_prs: false autoupdate_branch: '' autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' autoupdate_schedule: quarterly skip: [] submodules: false repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: end-of-file-fixer - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-clang-format rev: v17.0.6 hooks: - id: clang-format types_or: [file] files: | (?x)^( ^.*\.c$| ^.*\.cpp$| ^.*\.cu$| ^.*\.cuh$| ^.*\.cxx$| ^.*\.h$| ^.*\.hpp$| ^.*\.inl$| ^.*\.mm$| ^libcudacxx/include/.*/[^.]*$ ) args: ["-fallback-style=none", "-style=file", "-i"] default_language_version: python: python3 cccl-2.5.0/.vscode/000077500000000000000000000000001463375617100140005ustar00rootroot00000000000000cccl-2.5.0/.vscode/launch.json000066400000000000000000000005721463375617100161510ustar00rootroot00000000000000{ "version": "0.2.0", "configurations": [ { "name": "(cuda-gdb) Launch", "type": "cuda-gdb", "request": "launch", "program": "${command:cmake.launchTargetPath}", "stopAtEntry": true, "cwd": "${workspaceFolder}", "initCommands": ["set cuda api_failures stop"] } ] } cccl-2.5.0/CITATION.md000066400000000000000000000004701463375617100141740ustar00rootroot00000000000000# Citation Guide ## To Cite CCCL If you use CCCL in a publication, please use citations in the following format (BibTeX entry for LaTeX): ```tex @Manual{, title = {{CCCL}: {CUDA} {C++} {C}ore {L}ibraries}, author = {{CCCL Development Team}}, year = {2023}, url = {https://github.com/NVIDIA/cccl}, } ``` cccl-2.5.0/CMakeLists.txt000066400000000000000000000046331463375617100152050ustar00rootroot00000000000000# 3.15 is the minimum for including the project with add_subdirectory. # 3.21 is the minimum for the developer build. cmake_minimum_required(VERSION 3.15) # sccache cannot handle the -Fd option generationg pdb files if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") cmake_policy(SET CMP0141 NEW) endif() # Determine whether CCCL is the top-level project or included into # another project via add_subdirectory() if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") set(CCCL_TOPLEVEL_PROJECT ON) endif() # Enable CXX so CMake can configure install paths project(CCCL LANGUAGES CXX) # Optionally include installation rules for non-top-level builds: option(CCCL_ENABLE_INSTALL_RULES "Enable installation of CCCL." ${CCCL_TOPLEVEL_PROJECT}) if (CCCL_ENABLE_INSTALL_RULES) include(cmake/CCCLInstallRules.cmake) endif() # Support adding CCCL to a parent project via add_subdirectory. 
if (NOT CCCL_TOPLEVEL_PROJECT) include(cmake/CCCLAddSubdir.cmake) return() endif() # We require a higher cmake version for dev builds cmake_minimum_required(VERSION 3.21) option(CCCL_ENABLE_LIBCUDACXX "Enable the libcu++ developer build." ON) option(CCCL_ENABLE_CUB "Enable the CUB developer build." ON) option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ON) option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." ON) option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." ON) option(CCCL_ENABLE_BENCHMARKS "Enable CUDA C++ Core Library benchmarks." OFF) option(CCCL_ENABLE_UNSTABLE "Enable targets and developer build options for unstable projects." OFF) if (CCCL_ENABLE_UNSTABLE) option(CCCL_ENABLE_CUDAX "Enable the CUDA Experimental developer build." ON) endif() include(CTest) enable_testing() include(cmake/CCCLUtilities.cmake) # include this first include(cmake/CCCLClangdCompileInfo.cmake) if (CCCL_ENABLE_LIBCUDACXX) set(LIBCUDACXX_TOPLEVEL_PROJECT ON) endif() if (CCCL_ENABLE_CUB) set(CUB_TOPLEVEL_PROJECT ON) endif() if (CCCL_ENABLE_THRUST) set(THRUST_TOPLEVEL_PROJECT ON) endif() if (CCCL_ENABLE_CUDAX) set(cudax_TOPLEVEL_PROJECT ON) endif() add_subdirectory(libcudacxx) add_subdirectory(cub) add_subdirectory(thrust) if (CCCL_ENABLE_UNSTABLE) add_subdirectory(cudax) endif() if (CCCL_ENABLE_TESTING) add_subdirectory(test) endif() if (CCCL_ENABLE_EXAMPLES) add_subdirectory(examples) endif() # Must stay at the end of this file. include(cmake/CCCLHideThirdPartyOptions.cmake) cccl-2.5.0/CMakePresets.json000066400000000000000000000513411463375617100156640ustar00rootroot00000000000000{ "version": 3, "cmakeMinimumRequired": { "major": 3, "minor": 21, "patch": 0 }, "configurePresets": [ { "name": "base", "hidden": true, "generator": "Ninja", "binaryDir": "${sourceDir}/build/$env{CCCL_BUILD_INFIX}/${presetName}", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release", "CMAKE_CUDA_ARCHITECTURES": "60;70;80", "CCCL_ENABLE_UNSTABLE": true, "CCCL_ENABLE_LIBCUDACXX": false, "CCCL_ENABLE_CUB": false, "CCCL_ENABLE_THRUST": false, "CCCL_ENABLE_CUDAX": false, "CCCL_ENABLE_TESTING": false, "CCCL_ENABLE_EXAMPLES": false, "libcudacxx_ENABLE_INSTALL_RULES": true, "CUB_ENABLE_INSTALL_RULES": true, "THRUST_ENABLE_INSTALL_RULES": true, "cudax_ENABLE_INSTALL_RULES": true } }, { "name": "all-dev", "inherits": "base", "cacheVariables": { "CCCL_ENABLE_LIBCUDACXX": true, "CCCL_ENABLE_CUB": true, "CCCL_ENABLE_THRUST": true, "CCCL_ENABLE_CUDAX": true, "CCCL_ENABLE_TESTING": true, "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_BENCHMARKS": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true, "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, "CUB_SEPARATE_CATCH2": true, "CUB_IGNORE_DEPRECATED_CPP_DIALECT": true, "CUB_ENABLE_DIALECT_CPP11": true, "CUB_ENABLE_DIALECT_CPP14": true, "CUB_ENABLE_DIALECT_CPP17": true, "CUB_ENABLE_DIALECT_CPP20": true, "THRUST_ENABLE_MULTICONFIG": true, "THRUST_MULTICONFIG_WORKLOAD": "LARGE", "THRUST_IGNORE_DEPRECATED_CPP_DIALECT": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true, "cudax_ENABLE_HEADER_TESTING": true, "cudax_ENABLE_TESTING": true, "cudax_ENABLE_DIALECT_CPP17": true, "cudax_ENABLE_DIALECT_CPP20": true } }, { "name": 
"all-dev-debug", "displayName": "all-dev debug", "inherits": "all-dev", "cacheVariables": { "CCCL_ENABLE_BENCHMARKS": false, "CMAKE_BUILD_TYPE": "Debug", "CMAKE_CUDA_FLAGS": "-G" } }, { "name": "libcudacxx-codegen", "displayName": "libcu++: codegen", "inherits": "base", "cacheVariables": { "CCCL_ENABLE_LIBCUDACXX": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": false, "libcudacxx_ENABLE_CMAKE_TESTS": false, "libcudacxx_ENABLE_CODEGEN": true, "LIBCUDACXX_ENABLE_CUDA": false } }, { "name": "libcudacxx-base", "hidden": true, "inherits": "base", "cacheVariables": { "CCCL_ENABLE_LIBCUDACXX": true, "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true } }, { "name": "libcudacxx-cpp11", "displayName": "libcu++: C++11", "inherits": "libcudacxx-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "11", "CMAKE_CUDA_STANDARD": "11", "LIBCUDACXX_TEST_STANDARD_VER": "c++11" } }, { "name": "libcudacxx-cpp14", "displayName": "libcu++: C++14", "inherits": "libcudacxx-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "14", "CMAKE_CUDA_STANDARD": "14", "LIBCUDACXX_TEST_STANDARD_VER": "c++14" } }, { "name": "libcudacxx-cpp17", "displayName": "libcu++: C++17", "inherits": "libcudacxx-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "17", "CMAKE_CUDA_STANDARD": "17", "LIBCUDACXX_TEST_STANDARD_VER": "c++17" } }, { "name": "libcudacxx-cpp20", "displayName": "libcu++: C++20", "inherits": "libcudacxx-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "20", "CMAKE_CUDA_STANDARD": "20", "LIBCUDACXX_TEST_STANDARD_VER": "c++20" } }, { "name": "libcudacxx-nvrtc-base", "hidden": true, "inherits": "libcudacxx-base", "cacheVariables": { "LIBCUDACXX_TEST_WITH_NVRTC": true, "CMAKE_CUDA_ARCHITECTURES": "70" } }, { "name": "libcudacxx-nvrtc-cpp11", "displayName": "libcu++ NVRTC: C++11", "inherits": "libcudacxx-nvrtc-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "11", "CMAKE_CUDA_STANDARD": "11", "LIBCUDACXX_TEST_STANDARD_VER": "c++11" } }, { "name": "libcudacxx-nvrtc-cpp14", "displayName": "libcu++ NVRTC: C++14", "inherits": "libcudacxx-nvrtc-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "14", "CMAKE_CUDA_STANDARD": "14", "LIBCUDACXX_TEST_STANDARD_VER": "c++14" } }, { "name": "libcudacxx-nvrtc-cpp17", "displayName": "libcu++ NVRTC: C++17", "inherits": "libcudacxx-nvrtc-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "17", "CMAKE_CUDA_STANDARD": "17", "LIBCUDACXX_TEST_STANDARD_VER": "c++17" } }, { "name": "libcudacxx-nvrtc-cpp20", "displayName": "libcu++ NVRTC: C++20", "inherits": "libcudacxx-nvrtc-base", "cacheVariables": { "CMAKE_CXX_STANDARD": "20", "CMAKE_CUDA_STANDARD": "20", "LIBCUDACXX_TEST_STANDARD_VER": "c++20" } }, { "name": "cub-base", "hidden": true, "inherits": "base", "cacheVariables": { "CCCL_ENABLE_CUB": true, "CUB_ENABLE_TESTING": true, "CUB_ENABLE_EXAMPLES": true, "CUB_SEPARATE_CATCH2": true, "CUB_ENABLE_DIALECT_CPP11": false, "CUB_ENABLE_DIALECT_CPP14": false, "CUB_ENABLE_DIALECT_CPP17": false, "CUB_ENABLE_DIALECT_CPP20": false } }, { "name": "cub-cpp11", "displayName": "CUB: C++11", "inherits": "cub-base", "cacheVariables": { "CUB_IGNORE_DEPRECATED_CPP_DIALECT": true, "CUB_ENABLE_DIALECT_CPP11": true } }, { "name": "cub-cpp14", "displayName": "CUB: C++14", "inherits": "cub-base", "cacheVariables": { "CUB_ENABLE_DIALECT_CPP14": true } }, { "name": "cub-cpp17", "displayName": "CUB: C++17", "inherits": "cub-base", "cacheVariables": { "CUB_ENABLE_DIALECT_CPP17": true } }, { "name": "cub-cpp20", "displayName": "CUB: C++20", "inherits": "cub-base", "cacheVariables": { "CUB_ENABLE_DIALECT_CPP20": true } }, { "name": 
"thrust-base", "hidden": true, "inherits": "base", "cacheVariables": { "CCCL_ENABLE_THRUST": true, "THRUST_ENABLE_MULTICONFIG": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP": true, "THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": false, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": false, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17": false, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": false } }, { "name": "thrust-cpp11", "displayName": "Thrust: C++11", "inherits": "thrust-base", "cacheVariables": { "THRUST_IGNORE_DEPRECATED_CPP_DIALECT": true, "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11": true } }, { "name": "thrust-cpp14", "displayName": "Thrust: C++14", "inherits": "thrust-base", "cacheVariables": { "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP14": true } }, { "name": "thrust-cpp17", "displayName": "Thrust: C++17", "inherits": "thrust-base", "cacheVariables": { "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17": true } }, { "name": "thrust-cpp20", "displayName": "Thrust: C++20", "inherits": "thrust-base", "cacheVariables": { "THRUST_MULTICONFIG_ENABLE_DIALECT_CPP20": true } }, { "name": "cudax-base", "hidden": true, "inherits": "base", "cacheVariables": { "CCCL_ENABLE_CUDAX": true, "cudax_ENABLE_HEADER_TESTING": true, "cudax_ENABLE_TESTING": true, "cudax_ENABLE_DIALECT_CPP17": false, "cudax_ENABLE_DIALECT_CPP20": false } }, { "name": "cudax-cpp17", "displayName": "cudax: C++17", "inherits": "cudax-base", "cacheVariables": { "cudax_ENABLE_DIALECT_CPP17": true } }, { "name": "cudax-cpp20", "displayName": "cudax: C++20", "inherits": "cudax-base", "cacheVariables": { "cudax_ENABLE_DIALECT_CPP20": true } }, { "name": "cccl-infra", "displayName": "CCCL Infrastructure", "inherits": "base", "cacheVariables": { "CMAKE_CUDA_COMPILER": "nvcc", "CCCL_ENABLE_EXAMPLES": true, "CCCL_ENABLE_TESTING": true } } ], "buildPresets": [ { "name": "all-dev", "configurePreset": "all-dev" }, { "name": "all-dev-debug", "configurePreset": "all-dev-debug" }, { "name": "libcudacxx-codegen", "configurePreset": "libcudacxx-codegen", "targets": ["libcudacxx.atomics.codegen"] }, { "name": "libcudacxx-codegen-install", "configurePreset": "libcudacxx-codegen", "targets": ["libcudacxx.atomics.codegen.install"] }, { "name": "libcudacxx-nvrtcc", "hidden": true, "targets": ["libcudacxx.nvrtcc"] }, { "name": "libcudacxx-base", "hidden": true, "targets": [ "libcudacxx.test.internal_headers", "libcudacxx.test.public_headers", "libcudacxx.test.public_headers_host_only", "libcudacxx.test.lit.precompile" ] }, { "name": "libcudacxx-nvrtc-cpp11", "hidden": false, "inherits": ["libcudacxx-nvrtcc"], "configurePreset": "libcudacxx-nvrtc-cpp11" }, { "name": "libcudacxx-nvrtc-cpp14", "hidden": false, "inherits": ["libcudacxx-nvrtcc"], "configurePreset": "libcudacxx-nvrtc-cpp14" }, { "name": "libcudacxx-nvrtc-cpp17", "hidden": false, "inherits": ["libcudacxx-nvrtcc"], "configurePreset": "libcudacxx-nvrtc-cpp17" }, { "name": "libcudacxx-nvrtc-cpp20", "hidden": false, "inherits": ["libcudacxx-nvrtcc"], "configurePreset": "libcudacxx-nvrtc-cpp20" }, { "name": "libcudacxx-cpp11", "configurePreset": "libcudacxx-cpp11", "inherits": ["libcudacxx-base"] }, { "name": "libcudacxx-cpp14", "configurePreset": "libcudacxx-cpp14", "inherits": ["libcudacxx-base"] }, { "name": "libcudacxx-cpp17", "configurePreset": "libcudacxx-cpp17", "inherits": ["libcudacxx-base"] }, { "name": "libcudacxx-cpp20", "configurePreset": "libcudacxx-cpp20", "inherits": 
["libcudacxx-base"] }, { "name": "cub-cpp11", "configurePreset": "cub-cpp11" }, { "name": "cub-cpp14", "configurePreset": "cub-cpp14" }, { "name": "cub-cpp17", "configurePreset": "cub-cpp17" }, { "name": "cub-cpp20", "configurePreset": "cub-cpp20" }, { "name": "thrust-cpp11", "configurePreset": "thrust-cpp11" }, { "name": "thrust-cpp14", "configurePreset": "thrust-cpp14" }, { "name": "thrust-cpp17", "configurePreset": "thrust-cpp17" }, { "name": "thrust-cpp20", "configurePreset": "thrust-cpp20" }, { "name": "cudax-cpp17", "configurePreset": "cudax-cpp17" }, { "name": "cudax-cpp20", "configurePreset": "cudax-cpp20" }, { "name": "cccl-infra", "configurePreset": "cccl-infra" } ], "testPresets": [ { "name": "base", "hidden": true, "output": { "outputOnFailure": true }, "execution": { "noTestsAction": "error", "stopOnFailure": false } }, { "name": "all-dev", "configurePreset": "all-dev", "inherits": "base" }, { "name": "all-dev-debug", "configurePreset": "all-dev-debug", "inherits": "all-dev" }, { "name": "libcudacxx-ctest-base", "hidden": true, "inherits": [ "base" ], "filter": { "exclude": { "name": "^libcudacxx\\.test\\.lit$" } } }, { "name": "libcudacxx-codegen", "configurePreset": "libcudacxx-codegen", "filter": { "include": { "name": "^libcudacxx\\.atomics\\.codegen.*$" } } }, { "name": "libcudacxx-ctest-cpp11", "configurePreset": "libcudacxx-cpp11", "inherits": [ "libcudacxx-ctest-base" ] }, { "name": "libcudacxx-ctest-cpp14", "configurePreset": "libcudacxx-cpp14", "inherits": [ "libcudacxx-ctest-base" ] }, { "name": "libcudacxx-ctest-cpp17", "configurePreset": "libcudacxx-cpp17", "inherits": [ "libcudacxx-ctest-base" ] }, { "name": "libcudacxx-ctest-cpp20", "configurePreset": "libcudacxx-cpp20", "inherits": [ "libcudacxx-ctest-base" ] }, { "name": "libcudacxx-lit-base", "hidden": true, "inherits": [ "base" ], "filter": { "include": { "name": "^libcudacxx\\.test\\.lit$" } }, "output": { "verbosity": "extra", "outputOnFailure": false } }, { "name": "libcudacxx-lit-cpp11", "configurePreset": "libcudacxx-cpp11", "inherits": [ "libcudacxx-lit-base" ] }, { "name": "libcudacxx-lit-cpp14", "configurePreset": "libcudacxx-cpp14", "inherits": [ "libcudacxx-lit-base" ] }, { "name": "libcudacxx-lit-cpp17", "configurePreset": "libcudacxx-cpp17", "inherits": [ "libcudacxx-lit-base" ] }, { "name": "libcudacxx-lit-cpp20", "configurePreset": "libcudacxx-cpp20", "inherits": [ "libcudacxx-lit-base" ] }, { "name": "libcudacxx-nvrtc-base", "hidden": true, "inherits": [ "libcudacxx-lit-base" ] }, { "name": "libcudacxx-nvrtc-cpp11", "configurePreset": "libcudacxx-nvrtc-cpp11", "inherits": [ "libcudacxx-nvrtc-base" ] }, { "name": "libcudacxx-nvrtc-cpp14", "configurePreset": "libcudacxx-nvrtc-cpp14", "inherits": [ "libcudacxx-nvrtc-base" ] }, { "name": "libcudacxx-nvrtc-cpp17", "configurePreset": "libcudacxx-nvrtc-cpp17", "inherits": [ "libcudacxx-nvrtc-base" ] }, { "name": "libcudacxx-nvrtc-cpp20", "configurePreset": "libcudacxx-nvrtc-cpp20", "inherits": [ "libcudacxx-nvrtc-base" ] }, { "name": "cub-base", "hidden": true, "inherits": "base" }, { "name": "cub-nolid-base", "hidden": true, "inherits": "cub-base", "filter": { "exclude": { "name": "^cub.*\\.lid_[0-2].*$" } } }, { "name": "cub-lid0-base", "hidden": true, "inherits": "cub-base", "filter": { "include": { "name": "^cub.*\\.lid_0.*$" } } }, { "name": "cub-lid1-base", "hidden": true, "inherits": "cub-base", "filter": { "include": { "name": "^cub.*\\.lid_1.*$" } } }, { "name": "cub-lid2-base", "hidden": true, "inherits": "cub-base", "filter": { "include": 
{ "name": "^cub.*\\.lid_2.*$" } } }, { "name": "cub-nolid-cpp11", "configurePreset": "cub-cpp11", "inherits": "cub-nolid-base" }, { "name": "cub-nolid-cpp14", "configurePreset": "cub-cpp14", "inherits": "cub-nolid-base" }, { "name": "cub-nolid-cpp17", "configurePreset": "cub-cpp17", "inherits": "cub-nolid-base" }, { "name": "cub-nolid-cpp20", "configurePreset": "cub-cpp20", "inherits": "cub-nolid-base" }, { "name": "cub-lid0-cpp11", "configurePreset": "cub-cpp11", "inherits": "cub-lid0-base" }, { "name": "cub-lid0-cpp14", "configurePreset": "cub-cpp14", "inherits": "cub-lid0-base" }, { "name": "cub-lid0-cpp17", "configurePreset": "cub-cpp17", "inherits": "cub-lid0-base" }, { "name": "cub-lid0-cpp20", "configurePreset": "cub-cpp20", "inherits": "cub-lid0-base" }, { "name": "cub-lid1-cpp11", "configurePreset": "cub-cpp11", "inherits": "cub-lid1-base" }, { "name": "cub-lid1-cpp14", "configurePreset": "cub-cpp14", "inherits": "cub-lid1-base" }, { "name": "cub-lid1-cpp17", "configurePreset": "cub-cpp17", "inherits": "cub-lid1-base" }, { "name": "cub-lid1-cpp20", "configurePreset": "cub-cpp20", "inherits": "cub-lid1-base" }, { "name": "cub-lid2-cpp11", "configurePreset": "cub-cpp11", "inherits": "cub-lid2-base" }, { "name": "cub-lid2-cpp14", "configurePreset": "cub-cpp14", "inherits": "cub-lid2-base" }, { "name": "cub-lid2-cpp17", "configurePreset": "cub-cpp17", "inherits": "cub-lid2-base" }, { "name": "cub-lid2-cpp20", "configurePreset": "cub-cpp20", "inherits": "cub-lid2-base" }, { "name": "cub-cpp11", "configurePreset": "cub-cpp11", "inherits": "cub-base" }, { "name": "cub-cpp14", "configurePreset": "cub-cpp14", "inherits": "cub-base" }, { "name": "cub-cpp17", "configurePreset": "cub-cpp17", "inherits": "cub-base" }, { "name": "cub-cpp20", "configurePreset": "cub-cpp20", "inherits": "cub-base" }, { "name": "thrust-base", "hidden": true, "inherits": "base", "filter": { "exclude": { "name": "^thrust.*\\.test\\.async[._].*$" } } }, { "name": "thrust-gpu-base", "hidden": true, "inherits": "thrust-base", "filter": { "include": { "name": "^thrust.*\\.cuda\\..*$" } } }, { "name": "thrust-cpu-base", "hidden": true, "inherits": "thrust-base", "filter": { "exclude": { "name": "^thrust.*\\.cuda\\..*$" } } }, { "name": "thrust-gpu-cpp11", "configurePreset": "thrust-cpp11", "inherits": "thrust-gpu-base" }, { "name": "thrust-gpu-cpp14", "configurePreset": "thrust-cpp14", "inherits": "thrust-gpu-base" }, { "name": "thrust-gpu-cpp17", "configurePreset": "thrust-cpp17", "inherits": "thrust-gpu-base" }, { "name": "thrust-gpu-cpp20", "configurePreset": "thrust-cpp20", "inherits": "thrust-gpu-base" }, { "name": "thrust-cpu-cpp11", "configurePreset": "thrust-cpp11", "inherits": "thrust-cpu-base" }, { "name": "thrust-cpu-cpp14", "configurePreset": "thrust-cpp14", "inherits": "thrust-cpu-base" }, { "name": "thrust-cpu-cpp17", "configurePreset": "thrust-cpp17", "inherits": "thrust-cpu-base" }, { "name": "thrust-cpu-cpp20", "configurePreset": "thrust-cpp20", "inherits": "thrust-cpu-base" }, { "name": "thrust-cpp11", "configurePreset": "thrust-cpp11", "inherits": "thrust-base" }, { "name": "thrust-cpp14", "configurePreset": "thrust-cpp14", "inherits": "thrust-base" }, { "name": "thrust-cpp17", "configurePreset": "thrust-cpp17", "inherits": "thrust-base" }, { "name": "thrust-cpp20", "configurePreset": "thrust-cpp20", "inherits": "thrust-base" }, { "name": "cudax-base", "hidden": true, "inherits": "base" }, { "name": "cudax-cpp17", "configurePreset": "cudax-cpp17", "inherits": "cudax-base" }, { "name": "cudax-cpp20", 
"configurePreset": "cudax-cpp20", "inherits": "cudax-base" }, { "name": "cccl-infra", "configurePreset": "cccl-infra", "inherits": "base" } ] } cccl-2.5.0/CODE_OF_CONDUCT.md000066400000000000000000000066241463375617100152460ustar00rootroot00000000000000# Contributor Covenant Code of Conduct ## Overview Define the code of conduct followed and enforced for CCCL. ### Intended audience Community | Developers | Project Leads ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting GitHub_Conduct@nvidia.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq cccl-2.5.0/CONTRIBUTING.md000066400000000000000000000273571463375617100147060ustar00rootroot00000000000000 # Contributing to CCCL Thank you for your interest in contributing to the CUDA C++ Core Libraries (CCCL)! ## Getting Started 1. **Fork & Clone the Repository**: Fork the [CCCL GitHub Repository](https://github.com/nvidia/cccl) and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository). 2. **Set up Development Environment**: CCCL uses Development Containers to provide a consistent development environment for both local development and CI. Contributors are strongly encouraged to use these containers as they simplify environment setup. See the [Dev Containers guide](.devcontainer/README.md) for instructions on how to quickly get up and running using dev containers with or without VSCode. ## Making Changes 1. **Create a New Branch**: ```bash git checkout -b your-feature-branch ``` 2. **Make Changes**. 3. **Build and Test**: Ensure changes don't break existing functionality by building and running tests. ```bash ./ci/build_[thrust|cub|libcudacxx].sh -cxx -std -arch ./ci/test_[thrust|cub|libcudacxx].sh -cxx -std -arch ``` For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below. 4. **Commit Changes**: ```bash git commit -m "Brief description of the change" ``` ### Developer Guides For more information about design and development practices for each CCCL component, refer to the following developer guides: #### CUB - [CUB Developer Guide](cub/docs/developer_overview.rst) - General overview of the design of CUB internals - [CUB Test Overview](cub/docs/test_overview.rst) - Overview of how to write CUB unit tests - [CUB Tuning Infrastructure](cub/docs/tuning.rst) - Overview of CUB's performance tuning infrastructure - [CUB Benchmarks](cub/docs/benchmarking.rst) - Overview of CUB's performance benchmarks #### Thrust Coming soon! #### libcudacxx Coming soon! ## Building and Testing CCCL components are header-only libraries. This means there isn't a traditional build process for the library itself. However, before submitting contributions, it's a good idea to [build and run tests](#developer-guides). There are multiple options for building and running our tests. Which option you choose depends on your preferences and whether you are using [CCCL's DevContainers](.devcontainer/README.md) (highly recommended!). ### Using Manual Build Scripts #### Building Use the build scripts provided in the `ci/` directory to build tests for each component. Building tests does not require a GPU. ```bash ci/build_[thrust|cub|libcudacxx].sh -cxx -std -arch - **HOST_COMPILER**: The desired host compiler (e.g., `g++`, `clang++`). - **CXX_STANDARD**: The C++ standard version (e.g., `11`, `14`, `17`, `20`). - **GPU_ARCHS**: A semicolon-separated list of CUDA GPU architectures (e.g., `"70;85;90"`). 
This uses the same syntax as CMake's [CUDA_ARCHITECTURES](https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES): - `70` - both PTX and SASS - `70-real` - SASS only - `70-virtual` - PTX only **Example:** ```bash ./ci/build_cub.sh -cxx g++ -std 14 -arch "70;75;80-virtual" ``` #### Testing Use the test scripts provided in the `ci/` directory to run tests for each component. These take the same arguments as the build scripts and will automatically build the tests if they haven't already been built. Running tests requires a GPU. ```bash ci/test_[thrust|cub|libcudacxx].sh -cxx -std -arch ``` **Example:** ```bash ./ci/test_cub.sh -cxx g++ -std 14 -arch "70;75;80-virtual" ``` ### Using CMake Presets [CMake Presets](https://cmake.org/cmake/help/latest/manual/cmake-presets.7.html) are a set of configurations defined in a JSON file that specify project-wide build details for CMake. They provide a standardized and sharable way to configure, build, and test projects across different platforms and development environments. Presets are available from CMake versions 3.19 and later. There are three kinds of Presets - Configure Presets: specify options for the `cmake` command, - Build Presets: specify options for the `cmake --build` command, - Test Presets: specify options for the `ctest` command. In CCCL we provide many presets to be used out of the box. You can find the complete list in our corresponding [CMakePresets.json](./CMakePresets.json) file. These commands can be used to get lists of the configure, build, and test presets. ```bash cmake --list-presets # Configure presets cmake --build --list-presets # Build presets ctest --list-presets # Test presets ``` While there is a lot of overlap, there may be differences between the configure, build, and test presets to support various testing workflows. The `dev` presets are intended as a base for general development while the others are useful for replicating CI failures. #### Using CMake Presets via Command Line CMake automatically generates the preset build directories. You can configure, build and test for a specific preset (e.g. `thrust-cpp11`) via cmake from the root directory by appending `--preset=thrust-cpp11` to the corresponding commands. For example: ```bash cmake --preset=thrust-cpp11 cmake --build --preset=thrust-cpp11 ctest --preset=thrust-cpp11 ``` That will create `build//thrust-cpp11/` and build everything in there. The devcontainer name is inserted automatically on devcontainer builds to keep build artifacts separate for the different toolchains. It's also worth mentioning that additional cmake options can still be passed in and will override the preset settings. As a common example, the presets are currently always `60;70;80` for `CMAKE_CUDA_ARCHITECTURES`, but this can be overridden at configure time with something like: ```bash cmake --preset=thrust-cpp20 "-DCMAKE_CUDA_ARCHITECTURES=89" ``` > __Note__: Either using the `cmake` command from within the root directory or from within the build directory works, but will behave in slightly different ways. Building and running tests from the build directory will compile every target and run all of the tests configured in the configure step. Doing so from the root directory using the `--preset=` option will build and run a subset of configured targets and tests. 
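For example, the two workflows for the `thrust-cpp11` preset look roughly like this (a sketch; the build directory path is shown without the devcontainer infix described above):

```bash
# From the repository root: build and test only the subset wired to the preset.
cmake --preset=thrust-cpp11
cmake --build --preset=thrust-cpp11
ctest --preset=thrust-cpp11

# From inside the generated build directory: build every configured target and
# run every configured test.
cd build/thrust-cpp11
cmake --build .
ctest
```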
#### Using CMake Presets via VS Code GUI extension (Recommended when using DevContainers) The recommended way to use CMake Presets is via the VS Code extension [CMake Tools](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cmake-tools), already included in [CCCL's DevContainers](.devcontainer/README.md). As soon as you install the extension you would be able to see the sidebar menu below. ![cmaketools sidebar](/.devcontainer/img/cmaketools_sidebar.png) You can specify the desired CMake Preset by clicking the "Select Configure Preset" button under the "Configure" node (see image below). ![cmaketools presets](.devcontainer/img/cmaketools_presets.png) After that you can select the default build target from the "Build" node. As soon as you expand it, a list will appear with all the available targets that are included within the preset you selected. For example if you had selected the `all-dev` preset VS Code will display all the available targets we have in cccl. ![cmaketools presets](.devcontainer/img/cmaketools_targets.png) You can build the selected target by pressing the gear button ![gear](.devcontainer/img/build_button.png) at the bottom of the VS Code window. Alternatively you can select the desired target from either the "Debug" or "Launch" drop down menu (for debugging or running correspondingly). In that case after you select the target and either press "Run" ![run](.devcontainer/img/run.png) or "Debug" ![debug](.devcontainer/img/debug.png) the target will build on its own before running without the user having to build it explicitly from the gear button. --- We encourage users who want to debug device code to install the [Nsight Visual Studio Code Edition extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) that enables the VS Code frontend for `cuda-gdb`. To use it you should launch from the sidebar menu instead of pressing the "Debug" button from the bottom menu. ![nsight](.devcontainer/img/nsight.png) ## Creating a Pull Request 1. Push changes to your fork 2. Create a pull request targeting the `main` branch of the original CCCL repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request. 3. Describe the purpose and context of the changes in the pull request description. ## Code Formatting (pre-commit hooks) CCCL uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter versions and options are aligned for all developers. Additionally, there is a CI check in place to enforce that committed code follows our standards. The linters used by CCCL are listed in `.pre-commit-config.yaml`. For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html). To use `pre-commit`, install via `conda` or `pip`: ```bash conda config --add channels conda-forge conda install pre-commit ``` ```bash pip install pre-commit ``` Then run pre-commit hooks before committing code: ```bash pre-commit run ``` By default, pre-commit runs on staged files (only changes and additions that will be committed). To run pre-commit checks on all files, execute: ```bash pre-commit run --all-files ``` Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. 
This can be done by running: ```bash pre-commit install ``` Now code linters and formatters will be run each time you commit changes. You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`. ## Continuous Integration (CI) CCCL's CI pipeline tests across various CUDA versions, compilers, and GPU architectures. For external contributors, the CI pipeline will not begin until a maintainer leaves an `/ok to test` comment. For members of the NVIDIA GitHub enterprise, the CI pipeline will begin immediately. For a detailed overview of CCCL's CI, see [ci-overview.md](ci-overview.md). There is a CI check for pre-commit, called [pre-commit.ci](pre-commit.ci). This enforces that all linters (such as `clang-format`) pass. If pre-commit.ci is failing, you can comment `pre-commit.ci autofix` on a pull request to trigger the auto-fixer. The auto-fixer will push a commit to your pull request that applies changes made by pre-commit hooks. ## Review Process Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code. For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/). Further recommended reading for successful PR reviews: - [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/) - [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/) ## Thank You! Your contributions enhance CCCL for the entire community. We appreciate your effort and collaboration! cccl-2.5.0/LICENSE000066400000000000000000001020561463375617100134500ustar00rootroot00000000000000============================================================================== Thrust is under the Apache Licence v2.0, with some specific exceptions listed below libcu++ is under the Apache License v2.0 with LLVM Exceptions: ============================================================================== Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ============================================================================== Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): ============================================================================== ---- LLVM Exceptions to the Apache 2.0 License ---- As an exception, if, as a result of your compiling your source code, portions of this Software are embedded into an Object form of such source code, you may redistribute such embedded portions in such Object form without complying with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 
In addition, if you combine or link compiled forms of this Software with software that is licensed under the GPLv2 ("Combined Software") and if a court of competent jurisdiction determines that the patent provision (Section 3), the indemnity provision (Section 9) or other Section of the License conflicts with the conditions of the GPLv2, you may retroactively and prospectively choose to deem waived or otherwise exclude such Section(s) of the License, but only in their entirety and only with respect to the Combined Software. ============================================================================== Software from third parties included in the LLVM Project: ============================================================================== The LLVM Project contains third party software which is under different license terms. All such code will be identified clearly using at least one of two mechanisms: 1) It will be in a separate directory tree with its own `LICENSE.txt` or `LICENSE` file at the top containing the specific license and restrictions which apply to that software, or 2) It will contain specific license and restriction terms at the top of every file. ============================================================================== Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): ============================================================================== The libc++ library is dual licensed under both the University of Illinois "BSD-Like" license and the MIT license. As a user of this code you may choose to use it under either license. As a contributor, you agree to allow your code to be used under both. Full text of the relevant licenses is included below. ============================================================================== University of Illinois/NCSA Open Source License Copyright (c) 2009-2019 by the contributors listed in CREDITS.TXT All rights reserved. Developed by: LLVM Team University of Illinois at Urbana-Champaign http://llvm.org Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. * Neither the names of the LLVM Team, University of Illinois at Urbana-Champaign, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 
============================================================================== Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ============================================================================== Some libcudacxx components are covered by the below license. Each source file indicates which license it is under. ============================================================================== NVIDIA SOFTWARE LICENSE This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWAREâ€). This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. b. 
You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback†means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. 
For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. 
Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items†consisting of “commercial computer software†and “commercial computer software documentation†provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. (v. August 20, 2021) ================================================================================ Some portions of Thrust may be licensed under other compatible open-source licenses. Any divergence from the Apache 2 license will be noted in the source code where applicable. Portions under other terms include, but are not limited to: ================================================================================ Various C++ utility classes in Thrust are based on the Boost Iterator, Tuple, System, and Random Number libraries, which are provided under the Boost Software License: Boost Software License - Version 1.0 - August 17th, 2003 Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================================================ Portions of the thrust::complex implementation are derived from FreeBSD with the following terms: ================================================================================ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice[1] unmodified, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. [1] Individual copyright notices from the original authors are included in the relevant source files. ============================================================================== CUB's source code is released under the BSD 3-Clause license: ============================================================================== Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cccl-2.5.0/README.md000066400000000000000000000670741463375617100137340ustar00rootroot00000000000000[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) |[Contributor Guide](https://github.com/NVIDIA/cccl/blob/main/CONTRIBUTING.md)|[Dev Containers](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md)|[Discord](https://discord.gg/nvidiadeveloper)|[Godbolt](https://godbolt.org/z/x4G73af9a)|[GitHub Project](https://github.com/orgs/NVIDIA/projects/6)|[libcudacxx Docs](https://nvidia.github.io/cccl/libcudacxx/)|[Thrust Docs](https://nvidia.github.io/cccl/thrust/)|[CUB Docs](https://nvidia.github.io/cccl/cub/)| |-|-|-|-|-|-|-|-| # CUDA C++ Core Libraries (CCCL) Welcome to the CUDA C++ Core Libraries (CCCL) where our mission is to make CUDA C++ more delightful. This repository unifies three essential CUDA C++ libraries into a single, convenient repository: - [Thrust](thrust) ([former repo](https://github.com/nvidia/thrust)) - [CUB](cub) ([former repo](https://github.com/nvidia/cub)) - [libcudacxx](libcudacxx) ([former repo](https://github.com/nvidia/libcudacxx)) The goal of CCCL is to provide CUDA C++ developers with building blocks that make it easier to write safe and efficient code. Bringing these libraries together streamlines your development process and broadens your ability to leverage the power of CUDA C++. For more information about the decision to unify these projects, see the [announcement here](https://github.com/NVIDIA/cccl/discussions/520). ## Overview The concept for the CUDA C++ Core Libraries (CCCL) grew organically out of the Thrust, CUB, and libcudacxx projects that were developed independently over the years with a similar goal: to provide high-quality, high-performance, and easy-to-use C++ abstractions for CUDA developers. Naturally, there was a lot of overlap among the three projects, and it became clear the community would be better served by unifying them into a single repository. - **Thrust** is the C++ parallel algorithms library which inspired the introduction of parallel algorithms to the C++ Standard Library. Thrust's high-level interface greatly enhances programmer productivity while enabling performance portability between GPUs and multicore CPUs via configurable backends that allow using multiple parallel programming frameworks (such as CUDA, TBB, and OpenMP). - **CUB** is a lower-level, CUDA-specific library designed for speed-of-light parallel algorithms across all GPU architectures. In addition to device-wide algorithms, it provides *cooperative algorithms* like block-wide reduction and warp-wide scan, providing CUDA kernel developers with building blocks to create speed-of-light, custom kernels. - **libcudacxx** is the CUDA C++ Standard Library. It provides an implementation of the C++ Standard Library that works in both host and device code. 
Additionally, it provides abstractions for CUDA-specific hardware features like synchronization primitives, cache control, atomics, and more.

The main goal of CCCL is to fill a similar role that the Standard C++ Library fills for Standard C++: provide general-purpose, speed-of-light tools to CUDA C++ developers, allowing them to focus on solving the problems that matter. Unifying these projects is the first step towards realizing that goal.

## Example

This is a simple example demonstrating the use of CCCL functionality from Thrust, CUB, and libcudacxx. It shows how to use Thrust/CUB/libcudacxx to implement a simple parallel reduction kernel. Each thread block computes the sum of a subset of the array using `cub::BlockReduce`. The sum of each block is then reduced to a single value using an atomic add via `cuda::atomic_ref` from libcudacxx. It then shows how the same reduction can be done using Thrust's `reduce` algorithm and compares the results.

[Try it live on Godbolt!](https://godbolt.org/z/x4G73af9a)

```cpp
#include <cub/block/block_reduce.cuh>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>

#include <cuda/atomic>

#include <cassert>
#include <cstdio>
#include <iostream>

constexpr int block_size = 256;

__global__ void reduce(int const* data, int* result, int N)
{
  // Block-wide reduction from CUB
  using BlockReduce = cub::BlockReduce<int, block_size>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  int const index = threadIdx.x + blockIdx.x * blockDim.x;
  int sum = 0;
  if (index < N)
  {
    sum += data[index];
  }
  sum = BlockReduce(temp_storage).Sum(sum);

  // The first thread of each block accumulates the block-wide sum into the
  // final result using libcudacxx's cuda::atomic_ref
  if (threadIdx.x == 0)
  {
    cuda::atomic_ref<int, cuda::thread_scope_device> atomic_result(*result);
    atomic_result.fetch_add(sum, cuda::memory_order_relaxed);
  }
}

int main()
{
  // Allocate and initialize input data
  int const N = 1000;
  thrust::device_vector<int> data(N);
  thrust::fill(data.begin(), data.end(), 1);

  // Allocate output data
  thrust::device_vector<int> kernel_result(1);

  // Compute the sum reduction of `data` using a custom kernel
  int const num_blocks = (N + block_size - 1) / block_size;
  reduce<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(data.data()),
                                     thrust::raw_pointer_cast(kernel_result.data()),
                                     N);

  auto const err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
  {
    std::cout << "Error: " << cudaGetErrorString(err) << std::endl;
    return -1;
  }

  int const custom_result = kernel_result[0];

  // Compute the same sum reduction using Thrust
  int const thrust_result = thrust::reduce(thrust::device, data.begin(), data.end(), 0);

  // Ensure the two solutions are identical
  std::printf("Custom kernel sum: %d\n", custom_result);
  std::printf("Thrust reduce sum: %d\n", thrust_result);
  assert(kernel_result[0] == thrust_result);
  return 0;
}
```

## Getting Started

### Users

Everything in CCCL is header-only. Therefore, users need only concern themselves with how they get the header files and how they incorporate them into their build system.

#### CUDA Toolkit

The easiest way to get started using CCCL is via the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) which includes the CCCL headers. When you compile with `nvcc`, it automatically adds CCCL headers to your include path so you can simply `#include` any CCCL header in your code with no additional configuration required. If compiling with another compiler, you will need to update your build system's include search path to point to the CCCL headers in your CTK install (e.g., `/usr/local/cuda/include`).

```cpp
#include <thrust/device_vector.h>
#include <cub/cub.cuh>
#include <cuda/std/atomic>
```

#### GitHub

Users who want to stay on the cutting edge of CCCL development are encouraged to use CCCL from GitHub. Using a newer version of CCCL with an older version of the CUDA Toolkit is supported, but not the other way around.
For complete information on compatibility between CCCL and the CUDA Toolkit, see [our platform support](#platform-support). Everything in CCCL is header-only, so cloning and including it in a simple project is as easy as the following: ```bash git clone https://github.com/NVIDIA/cccl.git nvcc -Icccl/thrust -Icccl/libcudacxx/include -Icccl/cub main.cu -o main ``` > **Note** > Use `-I` and not `-isystem` to avoid collisions with the CCCL headers implicitly included by `nvcc` from the CUDA Toolkit. All CCCL headers use `#pragma system_header` to ensure warnings will still be silenced as if using `-isystem`, see https://github.com/NVIDIA/cccl/issues/527 for more information. #### Conda CCCL also provides conda packages of each release via the `conda-forge` channel: ```bash conda config --add channels conda-forge conda install cccl ``` This will install the latest CCCL to the conda environment's `$CONDA_PREFIX/include/` and `$CONDA_PREFIX/lib/cmake/` directories. It is discoverable by CMake via `find_package(CCCL)` and can be used by any compilers in the conda environment. For more information, see [this introduction to conda-forge](https://conda-forge.org/docs/user/introduction/). If you want to use the same CCCL version that shipped with a particular CUDA Toolkit, e.g. CUDA 12.4, you can install CCCL with: ```bash conda config --add channels conda-forge conda install cuda-cccl cuda-version=12.4 ``` The `cuda-cccl` metapackage installs the `cccl` version that shipped with the CUDA Toolkit corresponding to `cuda-version`. If you wish to update to the latest `cccl` after installing `cuda-cccl`, uninstall `cuda-cccl` before updating `cccl`: ```bash conda uninstall cuda-cccl conda install -c conda-forge cccl ``` > **Note** > There are also conda packages with names like `cuda-cccl_linux-64`. > Those packages contain the CCCL versions shipped as part of the CUDA Toolkit, but are designed for internal use by the CUDA Toolkit. > Install `cccl` or `cuda-cccl` instead, for compatibility with conda compilers. > For more information, see the [cccl conda-forge recipe](https://github.com/conda-forge/cccl-feedstock/blob/main/recipe/meta.yaml). ##### CMake Integration CCCL uses [CMake](https://cmake.org/) for all build and installation infrastructure, including tests as well as targets to link against in other CMake projects. Therefore, CMake is the recommended way to integrate CCCL into another project. For a complete example of how to do this using CMake Package Manager see [our example project](examples/example_project). Other build systems should work, but only CMake is tested. Contributions to simplify integrating CCCL into other build systems are welcome. ### Contributors Interested in contributing to making CCCL better? Check out our [Contributing Guide](CONTRIBUTING.md) for a comprehensive overview of everything you need to know to set up your development environment, make changes, run tests, and submit a PR. ## Platform Support **Objective:** This section describes where users can expect CCCL to compile and run successfully. In general, CCCL should work everywhere the CUDA Toolkit is supported, however, the devil is in the details. The sections below describe the details of support and testing for different versions of the CUDA Toolkit, host compilers, and C++ dialects. ### CUDA Toolkit (CTK) Compatibility **Summary:** - The latest version of CCCL is backward compatible with the current and preceding CTK major version series - CCCL is never forward compatible with any version of the CTK. 
Always use a CCCL version that is the same as or newer than the one included with your CTK.
- Minor version CCCL upgrades won't break existing code, but new features may not support all CTK versions

CCCL users are encouraged to capitalize on the latest enhancements and ["live at head"](https://www.youtube.com/watch?v=tISy7EJQPzI) by always using the newest version of CCCL. For a seamless experience, you can upgrade CCCL independently of the entire CUDA Toolkit. This is possible because CCCL maintains backward compatibility with the latest patch release of every minor CTK release from both the current and previous major version series. In some exceptional cases, the minimum supported minor version of a CUDA Toolkit release may need to be newer than the oldest release within its major version series. For instance, CCCL requires a minimum supported version of 11.1 from the 11.x series due to an unavoidable compiler issue present in CTK 11.0.

When a new major CTK is released, we drop support for the oldest supported major version.

| CCCL Version | Supports CUDA Toolkit Version |
|--------------|------------------------------------------------|
| 2.x | 11.1 - 11.8, 12.x (only latest patch releases) |
| 3.x (Future) | 12.x, 13.x (only latest patch releases) |

[Well-behaved code](#compatibility-guidelines) using the latest CCCL should compile and run successfully with any supported CTK version. Exceptions may occur for new features that depend on new CTK features, so those features would not work on older versions of the CTK. For example, C++20 support was not added to `nvcc` until CUDA 12.0, so CCCL features that depend on C++20 would not work with CTK 11.x.

Users can integrate a newer version of CCCL into an older CTK, but not the other way around. This means an older version of CCCL is not compatible with a newer CTK. In other words, **CCCL is never forward compatible with the CUDA Toolkit.**

The table below summarizes compatibility of the CTK and CCCL:

| CTK Version | Included CCCL Version | Desired CCCL | Supported? | Notes |
|:-----------:|:---------------------:|:--------------------:|:----------:|:----------------------------------------------------------:|
| CTK `X.Y` | CCCL `MAJOR.MINOR` | CCCL `MAJOR.MINOR+n` | ✅ | Some new features might not work |
| CTK `X.Y` | CCCL `MAJOR.MINOR` | CCCL `MAJOR+1.MINOR` | ✅ | Possible breaks; some new features might not be available |
| CTK `X.Y` | CCCL `MAJOR.MINOR` | CCCL `MAJOR+2.MINOR` | ❌ | CCCL supports only two CTK major versions |
| CTK `X.Y` | CCCL `MAJOR.MINOR` | CCCL `MAJOR.MINOR-n` | ❌ | CCCL isn't forward compatible |
| CTK `X.Y` | CCCL `MAJOR.MINOR` | CCCL `MAJOR-n.MINOR` | ❌ | CCCL isn't forward compatible |

For more information on CCCL versioning, API/ABI compatibility, and breaking changes see the [Versioning](#versioning) section below.
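When debugging a suspected version mismatch, it can help to print the versions a translation unit actually sees. The snippet below is only an illustrative sketch (it is not part of CCCL); it assumes the CCCL version macros from `<cuda/version>` described in the [Versioning](#versioning) section, together with the CUDA Runtime's standard version queries:

```cpp
#include <cuda/version>       // CCCL_MAJOR_VERSION, CCCL_MINOR_VERSION, CCCL_PATCH_VERSION
#include <cuda_runtime_api.h> // CUDART_VERSION, cudaRuntimeGetVersion, cudaDriverGetVersion

#include <cstdio>

int main()
{
  int runtime = 0;
  int driver  = 0;
  cudaRuntimeGetVersion(&runtime); // CUDA Runtime this binary is running against
  cudaDriverGetVersion(&driver);   // CUDA driver installed on the machine

  std::printf("CCCL headers:                 %d.%d.%d\n", CCCL_MAJOR_VERSION, CCCL_MINOR_VERSION, CCCL_PATCH_VERSION);
  std::printf("CTK headers (CUDART_VERSION): %d\n", CUDART_VERSION);
  std::printf("Runtime / driver:             %d / %d\n", runtime, driver);
  return 0;
}
```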
### Operating Systems Unless otherwise specified, CCCL supports all the same operating systems as the CUDA Toolkit, which are documented here: - [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) - [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#system-requirements) ### Host Compilers Unless otherwise specified, CCCL supports all the same host compilers as the CUDA Toolkit, which are documented here: - [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#host-compiler-support-policy) - [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#system-requirements) ### C++ Dialects - C++11 (Deprecated in Thrust/CUB, to be removed in next major version) - C++14 (Deprecated in Thrust/CUB, to be removed in next major version) - C++17 - C++20 ### GPU Architectures Unless otherwise specified, CCCL supports all the same GPU architectures/Compute Capabilities as the CUDA Toolkit, which are documented here: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capability Note that some features may only support certain architectures/Compute Capabilities. ### Testing Strategy CCCL's testing strategy strikes a balance between testing as many configurations as possible and maintaining reasonable CI times. For CUDA Toolkit versions, testing is done against both the oldest and the newest supported versions. For instance, if the latest version of the CUDA Toolkit is 12.3, tests are conducted against 11.1 and 12.3. For each CUDA version, builds are completed against all supported host compilers with all supported C++ dialects. The testing strategy and matrix are constantly evolving. The matrix defined in the [`ci/matrix.yaml`](ci/matrix.yaml) file is the definitive source of truth. For more information about our CI pipeline, see [here](ci-overview.md). ## Versioning **Objective:** This section describes how CCCL is versioned, API/ABI stability guarantees, and compatibility guidelines to minimize upgrade headaches. **Summary** - The entirety of CCCL's API shares a common semantic version across all components - Only the most recently released version is supported and fixes are not backported to prior releases - API breaking changes and incrementing CCCL's major version will only coincide with a new major version release of the CUDA Toolkit - Not all source breaking changes are considered breaking changes of the public API that warrant bumping the major version number - Do not rely on ABI stability of entities in the `cub::` or `thrust::` namespaces - ABI breaking changes for symbols in the `cuda::` namespace may happen at any time, but will be reflected by incrementing the ABI version which is embedded in an inline namespace for all `cuda::` symbols. Multiple ABI versions may be supported concurrently. **Note:** Prior to merging Thrust, CUB, and libcudacxx into this repository, each library was independently versioned according to semantic versioning. Starting with the 2.1 release, all three libraries synchronized their release versions in their separate repositories. Moving forward, CCCL will continue to be released under a single [semantic version](https://semver.org/), with 2.2.0 being the first release from the [nvidia/cccl](www.github.com/nvidia/cccl) repository. 
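For example, a project that requires a minimum CCCL version can check the unified version macros at compile time and fail with a clear message instead of an obscure error deep inside a header. This is only a sketch — the minimum version chosen here (2.2) is arbitrary:

```cpp
// Minimal sketch: require CCCL 2.2 or newer at compile time.
#include <cuda/version>

static_assert(CCCL_MAJOR_VERSION > 2 || (CCCL_MAJOR_VERSION == 2 && CCCL_MINOR_VERSION >= 2),
              "this project requires CCCL 2.2 or newer");

// Equivalently, compare the concatenated MMMmmmppp value:
static_assert(CCCL_VERSION >= 2002000, "this project requires CCCL 2.2 or newer");
```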
### Breaking Change A Breaking Change is a change to **explicitly supported** functionality between released versions that would require a user to do work in order to upgrade to the newer version. In the limit, [_any_ change](https://www.hyrumslaw.com/) has the potential to break someone somewhere. As a result, not all possible source breaking changes are considered Breaking Changes to the public API that warrant bumping the major semantic version. The sections below describe the details of breaking changes to CCCL's API and ABI. ### Application Programming Interface (API) CCCL's public API is the entirety of the functionality _intentionally_ exposed to provide the utility of the library. In other words, CCCL's public API goes beyond just function signatures and includes (but is not limited to): - The location and names of headers intended for direct inclusion in user code - The namespaces intended for direct use in user code - The declarations and/or definitions of functions, classes, and variables located in headers and intended for direct use in user code - The semantics of functions, classes, and variables intended for direct use in user code Moreover, CCCL's public API does **not** include any of the following: - Any symbol prefixed with `_` or `__` - Any symbol whose name contains `detail` including the `detail::` namespace or a macro - Any header file contained in a `detail/` directory or sub-directory thereof - The header files implicitly included by any header part of the public API In general, the goal is to avoid breaking anything in the public API. Such changes are made only if they offer users better performance, easier-to-understand APIs, and/or more consistent APIs. Any breaking change to the public API will require bumping CCCL's major version number. In keeping with [CUDA Minor Version Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/#minor-version-compatibility), API breaking changes and CCCL major version bumps will only occur coinciding with a new major version release of the CUDA Toolkit. Anything not part of the public API may change at any time without warning. #### API Versioning The public API of all CCCL's components share a unified semantic version of `MAJOR.MINOR.PATCH`. Only the most recently released version is supported. As a rule, features and bug fixes are not backported to previously released version or branches. The preferred method for querying the version is to use `CCCL_[MAJOR/MINOR/PATCH_]VERSION` as described below. For backwards compatibility, the Thrust/CUB/libcudacxxx version definitions are available and will always be consistent with `CCCL_VERSION`. Note that Thrust/CUB use a `MMMmmmpp` scheme whereas the CCCL and libcudacxx use `MMMmmmppp`. 
| | CCCL | libcudacxx | Thrust | CUB |
|------------------------|----------------------------|--------------------------------------------|-----------------------------|---------------------------|
| Header | `<cuda/version>` | `<cuda/std/version>` | `<thrust/version.h>` | `<cub/version.cuh>` |
| Major Version | `CCCL_MAJOR_VERSION` | `_LIBCUDACXX_CUDA_API_VERSION_MAJOR` | `THRUST_MAJOR_VERSION` | `CUB_MAJOR_VERSION` |
| Minor Version | `CCCL_MINOR_VERSION` | `_LIBCUDACXX_CUDA_API_VERSION_MINOR` | `THRUST_MINOR_VERSION` | `CUB_MINOR_VERSION` |
| Patch/Subminor Version | `CCCL_PATCH_VERSION` | `_LIBCUDACXX_CUDA_API_VERSION_PATCH` | `THRUST_SUBMINOR_VERSION` | `CUB_SUBMINOR_VERSION` |
| Concatenated Version | `CCCL_VERSION (MMMmmmppp)` | `_LIBCUDACXX_CUDA_API_VERSION (MMMmmmppp)` | `THRUST_VERSION (MMMmmmpp)` | `CUB_VERSION (MMMmmmpp)` |

### Application Binary Interface (ABI)

The Application Binary Interface (ABI) is a set of rules for:
- How a library's components are represented in machine code
- How those components interact across different translation units

A library's ABI includes, but is not limited to:
- The mangled names of functions and types
- The size and alignment of objects and types
- The semantics of the bytes in the binary representation of an object

An **ABI Breaking Change** is any change that results in a change to the ABI of a function or type in the public API. For example, adding a new data member to a struct is an ABI Breaking Change as it changes the size of the type.

In CCCL, the guarantees about ABI are as follows:

- Symbols in the `thrust::` and `cub::` namespaces may break ABI at any time without warning.
- The ABI of `thrust::` and `cub::` [symbols includes the CUDA architectures used for compilation](https://nvidia.github.io/cccl/cub/developer_overview.html#symbols-visibility). Therefore, a `thrust::` or `cub::` symbol may have a different ABI if:
  - compiled with different architectures
  - compiled as a CUDA source file (`-x cu`) vs C++ source (`-x cpp`)
- Symbols in the `cuda::` namespace may also break ABI at any time. However, `cuda::` symbols embed an ABI version number that is incremented whenever an ABI break occurs. Multiple ABI versions may be supported concurrently, and therefore users have the option to revert to a prior ABI version. For more information, see [here](libcudacxx/docs/releases/versioning.md).

**Who should care about ABI?**

In general, CCCL users only need to worry about ABI issues when building or using a binary artifact (like a shared library) whose API directly or indirectly includes types provided by CCCL.

For example, consider if `libA.so` was built using CCCL version `X` and its public API includes a function like:

```c++
void foo(cuda::std::optional<int>);
```

If another library, `libB.so`, is compiled using CCCL version `Y` and uses `foo` from `libA.so`, then this can fail if there was an ABI break between version `X` and `Y`. Unlike with API breaking changes, ABI breaks usually do not require code changes and only require recompiling everything to use the same ABI version.

To learn more about ABI and why it is important, see [What is ABI, and What Should C++ Do About It?](https://wg21.link/P2028R0).

### Compatibility Guidelines

As mentioned above, not all possible source breaking changes constitute a Breaking Change that would require incrementing CCCL's API major version number.
Users are encouraged to adhere to the following guidelines in order to minimize the risk of disruptions from accidentally depending on parts of CCCL that are not part of the public API: - Do not add any declarations to the `thrust::`, `cub::`, `nv::`, or `cuda::` namespaces unless an exception is noted for a specific symbol, e.g., specializing a type trait. - **Rationale**: This would cause symbol conflicts if a symbol is added with the same name. - Do not take the address of any API in the `thrust::`, `cub::`, `cuda::`, or `nv::` namespaces. - **Rationale**: This would prevent adding overloads of these APIs. - Do not forward declare any API in the `thrust::`, `cub::`, `cuda::`, or `nv::` namespaces. - **Rationale**: This would prevent adding overloads of these APIs. - Do not directly reference any symbol prefixed with `_`, `__`, or with `detail` anywhere in its name including a `detail::` namespace or macro - **Rationale**: These symbols are for internal use only and may change at any time without warning. - Include what you use. For every CCCL symbol that you use, directly `#include` the header file that declares that symbol. In other words, do not rely on headers implicitly included by other headers. - **Rationale**: Internal includes may change at any time. Portions of this section were inspired by [Abseil's Compatibility Guidelines](https://abseil.io/about/compatibility). ## Deprecation Policy We will do our best to notify users prior to making any breaking changes to the public API, ABI, or modifying the supported platforms and compilers. As appropriate, deprecations will come in the form of programmatic warnings which can be disabled. The deprecation period will depend on the impact of the change, but will usually last at least 2 minor version releases. ## Mapping to CTK Versions Coming soon! ## CI Pipeline Overview For a detailed overview of the CI pipeline, see [ci-overview.md](ci-overview.md). ## Related Projects Projects that are related to CCCL's mission to make CUDA C++ more delightful: - [cuCollections](https://github.com/NVIDIA/cuCollections) - GPU accelerated data structures like hash tables - [NVBench](https://github.com/NVIDIA/nvbench) - Benchmarking library tailored for CUDA applications - [stdexec](https://github.com/nvidia/stdexec) - Reference implementation for Senders asynchronous programming model ## Projects Using CCCL Does your project use CCCL? 
[Open a PR to add your project to this list!](https://github.com/NVIDIA/cccl/edit/main/README.md)

- [AmgX](https://github.com/NVIDIA/AMGX) - Multi-grid linear solver library
- [ColossalAI](https://github.com/hpcaitech/ColossalAI) - Tools for writing distributed deep learning models
- [cuDF](https://github.com/rapidsai/cudf) - Algorithms and file readers for ETL data analytics
- [cuGraph](https://github.com/rapidsai/cugraph) - Algorithms for graph analytics
- [cuML](https://github.com/rapidsai/cuml) - Machine learning algorithms and primitives
- [CuPy](https://cupy.dev) - NumPy & SciPy for GPU
- [cuSOLVER](https://developer.nvidia.com/cusolver) - Dense and sparse linear solvers
- [cuSpatial](https://github.com/rapidsai/cuspatial) - Algorithms for geospatial operations
- [GooFit](https://github.com/GooFit/GooFit) - Library for maximum-likelihood fits
- [HeavyDB](https://github.com/heavyai/heavydb) - SQL database engine
- [HOOMD](https://github.com/glotzerlab/hoomd-blue) - Monte Carlo and molecular dynamics simulations
- [HugeCTR](https://github.com/NVIDIA-Merlin/HugeCTR) - GPU-accelerated recommender framework
- [Hydra](https://github.com/MultithreadCorner/Hydra) - High-energy Physics Data Analysis
- [Hypre](https://github.com/hypre-space/hypre) - Multigrid linear solvers
- [LightSeq](https://github.com/bytedance/lightseq) - Training and inference for sequence processing and generation
- [MatX](https://github.com/NVIDIA/matx) - Numerical computing library using expression templates to provide efficient, Python-like syntax
- [PyTorch](https://github.com/pytorch/pytorch) - Tensor and neural network computations
- [Qiskit](https://github.com/Qiskit/qiskit-aer) - High performance simulator for quantum circuits
- [QUDA](https://github.com/lattice/quda) - Lattice quantum chromodynamics (QCD) computations
- [RAFT](https://github.com/rapidsai/raft) - Algorithms and primitives for machine learning
- [TensorFlow](https://github.com/tensorflow/tensorflow) - End-to-end platform for machine learning
- [TensorRT](https://github.com/NVIDIA/TensorRT) - Deep learning inference
- [tsne-cuda](https://github.com/CannyLab/tsne-cuda) - Stochastic Neighborhood Embedding library
- [Visualization Toolkit (VTK)](https://gitlab.kitware.com/vtk/vtk) - Rendering and visualization library
- [XGBoost](https://github.com/dmlc/xgboost) - Gradient boosting machine learning algorithms

cccl-2.5.0/SECURITY.md000066400000000000000000000016211463375617100142300ustar00rootroot00000000000000
## Security

NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. If you need to report a security issue, please use the appropriate contact points outlined below.
**Please do not report security vulnerabilities through GitHub.** ## Reporting Potential Security Vulnerability in an NVIDIA Product To report a potential security vulnerability in any NVIDIA product: - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) - E-Mail: psirt@nvidia.com - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) - Please include the following information: - Product/Driver name and version/branch that contains the vulnerability cccl-2.5.0/benchmarks/000077500000000000000000000000001463375617100145545ustar00rootroot00000000000000cccl-2.5.0/benchmarks/cmake/000077500000000000000000000000001463375617100156345ustar00rootroot00000000000000cccl-2.5.0/benchmarks/cmake/CCCLBenchmarkRegistry.cmake000066400000000000000000000033521463375617100227110ustar00rootroot00000000000000find_package(CUDAToolkit REQUIRED) set(cccl_revision "") find_package(Git) if(GIT_FOUND) execute_process( COMMAND ${GIT_EXECUTABLE} describe WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE cccl_revision OUTPUT_STRIP_TRAILING_WHITESPACE) if(cccl_revision STREQUAL "") # Sometimes, there is no tag (shallow copy) execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE cccl_revision OUTPUT_STRIP_TRAILING_WHITESPACE) endif() endif() # Sometimes this script is used outside of a Git repository. # In this case, we read the revision from cccl/cccl_version instead. if ("${cccl_revision}" STREQUAL "") file(READ "${CMAKE_SOURCE_DIR}/cccl_version" cccl_revision) string(STRIP "${cccl_revision}" cccl_revision) string(REPLACE "\n" "" cccl_revision "${cccl_revision}") endif() message(STATUS "Git revision: ${cccl_revision}") function(get_meta_path meta_path) set(meta_path "${CMAKE_BINARY_DIR}/cccl_meta_bench.csv" PARENT_SCOPE) endfunction() function(create_benchmark_registry) get_meta_path(meta_path) set(ctk_version "${CUDAToolkit_VERSION}") message(STATUS "CTK version: ${ctk_version}") file(REMOVE "${meta_path}") file(APPEND "${meta_path}" "ctk_version,${ctk_version}\n") file(APPEND "${meta_path}" "cccl_revision,${cccl_revision}\n") endfunction() function(register_cccl_tuning bench_name ranges) get_meta_path(meta_path) if ("${ranges}" STREQUAL "") file(APPEND "${meta_path}" "${bench_name}\n") else() file(APPEND "${meta_path}" "${bench_name},${ranges}\n") endif() endfunction() function(register_cccl_benchmark bench_name) register_cccl_tuning("${bench_name}" "") endfunction() cccl-2.5.0/benchmarks/scripts/000077500000000000000000000000001463375617100162435ustar00rootroot00000000000000cccl-2.5.0/benchmarks/scripts/.gitignore000066400000000000000000000000371463375617100202330ustar00rootroot00000000000000__pycache__/ *.pyc *.pyo *.pyd cccl-2.5.0/benchmarks/scripts/analyze.py000077500000000000000000000645171463375617100203000ustar00rootroot00000000000000#!/usr/bin/env python3 import os import re import json import cccl import math import argparse import itertools import functools import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.stats import mannwhitneyu from scipy.stats.mstats import hdquantiles pd.options.display.max_colwidth = 100 default_colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] color_cycle = itertools.cycle(default_colors) color_map = {} precision = 0.01 sensitivity = 0.5 def get_bench_columns(): return ['variant', 'elapsed', 'center', 
'samples', 'bw'] def get_extended_bench_columns(): return get_bench_columns() + ['speedup', 'base_samples'] def compute_speedup(df): bench_columns = get_bench_columns() workload_columns = [col for col in df.columns if col not in bench_columns] base_df = df[df['variant'] == 'base'].drop(columns=['variant']).rename( columns={'center': 'base_center', 'samples': 'base_samples'}) base_df.drop(columns=['elapsed', 'bw'], inplace=True) merged_df = df.merge( base_df, on=[col for col in df.columns if col in workload_columns]) merged_df['speedup'] = merged_df['base_center'] / merged_df['center'] merged_df = merged_df.drop(columns=['base_center']) return merged_df def get_ct_axes(df): ct_axes = [] for col in df.columns: if '{ct}' in col: ct_axes.append(col) return ct_axes def get_rt_axes(df): rt_axes = [] excluded_columns = get_ct_axes(df) + get_extended_bench_columns() for col in df.columns: if col not in excluded_columns: rt_axes.append(col) return rt_axes def ct_space(df): ct_axes = get_ct_axes(df) unique_ct_combinations = [] for _, row in df[ct_axes].drop_duplicates().iterrows(): unique_ct_combinations.append({}) for col in ct_axes: unique_ct_combinations[-1][col] = row[col] return unique_ct_combinations def extract_case(df, ct_point): tuning_df_loc = None for ct_axis in ct_point: if tuning_df_loc is None: tuning_df_loc = (df[ct_axis] == ct_point[ct_axis]) else: tuning_df_loc = tuning_df_loc & (df[ct_axis] == ct_point[ct_axis]) tuning_df = df.loc[tuning_df_loc].copy() for ct_axis in ct_point: tuning_df.drop(columns=[ct_axis], inplace=True) return tuning_df def extract_rt_axes_values(df): rt_axes = get_rt_axes(df) rt_axes_values = {} for rt_axis in rt_axes: rt_axes_values[rt_axis] = list(df[rt_axis].unique()) return rt_axes_values def extract_rt_space(df): rt_axes = get_rt_axes(df) rt_axes_values = [] for rt_axis in rt_axes: values = df[rt_axis].unique() rt_axes_values.append(["{}={}".format(rt_axis, v) for v in values]) return list(itertools.product(*rt_axes_values)) def filter_variants(df, group): rt_axes = get_rt_axes(df) unique_combinations = set( df[rt_axes].drop_duplicates().itertuples(index=False)) group_combinations = set( group[rt_axes].drop_duplicates().itertuples(index=False)) has_all_combinations = group_combinations == unique_combinations return has_all_combinations def extract_complete_variants(df): return df.groupby('variant').filter(functools.partial(filter_variants, df)) def compute_workload_score(rt_axes_values, rt_axes_ids, weights, row): rt_workload = [] for rt_axis in rt_axes_values: rt_workload.append("{}={}".format(rt_axis, row[rt_axis])) weight = cccl.bench.get_workload_weight(rt_workload, rt_axes_values, rt_axes_ids, weights) return row['speedup'] * weight def compute_variant_score(rt_axes_values, rt_axes_ids, weight_matrix, group): workload_score_closure = functools.partial(compute_workload_score, rt_axes_values, rt_axes_ids, weight_matrix) score_sum = group.apply(workload_score_closure, axis=1).sum() return score_sum def extract_scores(dfs): rt_axes_values = {} for subbench in dfs: rt_axes_values[subbench] = extract_rt_axes_values(dfs[subbench]) rt_axes_ids = cccl.bench.compute_axes_ids(rt_axes_values) weights = cccl.bench.compute_weight_matrices(rt_axes_values, rt_axes_ids) score_dfs = [] for subbench in dfs: score_closure = functools.partial( compute_variant_score, rt_axes_values[subbench], rt_axes_ids[subbench], weights[subbench]) grouped = dfs[subbench].groupby('variant') scores = grouped.apply(score_closure).reset_index() scores.columns = ['variant', 'score'] stat = 
grouped.agg(mins=('speedup', 'min'), means=('speedup', 'mean'), maxs=('speedup', 'max')) scores = pd.merge(scores, stat, on='variant') score_dfs.append(scores) score_df = pd.concat(score_dfs) result = score_df.groupby('variant').agg( {'score': 'sum', 'mins': 'min', 'means': 'mean', 'maxs': 'max'}).reset_index() return result.sort_values(by=['score'], ascending=False) def distributions_are_different(alpha, row): ref_samples = row['base_samples'] cmp_samples = row['samples'] # H0: the distributions are not different # H1: the distribution are different _, p = mannwhitneyu(ref_samples, cmp_samples) # Reject H0 return p < alpha def remove_matching_distributions(alpha, df): closure = functools.partial(distributions_are_different, alpha) return df[df.apply(closure, axis=1)] def get_filenames_map(arr): if not arr: return [] prefix = arr[0] for string in arr: while not string.startswith(prefix): prefix = prefix[:-1] if not prefix: break return {string: string[len(prefix):] for string in arr} def is_finite(x): if isinstance(x, float): return x != np.inf and x != -np.inf return True def iterate_case_dfs(args, callable): storages = {} algnames = set() filenames_map = get_filenames_map(args.files) for file in args.files: storage = cccl.bench.StorageBase(file) algnames.update(storage.algnames()) storages[filenames_map[file]] = storage pattern = re.compile(args.R) exact_values = {} if args.args: for value in args.args: name, val = value.split('=') exact_values[name] = val for algname in algnames: if not pattern.match(algname): continue case_dfs = {} for subbench in storage.subbenches(algname): for file in storages: storage = storages[file] df = storage.alg_to_df(algname, subbench) df = df.map(lambda x: x if is_finite(x) else np.nan) df = df.dropna(subset=['center'], how='all') for _, row in df[['ctk', 'cccl']].drop_duplicates().iterrows(): ctk_version = row['ctk'] cccl_version = row['cccl'] ctk_cub_df = df[(df['ctk'] == ctk_version) & (df['cccl'] == cccl_version)] for gpu in ctk_cub_df['gpu'].unique(): target_df = ctk_cub_df[ctk_cub_df['gpu'] == gpu] target_df = target_df.drop(columns=['ctk', 'cccl', 'gpu']) target_df = compute_speedup(target_df) for key in exact_values: if key in target_df.columns: target_df = target_df[target_df[key] == exact_values[key]] for ct_point in ct_space(target_df): point_str = ", ".join(["{}={}".format(k, ct_point[k]) for k in ct_point]) case_df = extract_complete_variants(extract_case(target_df, ct_point)) case_df['variant'] = case_df['variant'].astype(str) + " ({})".format(file) if point_str not in case_dfs: case_dfs[point_str] = {} if subbench not in case_dfs[point_str]: case_dfs[point_str][subbench] = case_df else: case_dfs[point_str][subbench] = pd.concat([case_dfs[point_str][subbench], case_df]) for point_str in case_dfs: callable(algname, point_str, case_dfs[point_str]) def case_top(alpha, N, algname, ct_point_name, case_dfs): print("{}[{}]:".format(algname, ct_point_name)) if alpha < 1.0: case_df = remove_matching_distributions(alpha, case_df) for subbench in case_dfs: case_dfs[subbench] = extract_complete_variants(case_dfs[subbench]) print(extract_scores(case_dfs).head(N)) def top(args): iterate_case_dfs(args, functools.partial(case_top, args.alpha, args.top)) def case_coverage(algname, ct_point_name, case_dfs): num_variants = cccl.bench.Config().variant_space_size(algname) min_coverage = 100.0 for subbench in case_dfs: num_covered_variants = len(case_dfs[subbench]['variant'].unique()) coverage = (num_covered_variants / num_variants) * 100 min_coverage = 
min(min_coverage, coverage) case_str = "{}[{}]".format(algname, ct_point_name) print("{} coverage: {} / {} ({:.4f}%)".format( case_str, num_covered_variants, num_variants, min_coverage)) def coverage(args): iterate_case_dfs(args, case_coverage) def parallel_coordinates_plot(df, title): # Parallel coordinates plot adaptation of https://stackoverflow.com/a/69411450 import matplotlib.cm as cm from matplotlib.path import Path import matplotlib.patches as patches # Variables (the first variable must be categoric): my_vars = df.columns.tolist() df_plot = df[my_vars] df_plot = df_plot.dropna() df_plot = df_plot.reset_index(drop=True) # Convert to numeric matrix: ym = [] dics_vars = [] for v, var in enumerate(my_vars): if df_plot[var].dtype.kind not in ["i", "u", "f"]: dic_var = dict([(val, c) for c, val in enumerate(df_plot[var].unique())]) dics_vars += [dic_var] ym += [[dic_var[i] for i in df_plot[var].tolist()]] else: ym += [df_plot[var].tolist()] ym = np.array(ym).T # Padding: ymins = ym.min(axis=0) ymaxs = ym.max(axis=0) dys = ymaxs - ymins ymins -= dys*0.05 ymaxs += dys*0.05 dys = ymaxs - ymins # Adjust to the main axis: zs = np.zeros_like(ym) zs[:, 0] = ym[:, 0] zs[:, 1:] = (ym[:, 1:] - ymins[1:])/dys[1:]*dys[0] + ymins[0] # Plot: fig, host_ax = plt.subplots(figsize=(20, 10), tight_layout=True) # Make the axes: axes = [host_ax] + [host_ax.twinx() for i in range(ym.shape[1] - 1)] dic_count = 0 for i, ax in enumerate(axes): ax.set_ylim( bottom=ymins[i], top=ymaxs[i] ) ax.spines.top.set_visible(False) ax.spines.bottom.set_visible(False) ax.ticklabel_format(style='plain') if ax != host_ax: ax.spines.left.set_visible(False) ax.yaxis.set_ticks_position("right") ax.spines.right.set_position(("axes", i/(ym.shape[1] - 1))) if df_plot.iloc[:, i].dtype.kind not in ["i", "u", "f"]: dic_var_i = dics_vars[dic_count] ax.set_yticks(range(len(dic_var_i))) if i == 0: ax.set_yticklabels([]) else: ax.set_yticklabels([key_val for key_val in dics_vars[dic_count].keys()]) dic_count += 1 host_ax.set_xlim(left=0, right=ym.shape[1] - 1) host_ax.set_xticks(range(ym.shape[1])) host_ax.set_xticklabels(my_vars, fontsize=14) host_ax.tick_params(axis="x", which="major", pad=7) # Color map: colormap = cm.get_cmap('turbo') # Normalize speedups: df["speedup_normalized"] = ( df["speedup"] - df["speedup"].min()) / (df["speedup"].max() - df["speedup"].min()) # Make the curves: host_ax.spines.right.set_visible(False) host_ax.xaxis.tick_top() for j in range(ym.shape[0]): verts = list(zip([x for x in np.linspace(0, len(ym) - 1, len(ym)*3 - 2, endpoint=True)], np.repeat(zs[j, :], 3)[1: -1])) codes = [Path.MOVETO] + [Path.CURVE4 for _ in range(len(verts) - 1)] path = Path(verts, codes) color_first_cat_var = colormap(df.loc[j, "speedup_normalized"]) patch = patches.PathPatch( path, facecolor="none", lw=2, alpha=0.05, edgecolor=color_first_cat_var) host_ax.add_patch(patch) host_ax.set_title(title) plt.show() def case_coverage_plot(algname, ct_point_name, case_dfs): data_list = [] for subbench in case_dfs: for _, row_description in case_dfs[subbench].iterrows(): variant = row_description['variant'] speedup = row_description['speedup'] if variant.startswith('base'): continue varname, _ = variant.split(' ') params = varname.split('.') data_dict = {'variant': variant} for param in params: print(variant) name, val = param.split('_') data_dict[name] = int(val) data_dict['speedup'] = speedup # data_dict['variant'] = variant data_list.append(data_dict) df = pd.DataFrame(data_list) parallel_coordinates_plot(df, "{} ({})".format(algname, 
ct_point_name)) def coverage_plot(args): iterate_case_dfs(args, case_coverage_plot) def case_pair_plot(algname, ct_point_name, case_dfs): import seaborn as sns data_list = [] for subbench in case_dfs: for _, row_description in case_dfs[subbench].iterrows(): variant = row_description['variant'] speedup = row_description['speedup'] if variant.startswith('base'): continue varname, _ = variant.split(' ') params = varname.split('.') data_dict = {} for param in params: print(variant) name, val = param.split('_') data_dict[name] = int(val) data_dict['speedup'] = speedup data_list.append(data_dict) df = pd.DataFrame(data_list) sns.pairplot(df, hue='speedup') plt.title("{} ({})".format(algname, ct_point_name)) plt.show() def pair_plot(args): iterate_case_dfs(args, case_pair_plot) def qrde_hd(samples): """ Computes quantile-respectful density estimation based on the Harrell-Davis quantile estimator. The implementation is based on the following post: https://aakinshin.net/posts/qrde-hd by Andrey Akinshin """ min_sample, max_sample = min(samples), max(samples) num_quantiles = math.ceil(1.0 / precision) quantiles = np.linspace(precision, 1 - precision, num_quantiles - 1) hd_quantiles = [min_sample] + list(hdquantiles(samples, quantiles)) + [max_sample] width = [hd_quantiles[idx + 1] - hd_quantiles[idx] for idx in range(num_quantiles)] p = 1.0 / precision height = [1.0 / (p * w) for w in width] return width, height def extract_peaks(pdf): peaks = [] for i in range(1, len(pdf) - 1): if pdf[i - 1] < pdf[i] > pdf[i + 1]: peaks.append(i) return peaks def extract_modes(samples): """ Extract modes from the given samples based on the lowland algorithm: https://aakinshin.net/posts/lowland-multimodality-detection/ by Andrey Akinshin Implementation is based on the https://github.com/AndreyAkinshin/perfolizer LowlandModalityDetector class. 
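    Two neighboring peaks are reported as separate modes when the histogram bins lying between them (at or below the lower peak) cover less than `sensitivity` of the rectangle spanned by that gap and the lower peak's height, i.e. when a sufficiently deep lowland separates them.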
""" mode_ids = [] widths, heights = qrde_hd(samples) peak_ids = extract_peaks(heights) bin_area = 1.0 / len(heights) x = min(samples) peak_xs = [] peak_ys = [] bin_lower = [x] for idx in range(len(heights)): if idx in peak_ids: peak_ys.append(heights[idx]) peak_xs.append(x + widths[idx] / 2) x += widths[idx] bin_lower.append(x) def lowland_between(mode_candidate, left_peak, right_peak): left, right = left_peak, right_peak min_height = min(heights[left_peak], heights[right_peak]) while left < right and heights[left] > min_height: left += 1 while left < right and heights[right] > min_height: right -= 1 width = bin_lower[right + 1] - bin_lower[left] total_area = width * min_height total_bin_area = (right - left + 1) * bin_area if total_bin_area / total_area < sensitivity: mode_ids.append(mode_candidate) return True return False previousPeaks = [peak_ids[0]] for i in range(1, len(peak_ids)): currentPeak = peak_ids[i] while previousPeaks and heights[previousPeaks[-1]] < heights[currentPeak]: if lowland_between(previousPeaks[0], previousPeaks[-1], currentPeak): previousPeaks = [] else: previousPeaks.pop() if previousPeaks and heights[previousPeaks[-1]] > heights[currentPeak]: if lowland_between(previousPeaks[0], previousPeaks[-1], currentPeak): previousPeaks = [] previousPeaks.append(currentPeak) mode_ids.append(previousPeaks[0]) return mode_ids def hd_displot(samples, label, ax): if label not in color_map: color_map[label] = next(color_cycle) color = color_map[label] widths, heights = qrde_hd(samples) mode_ids = extract_modes(samples) min_sample, max_sample = min(samples), max(samples) xs = [min_sample] ys = [0] peak_xs = [] peak_ys = [] x = min(samples) for idx in range(len(widths)): xs.append(x + widths[idx] / 2) ys.append(heights[idx]) if idx in mode_ids: peak_ys.append(heights[idx]) peak_xs.append(x + widths[idx] / 2) x += widths[idx] xs = xs + [max_sample] ys = ys + [0] ax.fill_between(xs, ys, 0, alpha=0.4, color=color) quartiles_of_interest = [0.25, 0.5, 0.75] for quartile in quartiles_of_interest: bin = int(quartile / precision) + 1 ax.plot([xs[bin], xs[bin]], [0, ys[bin]], color=color) ax.plot(xs, ys, label=label, color=color) ax.plot(peak_xs, peak_ys, 'o', color=color) ax.legend() def displot(data, ax): for variant in data: hd_displot(data[variant], variant, ax) def variant_ratio(data, variant, ax): if variant not in color_map: color_map[variant] = next(color_cycle) color = color_map[variant] variant_samples = data[variant] base_samples = data['base'] variant_widths, variant_heights = qrde_hd(variant_samples) base_widths, base_heights = qrde_hd(base_samples) quantiles = [] ratios = [] base_x = min(base_samples) variant_x = min(variant_samples) for i in range(1, len(variant_heights) - 1): base_x += base_widths[i] / 2 variant_x += variant_widths[i] / 2 quantiles.append(i * precision) ratios.append(base_x / variant_x) ax.plot(quantiles, ratios, label=variant, color=color) ax.axhline(1, color='red', alpha=0.7) ax.legend() ax.tick_params(axis='both', direction='in', pad=-22) def ratio(data, ax): for variant in data: if variant != 'base': variant_ratio(data, variant, ax) def case_variants(pattern, mode, algname, ct_point_name, case_dfs): for subbench in case_dfs: case_df = case_dfs[subbench] title = "{}[{}]:".format(algname + '/' + subbench, ct_point_name) df = case_df[case_df['variant'].str.contains(pattern, regex=True)].reset_index(drop=True) rt_axes = get_rt_axes(df) rt_axes_values = extract_rt_axes_values(df) vertical_axis_name = rt_axes[0] if 'Elements{io}[pow2]' in rt_axes: 
vertical_axis_name = 'Elements{io}[pow2]' horizontal_axes = rt_axes horizontal_axes.remove(vertical_axis_name) vertical_axis_values = rt_axes_values[vertical_axis_name] vertical_axis_ids = {} for idx, val in enumerate(vertical_axis_values): vertical_axis_ids[val] = idx def extract_horizontal_space(df): values = [] for rt_axis in horizontal_axes: values.append(["{}={}".format(rt_axis, v) for v in df[rt_axis].unique()]) return list(itertools.product(*values)) if len(horizontal_axes) > 0: idx = 0 horizontal_axis_ids = {} for point in extract_horizontal_space(df): horizontal_axis_ids[" / ".join(point)] = idx idx = idx + 1 num_rows = len(vertical_axis_ids) num_cols = max(1, len(extract_horizontal_space(df))) if num_rows == 0: return fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, gridspec_kw = {'wspace': 0, 'hspace': 0}) for _, vertical_row_description in df[[vertical_axis_name]].drop_duplicates().iterrows(): vertical_val = vertical_row_description[vertical_axis_name] vertical_id = vertical_axis_ids[vertical_val] vertical_name = "{}={}".format(vertical_axis_name, vertical_val) vertical_df = df[df[vertical_axis_name] == vertical_val] for _, horizontal_row_description in vertical_df[horizontal_axes].drop_duplicates().iterrows(): horizontal_df = vertical_df for axis in horizontal_axes: horizontal_df = horizontal_df[horizontal_df[axis] == horizontal_row_description[axis]] horizontal_id = 0 if len(horizontal_axes) > 0: horizontal_point = [] for rt_axis in horizontal_axes: horizontal_point.append("{}={}".format(rt_axis, horizontal_row_description[rt_axis])) horizontal_name = " / ".join(horizontal_point) horizontal_id = horizontal_axis_ids[horizontal_name] ax=axes[vertical_id, horizontal_id] else: ax=axes[vertical_id] ax.set_ylabel(vertical_name) data = {} for _, variant in horizontal_df[['variant']].drop_duplicates().iterrows(): variant_name = variant['variant'] if 'base' not in data: data['base'] = horizontal_df[horizontal_df['variant'] == variant_name].iloc[0]['base_samples'] data[variant_name] = horizontal_df[horizontal_df['variant'] == variant_name].iloc[0]['samples'] if mode == 'pdf': # sns.histplot(data=data, ax=ax, kde=True) displot(data, ax) else: ratio(data, ax) if len(horizontal_axes) > 0: ax=axes[vertical_id, horizontal_id] if vertical_id == (num_rows - 1): ax.set_xlabel(horizontal_name) if horizontal_id == 0: ax.set_ylabel(vertical_name) else: ax.set_ylabel('') for ax in axes.flat: ax.set_xticklabels([]) fig.suptitle(title) plt.tight_layout() plt.show() def variants(args, mode): pattern = re.compile(args.variants_pdf) if mode == 'pdf' else re.compile(args.variants_ratio) iterate_case_dfs(args, functools.partial(case_variants, pattern, mode)) def file_exists(value): if not os.path.isfile(value): raise argparse.ArgumentTypeError(f"The file '{value}' does not exist.") return value def case_offload(algname, ct_point_name, case_dfs): for subbench in case_dfs: df = case_dfs[subbench] for rt_point in extract_rt_space(df): point_df = df for rt_kv in rt_point: key, value = rt_kv.split('=') point_df = point_df[point_df[key] == value] point_name = ct_point_name + " " + " ".join(rt_point) point_name = point_name.replace(',', '') bench_name = "{}.{}-{}".format(algname, subbench, point_name) bench_name = bench_name.replace(' ', '___') bench_name = "".join(c if c.isalnum() else "_" for c in bench_name) with open(bench_name + '.json', 'w') as f: obj = json.loads(point_df.to_json(orient='records')) json.dump(obj, f, indent=2) def offload(args): iterate_case_dfs(args, case_offload) def 
parse_arguments(): parser = argparse.ArgumentParser(description="Analyze benchmark results.") parser.add_argument( '-R', type=str, default='.*', help="Regex for benchmarks selection.") parser.add_argument( '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.") parser.add_argument( '--coverage', action=argparse.BooleanOptionalAction, help="Show variant space coverage.") parser.add_argument( '--coverage-plot', action=argparse.BooleanOptionalAction, help="Plot variant space coverage.") parser.add_argument( '--pair-plot', action=argparse.BooleanOptionalAction, help="Pair plot.") parser.add_argument( '--top', default=7, type=int, action='store', nargs='?', help="Show top N variants with highest score.") parser.add_argument( 'files', type=file_exists, nargs='+', help='At least one file is required.') parser.add_argument( '--alpha', default=1.0, type=float) parser.add_argument( '--variants-pdf', type=str, help="Show matching variants data.") parser.add_argument( '--variants-ratio', type=str, help="Show matching variants data.") parser.add_argument('-a', '--args', action='append', type=str, help="Parameter in the format `Param=Value`.") parser.add_argument( '-o', '--offload', action=argparse.BooleanOptionalAction, help="Offload samples") return parser.parse_args() def main(): args = parse_arguments() if args.list_benches: cccl.bench.list_benches() return if args.coverage: coverage(args) return if args.coverage_plot: coverage_plot(args) return if args.pair_plot: pair_plot(args) return if args.variants_pdf: variants(args, 'pdf') return if args.variants_ratio: variants(args, 'ratio') return if args.offload: offload(args) return top(args) if __name__ == "__main__": main() cccl-2.5.0/benchmarks/scripts/cccl/000077500000000000000000000000001463375617100171475ustar00rootroot00000000000000cccl-2.5.0/benchmarks/scripts/cccl/__init__.py000066400000000000000000000000241463375617100212540ustar00rootroot00000000000000from . 
import bench cccl-2.5.0/benchmarks/scripts/cccl/bench/000077500000000000000000000000001463375617100202265ustar00rootroot00000000000000cccl-2.5.0/benchmarks/scripts/cccl/bench/__init__.py000066400000000000000000000002121463375617100223320ustar00rootroot00000000000000from .config import * from .storage import * from .bench import Bench from .cmake import CMake from .score import * from .search import * cccl-2.5.0/benchmarks/scripts/cccl/bench/bench.py000066400000000000000000000553521463375617100216710ustar00rootroot00000000000000import os import json import time import fpzip import signal import itertools import subprocess import numpy as np from .cmake import CMake from .config import * from .storage import Storage, get_bench_table_name from .score import * from .logger import * def first_val(my_dict): values = list(my_dict.values()) first_value = values[0] if not all(value == first_value for value in values): raise ValueError('All values in the dictionary are not equal') return first_value class JsonCache: _instance = None def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) cls._instance.bench_cache = {} cls._instance.device_cache = {} return cls._instance def get_bench(self, algname): if algname not in self.bench_cache: result = subprocess.check_output( [os.path.join('.', 'bin', algname + '.base'), "--jsonlist-benches"]) self.bench_cache[algname] = json.loads(result) return self.bench_cache[algname] def get_device(self, algname): if algname not in self.device_cache: result = subprocess.check_output( [os.path.join('.', 'bin', algname + '.base'), "--jsonlist-devices"]) devices = json.loads(result)["devices"] if len(devices) != 1: raise Exception( "NVBench doesn't work well with multiple GPUs, use `CUDA_VISIBLE_DEVICES`") self.device_cache[algname] = devices[0] return self.device_cache[algname] def json_benches(algname): return JsonCache().get_bench(algname) def create_benches_tables(conn, subbench, bench_axes): with conn: conn.execute(""" CREATE TABLE IF NOT EXISTS subbenches ( algorithm TEXT NOT NULL, bench TEXT NOT NULL, UNIQUE(algorithm, bench) ); """) for algorithm_name in bench_axes: axes = bench_axes[algorithm_name] column_names = ", ".join(["\"{}\"".format(name) for name in axes]) columns = ", ".join(["\"{}\" TEXT".format(name) for name in axes]) conn.execute(""" INSERT INTO subbenches (algorithm, bench) VALUES (?, ?) 
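            -- subbenches declares UNIQUE(algorithm, bench), so re-registering an existing pair is a no-op.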
ON CONFLICT DO NOTHING; """, (algorithm_name, subbench)) if axes: columns = ", " + columns column_names = ", " + column_names conn.execute(""" CREATE TABLE IF NOT EXISTS "{0}" ( ctk TEXT NOT NULL, cccl TEXT NOT NULL, gpu TEXT NOT NULL, variant TEXT NOT NULL, elapsed REAL, center REAL, bw REAL, samples BLOB {1} , UNIQUE(ctk, cccl, gpu, variant {2}) ); """.format(get_bench_table_name(subbench, algorithm_name), columns, column_names)) def read_json(filename): with open(filename, "r") as f: file_root = json.load(f) return file_root def extract_filename(summary): summary_data = summary["data"] value_data = next(filter(lambda v: v["name"] == "filename", summary_data)) assert (value_data["type"] == "string") return value_data["value"] def extract_size(summary): summary_data = summary["data"] value_data = next(filter(lambda v: v["name"] == "size", summary_data)) assert (value_data["type"] == "int64") return int(value_data["value"]) def extract_bw(summary): summary_data = summary["data"] value_data = next(filter(lambda v: v["name"] == "value", summary_data)) assert (value_data["type"] == "float64") return float(value_data["value"]) def parse_samples_meta(state): summaries = state["summaries"] if not summaries: return None, None summary = next(filter(lambda s: s["tag"] == "nv/json/bin:nv/cold/sample_times", summaries), None) if not summary: return None, None sample_filename = extract_filename(summary) sample_count = extract_size(summary) return sample_count, sample_filename def parse_samples(state): sample_count, samples_filename = parse_samples_meta(state) if not sample_count or not samples_filename: return np.array([], dtype=np.float32) with open(samples_filename, "rb") as f: samples = np.fromfile(f, " 1: benchmarks[name] = parse_ranges(columns[1:]) else: benchmarks[name] = [] return ctk_version, cccl_revision, benchmarks class Config: _instance = None def __new__(cls, *args, **kwargs): if cls._instance is None: cls._instance = super().__new__(cls, *args, **kwargs) cls._instance.ctk, cls._instance.cccl, cls._instance.benchmarks = parse_meta() return cls._instance def label_to_variant_point(self, algname, label): if label == "base": return BasePoint() label_to_definition = {} for param_space in self.benchmarks[algname]: label_to_definition[param_space.label] = param_space.definition points = [] for point in label.split('.'): label, value = point.split('_') points.append(RangePoint(label_to_definition[label], label, int(value))) return VariantPoint(points) def variant_space(self, algname): variants = [] for param_space in self.benchmarks[algname]: variants.append([]) for value in range(param_space.low, param_space.high, param_space.step): variants[-1].append(RangePoint(param_space.definition, param_space.label, value)) return (VariantPoint(points) for points in randomized_cartesian_product(variants)) def variant_space_size(self, algname): num_variants = 1 for param_space in self.benchmarks[algname]: num_variants = num_variants * len(range(param_space.low, param_space.high, param_space.step)) return num_variants cccl-2.5.0/benchmarks/scripts/cccl/bench/logger.py000066400000000000000000000011421463375617100220550ustar00rootroot00000000000000import logging class Logger: _instance = None def __new__(cls, *args, **kwargs): if cls._instance is None: cls._instance = super().__new__(cls, *args, **kwargs) logger = logging.getLogger() logger.setLevel(logging.DEBUG) file_handler = logging.FileHandler('cccl_meta_bench.log') file_handler.setFormatter(logging.Formatter('%(asctime)s: %(message)s')) 
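            # Logger is a process-wide singleton, so the file handler is attached exactly
            # once and every phase of a benchmark run appends to the same cccl_meta_bench.log.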
logger.addHandler(file_handler) cls._instance.logger = logger return cls._instance def info(self, message): self.logger.info(message) cccl-2.5.0/benchmarks/scripts/cccl/bench/score.py000066400000000000000000000056151463375617100217220ustar00rootroot00000000000000import math import numpy as np def importance_function(x): return 1 - math.exp(-x) def x_by_importance(y): return -math.log(1 - y) def compute_weights(num_values): least_importance = 0.6 most_importance = 0.999 assert(least_importance < most_importance) assert(least_importance >= 0 and least_importance < 1) assert(most_importance > 0 and most_importance < 1) begin = x_by_importance(least_importance) end = x_by_importance(most_importance) rng = end - begin step = rng / num_values return np.array([importance_function(begin + x * step) for x in range(num_values)]) def io_weights(values): return compute_weights(len(values)) def ei_weights(values): return np.ones(len(values)) def compute_axes_ids(rt_axes_values): result = {} for bench in rt_axes_values: rt_axes_ids = {} axis_id = 0 for rt_axis in rt_axes_values[bench]: rt_axes_ids[rt_axis] = axis_id axis_id = axis_id + 1 result[bench] = rt_axes_ids return result def compute_raw_weight_matrix(rt_axes_values, rt_axes_ids): rt_axes_weights = {} first_rt_axis = True first_rt_axis_name = None for rt_axis in rt_axes_values: if first_rt_axis: first_rt_axis_name = rt_axis first_rt_axis = False values = rt_axes_values[rt_axis] rt_axes_values[rt_axis] = values if '{io}' in rt_axis: rt_axes_weights[rt_axis] = io_weights(values) else: rt_axes_weights[rt_axis] = ei_weights(values) num_rt_axes = len(rt_axes_ids) for rt_axis in rt_axes_weights: shape = [1] * num_rt_axes shape[rt_axes_ids[rt_axis]] = -1 rt_axes_weights[rt_axis] = rt_axes_weights[rt_axis].reshape(*shape) weights_matrix = rt_axes_weights[first_rt_axis_name] for rt_axis in rt_axes_weights: if rt_axis == first_rt_axis_name: continue weights_matrix = weights_matrix * rt_axes_weights[rt_axis] return weights_matrix def compute_weight_matrices(rt_axes_values, rt_axes_ids): matrices = {} aggregate = 0.0 for bench in rt_axes_values: matrices[bench] = compute_raw_weight_matrix(rt_axes_values[bench], rt_axes_ids[bench]) aggregate = aggregate + np.sum(matrices[bench]) for bench in rt_axes_values: matrices[bench] = matrices[bench] / aggregate return matrices def get_workload_coordinates(rt_workload, rt_axes_values, rt_axes_ids): coordinates = [0] * len(rt_axes_ids) for point in rt_workload: rt_axis, rt_value = point.split('=') coordinates[rt_axes_ids[rt_axis]] = rt_axes_values[rt_axis].index(rt_value) return coordinates def get_workload_weight(rt_workload, rt_axes_values, rt_axes_ids, weights_matrix): coordinates = get_workload_coordinates(rt_workload, rt_axes_values, rt_axes_ids) return weights_matrix[tuple(coordinates)] cccl-2.5.0/benchmarks/scripts/cccl/bench/search.py000066400000000000000000000104411463375617100220450ustar00rootroot00000000000000import re import argparse import numpy as np from .bench import Bench, BaseBench from .config import Config from .storage import Storage from .cmake import CMake def list_benches(): print("### Benchmarks") config = Config() for algname in config.benchmarks: space_size = config.variant_space_size(algname) print(" * `{}`: {} variants: ".format(algname, space_size)) for param_space in config.benchmarks[algname]: param_name = param_space.label param_rng = (param_space.low, param_space.high, param_space.step) print(" * `{}`: {}".format(param_name, param_rng)) def parse_sub_space(args): sub_space = {} for axis 
in args: name, value = axis.split('=') if '[' in value: value = value.replace('[', '').replace(']', '') values = value.split(',') else: values = [value] sub_space[name] = values return sub_space def parse_arguments(): parser = argparse.ArgumentParser( description="Runs benchmarks and stores results in a database.") parser.add_argument('-R', type=str, default='.*', help="Regex for benchmarks selection.") parser.add_argument('-a', '--args', action='append', type=str, help="Parameter in the format `Param=Value`.") parser.add_argument( '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.") parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one') parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks') parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks") return parser.parse_args() def run_benches(algnames, sub_space, seeker): for algname in algnames: try: bench = BaseBench(algname) ct_space = bench.ct_workload_space(sub_space) rt_values = bench.rt_axes_values(sub_space) seeker(algname, ct_space, rt_values) except Exception as e: print("#### ERROR exception occured while running {}: '{}'".format(algname, e)) def filter_benchmarks_by_regex(benchmarks, R): pattern = re.compile(R) return list(filter(lambda x: pattern.match(x), benchmarks)) def filter_benchmarks(benchmarks, args): if args.run_shard >= args.num_shards: raise ValueError('run-shard must be less than num-shards') algnames = filter_benchmarks_by_regex(benchmarks.keys(), args.R) if args.P0: algnames = filter_benchmarks_by_regex(algnames, '^(?!.*segmented).*(scan|reduce|select|sort).*') algnames.sort() if args.num_shards > 1: algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist() return algnames return algnames def search(seeker): args = parse_arguments() if not Storage().exists(): CMake().clean() config = Config() print(" ctk: ", config.ctk) print("cccl: ", config.cccl) workload_sub_space = {} if args.args: workload_sub_space = parse_sub_space(args.args) if args.list_benches: list_benches() return run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker) class MedianCenterEstimator: def __init__(self): pass def __call__(self, samples): if len(samples) == 0: return float("inf") return float(np.median(samples)) class BruteForceSeeker: def __init__(self, base_center_estimator, variant_center_estimator): self.base_center_estimator = base_center_estimator self.variant_center_estimator = variant_center_estimator def __call__(self, algname, ct_workload_space, rt_values): variants = Config().variant_space(algname) for ct_workload in ct_workload_space: for variant in variants: bench = Bench(algname, variant, list(ct_workload)) if bench.build(): score = bench.score(ct_workload, rt_values, self.base_center_estimator, self.variant_center_estimator) print(bench.label(), score) cccl-2.5.0/benchmarks/scripts/cccl/bench/storage.py000066400000000000000000000035611463375617100222510ustar00rootroot00000000000000import os import fpzip import sqlite3 import numpy as np import pandas as pd db_name = "cccl_meta_bench.db" def get_bench_table_name(subbench, algname): return "{}.{}".format(algname, subbench) def blob_to_samples(blob): return np.squeeze(fpzip.decompress(blob)) class StorageBase: def __init__(self, db_path): self.conn = sqlite3.connect(db_path) def connection(self): return self.conn def exists(self): return os.path.exists(db_name) def 
algnames(self): with self.conn: rows = self.conn.execute('SELECT DISTINCT algorithm FROM subbenches').fetchall() return [row[0] for row in rows] def subbenches(self, algname): with self.conn: rows = self.conn.execute('SELECT DISTINCT bench FROM subbenches WHERE algorithm=?', (algname,)).fetchall() return [row[0] for row in rows] def alg_to_df(self, algname, subbench): table = get_bench_table_name(subbench, algname) with self.conn: df = pd.read_sql_query("SELECT * FROM \"{}\"".format(table), self.conn) df['samples'] = df['samples'].apply(blob_to_samples) return df def store_df(self, algname, df): df['samples'] = df['samples'].apply(fpzip.compress) df.to_sql(algname, self.conn, if_exists='replace', index=False) class Storage: _instance = None def __new__(cls, *args, **kwargs): if cls._instance is None: cls._instance = super().__new__(cls, *args, **kwargs) cls._instance.base = StorageBase(db_name) return cls._instance def connection(self): return self.base.connection() def exists(self): return self.base.exists() def algnames(self): return self.base.algnames() def alg_to_df(self, algname, subbench): return self.base.alg_to_df(algname, subbench) cccl-2.5.0/benchmarks/scripts/run.py000077500000000000000000000041121463375617100174220ustar00rootroot00000000000000#!/usr/bin/env python3 import os import sys import math import cccl.bench def elapsed_time_looks_good(x): if isinstance(x, float): if math.isfinite(x): return True return False def get_largest_problem_size(rt_values): # Small problem sizes do not utilize entire GPU. # Benchmarking small problem sizes in environments where we do not control # distributions comparison, e.g. CI, is not useful because of stability issues. elements = [] for element in rt_values: if element.isdigit(): elements.append(int(element)) return [str(max(elements))] def filter_runtime_workloads_for_ci(rt_values): for subbench in rt_values: for axis in rt_values[subbench]: if axis.startswith('Elements') and axis.endswith('[pow2]'): rt_values[subbench][axis] = get_largest_problem_size(rt_values[subbench][axis]) return rt_values class BaseRunner: def __init__(self): self.estimator = cccl.bench.MedianCenterEstimator() def __call__(self, algname, ct_workload_space, rt_values): failure_occured = False rt_values = filter_runtime_workloads_for_ci(rt_values) for ct_workload in ct_workload_space: bench = cccl.bench.BaseBench(algname) if bench.build(): # might throw results = bench.run(ct_workload, rt_values, self.estimator, False) for subbench in results: for point in results[subbench]: bench_name = "{}.{}-{}".format(bench.algorithm_name(), subbench, point) bench_name = bench_name.replace(' ', '___') bench_name = "".join(c if c.isalnum() else "_" for c in bench_name) elapsed_time = results[subbench][point] if elapsed_time_looks_good(elapsed_time): print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time)) else: failure_occured = True print("&&&& FAILED {}".format(algname)) if failure_occured: sys.exit(1) def main(): print("&&&& RUNNING bench") os.environ["CUDA_MODULE_LOADING"] = "EAGER" cccl.bench.search(BaseRunner()) print("&&&& PASSED bench") if __name__ == "__main__": main() cccl-2.5.0/benchmarks/scripts/search.py000077500000000000000000000005001463375617100200600ustar00rootroot00000000000000#!/usr/bin/env python3 import cccl.bench as bench # TODO: # - driver version # - host compiler + version # - gpu clocks / pm # - ecc def main(): center_estimator = bench.MedianCenterEstimator() bench.search(bench.BruteForceSeeker(center_estimator, center_estimator)) if __name__ == 
"__main__": main() cccl-2.5.0/benchmarks/scripts/verify.py000077500000000000000000000043361463375617100201320ustar00rootroot00000000000000#!/usr/bin/env python3 import sys import argparse import cccl.bench def parse_arguments(): parser = argparse.ArgumentParser(description='Verify tuning variant') parser.add_argument('--variant', type=str, help='Variant to verify', default=None, required=True) variant = parser.parse_known_args()[0].variant sys.argv.remove('--variant={}'.format(variant)) return variant def workload_header(ct_workload_space, rt_workload_space): for ct_workload in ct_workload_space: for rt_workload in rt_workload_space: workload_point = ct_workload + rt_workload return ", ".join([x.split('=')[0] for x in workload_point]) def workload_entry(ct_workload, rt_workload): workload_point = ct_workload + rt_workload return ", ".join([x.split('=')[1] for x in workload_point]) class VerifySeeker: def __init__(self, variant_label): self.label = variant_label self.estimator = cccl.bench.MedianCenterEstimator() def __call__(self, algname, ct_workload_space, rt_workload_space): variant_point = cccl.bench.Config().label_to_variant_point(algname, self.label) print("{}, MinS, MedianS, MaxS".format(workload_header(ct_workload_space, rt_workload_space))) for ct_workload in ct_workload_space: bench = cccl.bench.Bench(algname, variant_point, list(ct_workload)) if bench.build(): base = bench.get_base() for rt_workload in rt_workload_space: workload_point = ct_workload + rt_workload base_samples, base_elapsed = base.do_run(workload_point, None) variant_samples, _ = bench.do_run(workload_point, base_elapsed * 10) min_speedup = min(base_samples) / min(variant_samples) median_speedup = self.estimator(base_samples) / self.estimator(variant_samples) max_speedup = max(base_samples) / max(variant_samples) point_str = workload_entry(ct_workload, rt_workload) print("{}, {}, {}, {}".format(point_str, min_speedup, median_speedup, max_speedup)) def main(): cccl.bench.search(VerifySeeker(parse_arguments())) if __name__ == "__main__": main() cccl-2.5.0/ci-overview.md000066400000000000000000000161371463375617100152300ustar00rootroot00000000000000# Continuous Integration (CI) Overview for CCCL The Continuous Integration (CI) process for CCCL ensures code quality and compatibility across various environments. This document provides an in-depth overview of the CI setup and workflows, enabling contributors to understand, debug, and reproduce CI checks locally. ## CI Environment and Configuration ### Development Containers CCCL's CI jobs use the same Development Containers as described in the [Dev Container setup](.devcontainer/README.md). Follow the instructions in that guide to set up a development container with the same environment as CI. ### Matrix Testing To ensure compatibility across various setups, CI tests are performed across a broad matrix of: - CUDA versions - Compilers - GPU architectures - Operating systems The exact combinations of these environments are defined in the [`ci/matrix.yaml`](ci/matrix.yaml) file. ### Viewing CI Workflow Results The results of every job in the CI pipeline are summarized on the bottom of the PR page. Click the "Details" link next to each run to provide more detailed information. ![Summary of all CI jobs on PR page.](docs/images/pr-checks.png). ### Special CI Commands Special commands are provided that can be included in commit messages to direct the CI pipeline execution: - `[skip ci]`: Skips the entire CI pipeline. 
Useful for documentation changes or others that don't require CI validation. - **Example:** `git commit -m "[skip ci] Update README."` - `[skip-tests]`: Skips CI jobs that execute tests, but runs all other jobs. Useful to avoid time-consuming tests when changes are unlikely to affect them. - `[all-projects]`: CI normally skips projects that don't have changes in themselves or their dependencies. This forces all projects to build. - `[workflow:]`: Execute jobs from the named workflow. Example: `[workflow:nightly]` runs all jobs defined in `matrix.yaml`'s `workflows.nightly` list. Use these commands judiciously. While they offer flexibility, they should be used appropriately to maintain the codebase's integrity and quality. ### Temporarily Overriding the Pull Request Matrix If a workflow named `override` exists in the matrix.yaml file, this matrix will be used for pull requests instead of the `pull_request` matrix. This is useful for reducing resource usage when launching many CI workflows from a PR (for example, while testing CI features). The overridden CI job will be marked as a failure until the override is removed. Example: ``` workflows: override: - {jobs: ['test'], std: 17, ctk: *ctk_curr, cxx: [*gcc12, *llvm16, *msvc2022]} pull_request: - <...> ``` ### Accelerating Build Times with `sccache` CCCL's CI uses [`sccache`](https://github.com/mozilla/sccache) to cache compiler artifacts for files that haven't changed and dramatically accelerate build times. Local builds inside [CCCL's Dev Containers](.devcontainer/README.md) can share the same cache such that local builds and CI jobs mutually benefit from accelerated build times. Follow the [GitHub Authentication](.devcontainer/README.md#optional-authenticate-with-github-for-sccache) guide to enable this feature. ### Build and Test Scripts CI jobs employ the build and test scripts in the `ci/` directory to build and run tests. These scripts provide a consistent entry point for building and testing in both local and CI environments. For more information on using these scripts, see the [CONTRIBUTING.md guide](CONTRIBUTING.md#building-and-testing). ### Reproducing CI Failures Locally If a pull request encounters a failure during CI testing, it is usually helpful to reproduce the issue locally to diagnose and fix it. Here is a step-by-step guide to recreate the exact environment and situation: 1. **Get the Appropriate Development Container**: CI jobs use the same [development containers](.devcontainer/README) as those used for local development. In order to simplify reproducing an issue in CI, it is recommended to use the same container locally. The CI logs will mention the exact environment used. 2. **Run the Build/Test Script**: CI jobs use the build and test scripts found in the `ci/` directory. Example: ```bash ./ci/build_cub.sh ./ci/test_cub.sh ``` The CI logs provide exact instructions on the scripts and parameters used. Here is an example of a CI failure message that includes instructions how to clone the exact same commit and run the relevant script in the appropriate container. Note that the instructions may have changed. Refer to the latest failure log for the most up-to-date instructions. ![Shows an example of a CI failure log with reproducer instructions](docs/images/repro_instructions.png). ## CI Workflow Details ### Triggering Mechanism and `copy-pr-bot` CCCL uses [NVIDIA's self-hosted action runners](https://docs.gha-runners.nvidia.com/runners/) for CI jobs. 
For security, PR workflows are triggered using the [`copy-pr-bot` GitHub application](https://docs.gha-runners.nvidia.com/onboarding/), which copies code to a prefixed branch to ensure only vetted code runs on the runners. The CI pipeline will not start automatically for external contributors. A repository member will first review the changes and initiate the CI pipeline with an `/ok to test` comment. ### SSH Signing Keys [Signed commits](https://docs.github.com/en/authentication/managing-commit-signature-verification/signing-commits) are required for any internal NVIDIA contributors who want the convenience of CI running automatically whenever a commit is pushed to a branch (i.e., doesn't require using `/ok to test`). This is not required for external contributions, which will always require an explicit `/ok to test` comment from an approved account for each CI run. To enable commit signing using your existing ssh key, set the following git options: ```bash git config --global gpg.format ssh git config --global user.signingKey ~/.ssh/YOUR_PUBLIC_KEY_FILE_HERE.pub # These settings are optional. They tell git to automatically sign all new commits and tags. # If these are set to false, use `git commit -S` to manually sign each commit. git config --global commit.gpgsign true git config --global tag.gpgsign true ``` Git is now configured to sign commits with your ssh key. To complete the process, upload the public key to your [Github Signing Keys](https://github.com/settings/keys) in your browser or using the `gh` CLI tool: ``` gh ssh-key add ~/.ssh/YOUR_PUBLIC_KEY_FILE_HERE.pub --type signing ``` Make sure that the key is uploaded to 'Signing Keys', not just 'Authentication Keys'. The same key may be used for both. ## Troubleshooting CI Failures 1. **Review CI logs**: Examine CI logs for specific error messages (see [Viewing CI Workflow Results](#viewing-ci-workflow-results)) 2. **Reproduce Locally**: Try replicating the issue locally (see [Reproducing CI Failures Locally](#reproducing-ci-failures-locally)) 3. **Ask for Assistance**: If stuck, don't hesitate to reach out to the @NVIDIA/cccl team on an issue or PR, or ask a question by starting a [Discussion](https://github.com/NVIDIA/cccl/discussions). cccl-2.5.0/ci/000077500000000000000000000000001463375617100130325ustar00rootroot00000000000000cccl-2.5.0/ci/build_common.sh000077500000000000000000000170231463375617100160430ustar00rootroot00000000000000#!/bin/bash set -eo pipefail # Ensure the script is being executed in its containing directory cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; # Script defaults VERBOSE=${VERBOSE:-} HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++` CXX_STANDARD=17 CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc` CUDA_ARCHS= # Empty, use presets by default. GLOBAL_CMAKE_OPTIONS=() DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks. # Check if the correct number of arguments has been provided function usage { echo "Usage: $0 [OPTIONS]" echo echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores." echo echo "Options:" echo " -v/--verbose: enable shell echo for debugging" echo " -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" echo " -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" echo " -std: CUDA/C++ standard (Defaults to 17)" echo " -arch: Target CUDA arches, e.g. 
\"60-real;70;80-virtual\" (Defaults to value in presets file)" echo " -cmake-options: Additional options to pass to CMake" echo echo "Examples:" echo " $ PARALLEL_LEVEL=8 $0" echo " $ PARALLEL_LEVEL=8 $0 -cxx g++-9" echo " $ $0 -cxx clang++-8" echo " $ $0 -cxx g++-8 -std 14 -arch 80-real -v -cuda /usr/local/bin/nvcc" echo " $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\"" exit 1 } # Parse options # Copy the args into a temporary array, since we will modify them and # the parent script may still need them. args=("$@") while [ "${#args[@]}" -ne 0 ]; do case "${args[0]}" in -v | --verbose) VERBOSE=1; args=("${args[@]:1}");; -cxx) HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");; -std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");; -arch) CUDA_ARCHS="${args[1]}"; args=("${args[@]:2}");; -disable-benchmarks) DISABLE_CUB_BENCHMARKS=1; args=("${args[@]:1}");; -cmake-options) if [ -n "${args[1]}" ]; then IFS=' ' read -ra split_args <<< "${args[1]}" GLOBAL_CMAKE_OPTIONS+=("${split_args[@]}") args=("${args[@]:2}") else echo "Error: No arguments provided for -cmake-options" usage exit 1 fi ;; -h | -help | --help) usage ;; *) echo "Unrecognized option: ${args[0]}"; usage ;; esac done # Convert to full paths: HOST_COMPILER=$(which ${HOST_COMPILER}) CUDA_COMPILER=$(which ${CUDA_COMPILER}) if [[ -n "${CUDA_ARCHS}" ]]; then GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}") fi if [ $VERBOSE ]; then set -x fi # Begin processing unsets after option parsing set -u readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)} if [ -z ${CCCL_BUILD_INFIX+x} ]; then CCCL_BUILD_INFIX="" fi # Presets will be configured in this directory: BUILD_DIR="../build/${CCCL_BUILD_INFIX}" # The most recent build will always be symlinked to cccl/build/latest mkdir -p $BUILD_DIR rm -f ../build/latest ln -sf $BUILD_DIR ../build/latest # Now that BUILD_DIR exists, use readlink to canonicalize the path: BUILD_DIR=$(readlink -f "${BUILD_DIR}") # Prepare environment for CMake: export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}" export CTEST_PARALLEL_LEVEL="1" export CXX="${HOST_COMPILER}" export CUDACXX="${CUDA_COMPILER}" export CUDAHOSTCXX="${HOST_COMPILER}" export CXX_STANDARD source ./pretty_printing.sh print_environment_details() { begin_group "âš™ï¸ Environment Details" echo "pwd=$(pwd)" print_var_values \ BUILD_DIR \ CXX_STANDARD \ CXX \ CUDACXX \ CUDAHOSTCXX \ NVCC_VERSION \ CMAKE_BUILD_PARALLEL_LEVEL \ CTEST_PARALLEL_LEVEL \ CCCL_BUILD_INFIX \ GLOBAL_CMAKE_OPTIONS \ TBB_ROOT echo "Current commit is:" git log -1 || echo "Not a repository" if command -v nvidia-smi &> /dev/null; then nvidia-smi else echo "nvidia-smi not found" fi if command -v cmake &> /dev/null; then cmake --version else echo "cmake not found" fi if command -v ctest &> /dev/null; then ctest --version else echo "ctest not found" fi end_group "âš™ï¸ Environment Details" } fail_if_no_gpu() { if ! nvidia-smi &> /dev/null; then echo "Error: No NVIDIA GPU detected. Please ensure you have an NVIDIA GPU installed and the drivers are properly configured." 
>&2 exit 1 fi } function print_test_time_summary() { ctest_log=${1} if [ -f ${ctest_log} ]; then begin_group "â±ï¸ Longest Test Steps" # Only print the full output in CI: if [ -n "${GITHUB_ACTIONS:-}" ]; then cmake -DLOGFILE=${ctest_log} -P ../cmake/PrintCTestRunTimes.cmake else cmake -DLOGFILE=${ctest_log} -P ../cmake/PrintCTestRunTimes.cmake | head -n 15 fi end_group "â±ï¸ Longest Test Steps" fi } function configure_preset() { local BUILD_NAME=$1 local PRESET=$2 local CMAKE_OPTIONS=$3 local GROUP_NAME="ðŸ› ï¸ CMake Configure ${BUILD_NAME}" pushd .. > /dev/null run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE $CMAKE_OPTIONS "${GLOBAL_CMAKE_OPTIONS[@]}" status=$? popd > /dev/null return $status } function build_preset() { local BUILD_NAME=$1 local PRESET=$2 local green="1;32" local red="1;31" local GROUP_NAME="ðŸ—ï¸ Build ${BUILD_NAME}" local preset_dir="${BUILD_DIR}/${PRESET}" local sccache_json="${preset_dir}/sccache_stats.json" source "./sccache_stats.sh" "start" pushd .. > /dev/null run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v status=$? popd > /dev/null sccache --show-adv-stats --stats-format=json > "${sccache_json}" minimal_sccache_stats=$(source "./sccache_stats.sh" "end") # Only print detailed stats in actions workflow if [ -n "${GITHUB_ACTIONS:-}" ]; then begin_group "💲 sccache stats" echo "${minimal_sccache_stats}" sccache -s end_group begin_group "🥷 ninja build times" echo "The "weighted" time is the elapsed time of each build step divided by the number of tasks that were running in parallel. This makes it an excellent approximation of how "important" a slow step was. A link that is entirely or mostly serialized will have a weighted time that is the same or similar to its elapsed time. A compile that runs in parallel with 999 other compiles will have a weighted time that is tiny." ./ninja_summary.py -C ${BUILD_DIR}/${PRESET} || echo "Warning: ninja_summary.py failed to execute properly." end_group else echo $minimal_sccache_stats fi return $status } function test_preset() { local BUILD_NAME=$1 local PRESET=$2 local GPU_REQUIRED=${3:-true} if $GPU_REQUIRED; then fail_if_no_gpu fi local GROUP_NAME="🚀 Test ${BUILD_NAME}" local preset_dir="${BUILD_DIR}/${PRESET}" local ctest_log="${preset_dir}/ctest.log" pushd .. > /dev/null run_command "$GROUP_NAME" ctest --output-log "${ctest_log}" --preset=$PRESET status=$? popd > /dev/null print_test_time_summary ${ctest_log} return $status } function configure_and_build_preset() { local BUILD_NAME=$1 local PRESET=$2 local CMAKE_OPTIONS=$3 configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS" build_preset "$BUILD_NAME" "$PRESET" } cccl-2.5.0/ci/build_cub.sh000077500000000000000000000025531463375617100153260ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details # CUB benchmarks require at least CUDA nvcc 11.5 for int128 # Returns "true" if the first version is greater than or equal to the second version_compare() { if [[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" == "$2" ]]; then echo "true" else echo "false" fi } ENABLE_CUB_BENCHMARKS="false" ENABLE_CUB_RDC="false" if [[ "$CUDA_COMPILER" == *nvcc* ]]; then ENABLE_CUB_RDC="true" NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-) if [[ -n "${DISABLE_CUB_BENCHMARKS}" ]]; then echo "Benchmarks have been forcefully disabled." elif [[ $(version_compare $NVCC_VERSION 11.5) == "true" ]]; then ENABLE_CUB_BENCHMARKS="true" echo "nvcc version is $NVCC_VERSION. 
Building CUB benchmarks." else echo "nvcc version is $NVCC_VERSION. Not building CUB benchmarks because nvcc version is less than 11.5." fi else echo "Not building with NVCC, disabling RDC and benchmarks." fi if [[ "$HOST_COMPILER" == *icpc* ]]; then ENABLE_CUB_BENCHMARKS="false" fi PRESET="cub-cpp$CXX_STANDARD" CMAKE_OPTIONS=" -DCUB_ENABLE_BENCHMARKS="$ENABLE_CUB_BENCHMARKS"\ -DCUB_ENABLE_RDC_TESTS="$ENABLE_CUB_RDC" \ " configure_and_build_preset "CUB" "$PRESET" "$CMAKE_OPTIONS" print_time_summary cccl-2.5.0/ci/build_cudax.sh000077500000000000000000000003431463375617100156540ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details PRESET="cudax-cpp$CXX_STANDARD" CMAKE_OPTIONS="" configure_and_build_preset "CUDA Experimental" "$PRESET" "$CMAKE_OPTIONS" print_time_summary cccl-2.5.0/ci/build_libcudacxx.sh000077500000000000000000000003401463375617100166730ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details PRESET="libcudacxx-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" configure_and_build_preset libcudacxx "$PRESET" "$CMAKE_OPTIONS" print_time_summary cccl-2.5.0/ci/build_thrust.sh000077500000000000000000000003311463375617100160760ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details PRESET="thrust-cpp$CXX_STANDARD" CMAKE_OPTIONS="" configure_and_build_preset "Thrust" "$PRESET" "$CMAKE_OPTIONS" print_time_summary cccl-2.5.0/ci/infra_cccl.sh000077500000000000000000000005731463375617100154610ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details PRESET="cccl-infra" CMAKE_OPTIONS="" GPU_REQUIRED="false" if [ -n "${GITHUB_SHA:-}" ]; then CMAKE_OPTIONS="$CMAKE_OPTIONS -DCCCL_EXAMPLE_CPM_TAG=${GITHUB_SHA}" fi configure_preset "CCCL Infra" "$PRESET" "$CMAKE_OPTIONS" test_preset "CCCL Infra" "$PRESET" "$GPU_REQUIRED" print_time_summary cccl-2.5.0/ci/inspect_changes.sh000077500000000000000000000122131463375617100165250ustar00rootroot00000000000000#!/bin/bash # Github action script to identify which subprojects are dirty in a PR set -u # Usage: inspect_changes.sh if [ "$#" -ne 2 ]; then echo "Usage: $0 " exit 1 fi base_sha=$1 head_sha=$2 # Github gives the SHA as the current HEAD of the target ref, not the common ancestor. # Find the common ancestor and use that for the base. git fetch origin --unshallow -q git fetch origin $base_sha -q base_sha=$(git merge-base $head_sha $base_sha) # Define a list of subproject directories by their subdirectory name: subprojects=( cccl libcudacxx cub thrust cudax ) # ...and their dependencies: declare -A dependencies=( [cccl]="" [libcudacxx]="cccl" [cub]="cccl libcudacxx thrust" [thrust]="cccl libcudacxx cub" [cudax]="cccl libcudacxx" ) declare -A project_names=( [cccl]="CCCL Infrastructure" [libcudacxx]="libcu++" [cub]="CUB" [thrust]="Thrust" [cudax]="CUDA Experimental" ) # Usage checks: for subproject in "${subprojects[@]}"; do # Check that the subproject directory exists if [ "$subproject" != "cccl" ] && [ ! -d "$subproject" ]; then echo "Error: Subproject directory '$subproject' does not exist." exit 1 fi # If the subproject has dependencies, check that they exist (except for "cccl") for dependency in ${dependencies[$subproject]}; do if [ "$dependency" != "cccl" ] && [ ! -d "$dependency" ]; then echo "Error: Dependency directory '$dependency' for subproject '$subproject' does not exist." 
exit 1 fi done done write_output() { local key="$1" local value="$2" echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}" } tee_to_step_summary() { if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then tee -a "${GITHUB_STEP_SUMMARY}" else cat fi } dirty_files() { git diff --name-only "${base_sha}" "${head_sha}" } # Return 1 if any files outside of the subproject directories have changed inspect_cccl() { subprojs_grep_expr=$( IFS="|" echo "(${subprojects[*]})/" ) if dirty_files | grep -v -E "${subprojs_grep_expr}" | grep -q "."; then return 1 else return 0 fi } # inspect_subdir # Returns 1 if any files in the subdirectory have changed inspect_subdir() { local subdir="$1" if dirty_files | grep -E "^${subdir}/" | grep -q '.'; then return 1 else return 0 fi } # add_dependencies # if the subproject or any of its dependencies are dirty, return 1 add_dependencies() { local subproject="$1" # Check if ${subproject^^}_DIRTY is set to 1, return 1 if it is. local dirty_flag=${subproject^^}_DIRTY if [[ ${!dirty_flag} -ne 0 ]]; then return 1 fi for dependency in ${dependencies[$subproject]}; do dirty_flag="${dependency^^}_DIRTY" if [[ ${!dirty_flag} -ne 0 ]]; then return 1 fi done return 0 } main() { # Print the list of subprojects and all of their dependencies: echo "Subprojects: ${subprojects[*]}" echo echo "Dependencies:" for subproject in "${subprojects[@]}"; do printf " - %-27s -> %s\n" "$subproject (${project_names[$subproject]})" "${dependencies[$subproject]}" done echo echo "Base SHA: ${base_sha}" echo "HEAD SHA: ${head_sha}" echo check="+/-" no_check=" " get_checkmark() { if [[ $1 -eq 0 ]]; then echo "$no_check" else echo "$check" fi } # Print the list of files that have changed: echo "::group::Dirty files" dirty_files | sed 's/^/ - /' echo "::endgroup::" echo echo "

<details><summary><h3>👃 Inspect Changes</h3></summary>

" | tee_to_step_summary echo | tee_to_step_summary echo -e "### Modifications in project?\n" | tee_to_step_summary echo "| | Project" | tee_to_step_summary echo "|-----|---------" | tee_to_step_summary # Assign the return value of `inspect_cccl` to the variable `CCCL_DIRTY`: inspect_cccl CCCL_DIRTY=$? checkmark="$(get_checkmark ${CCCL_DIRTY})" echo "| ${checkmark} | ${project_names[cccl]}" | tee_to_step_summary # Check for changes in each subprojects directory: for subproject in "${subprojects[@]}"; do if [[ ${subproject} == "cccl" ]]; then # Special case handled above. continue fi inspect_subdir $subproject local dirty=$? declare ${subproject^^}_DIRTY=${dirty} checkmark="$(get_checkmark ${dirty})" echo "| ${checkmark} | ${project_names[$subproject]}" | tee_to_step_summary done echo | tee_to_step_summary echo -e "### Modifications in project or dependencies?\n" | tee_to_step_summary echo "| | Project" | tee_to_step_summary echo "|-----|---------" | tee_to_step_summary for subproject in "${subprojects[@]}"; do add_dependencies ${subproject} local dirty=$? declare ${subproject^^}_DIRTY=${dirty} checkmark="$(get_checkmark ${dirty})" echo "| ${checkmark} | ${project_names[$subproject]}" | tee_to_step_summary done echo "
" | tee_to_step_summary declare -a dirty_subprojects=() for subproject in "${subprojects[@]}"; do var_name="${subproject^^}_DIRTY" if [[ ${!var_name} -ne 0 ]]; then dirty_subprojects+=("$subproject") fi done write_output "DIRTY_PROJECTS" "${dirty_subprojects[*]}" } main "$@" cccl-2.5.0/ci/matrix.yaml000066400000000000000000000414501463375617100152260ustar00rootroot00000000000000ctk_11_1: &ctk_11_1 '11.1' ctk_11_8: &ctk_11_8 '11.8' ctk_12_0: &ctk_12_0 '12.0' ctk_curr: &ctk_curr '12.4' # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers devcontainer_version: '24.06' # gcc compiler configurations gcc6: &gcc6 { name: 'gcc', version: '6', exe: 'g++' } gcc7: &gcc7 { name: 'gcc', version: '7', exe: 'g++' } gcc8: &gcc8 { name: 'gcc', version: '8', exe: 'g++' } gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' } gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' } gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' } gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' } gcc13: &gcc13 { name: 'gcc', version: '13', exe: 'g++' } gcc-oldest: &gcc-oldest { name: 'gcc', version: '6', exe: 'g++' } gcc-newest: &gcc-newest { name: 'gcc', version: '13', exe: 'g++' } # LLVM Compiler configurations llvm9: &llvm9 { name: 'llvm', version: '9', exe: 'clang++' } llvm10: &llvm10 { name: 'llvm', version: '10', exe: 'clang++' } llvm11: &llvm11 { name: 'llvm', version: '11', exe: 'clang++' } llvm12: &llvm12 { name: 'llvm', version: '12', exe: 'clang++' } llvm13: &llvm13 { name: 'llvm', version: '13', exe: 'clang++' } llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' } llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' } llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' } llvm17: &llvm17 { name: 'llvm', version: '17', exe: 'clang++' } llvm-oldest: &llvm-oldest { name: 'llvm', version: '9', exe: 'clang++' } llvm-newest: &llvm-newest { name: 'llvm', version: '17', exe: 'clang++' } # MSVC configs msvc2017: &msvc2017 { name: 'cl', version: '14.16', exe: 'cl++' } msvc2019: &msvc2019 { name: 'cl', version: '14.29', exe: 'cl++' } msvc2022_1436: &msvc2022_1436 { name: 'cl', version: '14.36', exe: 'cl++' } msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' } # oneAPI configs oneapi: &oneapi { name: 'oneapi', version: '2023.2.0', exe: 'icpc' } # GHA Workflow job matrices: workflows: # If any jobs appear here, they will be executed instead of `pull_request' for PRs. # This is useful for limiting resource usage when a full matrix is not needed. # The branch protection checks will fail when using this override workflow. 
# # Example: # override: # - {jobs: ['build'], project['thrust'], std: 17, ctk: *ctk_curr, cxx: [*gcc12, *llvm16]} # override: pull_request: # Old CTK - {jobs: ['build'], std: 'all', ctk: *ctk_11_1, cxx: [*gcc6, *gcc7, *gcc8, *gcc9, *llvm9, *msvc2017]} - {jobs: ['build'], std: 'all', ctk: *ctk_11_8, cxx: [*gcc11], sm: '60;70;80;90'} # Current CTK - {jobs: ['build'], std: 'all', cxx: [*gcc7, *gcc8, *gcc9, *gcc10, *gcc11, *gcc12]} - {jobs: ['build'], std: 'all', cxx: [*llvm9, *llvm10, *llvm11, *llvm12, *llvm13, *llvm14, *llvm15, *llvm16]} - {jobs: ['build'], std: 'all', cxx: [*oneapi, *msvc2019]} - {jobs: ['test'], std: 'all', cxx: [*gcc13, *llvm17, *msvc2022]} # Modded builds: - {jobs: ['build'], std: 'all', cxx: [*gcc-newest, *llvm-newest], cpu: 'arm64'} - {jobs: ['build'], std: 'all', cxx: [*gcc-newest], sm: '90a'} # default_projects: clang-cuda - {jobs: ['build'], std: [17, 20], cudacxx: *llvm-newest, cxx: *llvm-newest} # nvrtc: - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all'} # verify-codegen: - {jobs: ['verify_codegen'], project: 'libcudacxx'} # cudax has different CTK reqs: # - {jobs: ['build'], project: 'cudax', ctk: [*ctk_12_0, *ctk_curr], std: 'all', cxx: [*gcc9, *gcc10, *gcc11]} # - {jobs: ['build'], project: 'cudax', ctk: [*ctk_12_0, *ctk_curr], std: 'all', cxx: [*llvm9, *llvm10, *llvm11, *llvm12, *llvm13, *llvm14]} # - {jobs: ['build'], project: 'cudax', ctk: [ *ctk_curr], std: 'all', cxx: [*llvm15]} # - {jobs: ['build'], project: 'cudax', ctk: [*ctk_12_0, ], std: 'all', cxx: [*msvc2022_1436]} # - {jobs: ['build'], project: 'cudax', ctk: [ *ctk_curr], std: 'all', cxx: [*msvc2022]} # - {jobs: ['build'], project: 'cudax', ctk: [*ctk_12_0 ], std: 17, cxx: [*gcc12], sm: "90"} # - {jobs: ['build'], project: 'cudax', ctk: [ *ctk_curr], std: 17, cxx: [*gcc12], sm: "90a"} # - {jobs: ['build'], project: 'cudax', ctk: [ *ctk_curr], std: 'all', cxx: [*gcc12, *llvm16], cpu: 'arm64'} # - {jobs: ['build'], project: 'cudax', ctk: [ *ctk_curr], std: 17, cxx: [*oneapi]} # - {jobs: ['test'], project: 'cudax', ctk: [*ctk_12_0, *ctk_curr], std: 'all', cxx: [*gcc12]} # - {jobs: ['test'], project: 'cudax', ctk: [*ctk_12_0 ], std: 'all', cxx: [*llvm14]} # - {jobs: ['test'], project: 'cudax', ctk: [ *ctk_curr], std: 'all', cxx: [*llvm16]} # cccl-infra: - {jobs: ['infra'], project: 'cccl', ctk: *ctk_11_1, cxx: [*gcc-oldest, *llvm-oldest]} - {jobs: ['infra'], project: 'cccl', ctk: *ctk_curr, cxx: [*gcc-newest, *llvm-newest]} nightly: # libcudacxx build fails, CUB tests fail: - {jobs: ['build'], ctk: *ctk_11_1, gpu: 'v100', sm: 'gpu', cxx: *gcc6, std: [11], project: ['cub']} - {jobs: ['test'], ctk: *ctk_11_1, gpu: 'v100', sm: 'gpu', cxx: *gcc6, std: [11], project: ['thrust']} # - {jobs: ['test'], ctk: *ctk_11_1, gpu: 'v100', sm: 'gpu', cxx: *gcc6, std: [11] } # libcudacxx build fails, CUB tests fail: - {jobs: ['build'], ctk: *ctk_11_1, gpu: 't4', sm: 'gpu', cxx: *llvm9, std: [17], project: ['cub']} - {jobs: ['test'], ctk: *ctk_11_1, gpu: 't4', sm: 'gpu', cxx: *llvm9, std: [17], project: ['thrust']} # - {jobs: ['test'], ctk: *ctk_11_1, gpu: 't4', sm: 'gpu', cxx: *llvm9, std: [17] } # CUB + libcudacxx tests fails: - {jobs: ['build'], ctk: *ctk_11_8, gpu: 'rtx2080', sm: 'gpu', cxx: *gcc11, std: [17], project: ['libcudacxx', 'cub']} - {jobs: ['test'], ctk: *ctk_11_8, gpu: 'rtx2080', sm: 'gpu', cxx: *gcc11, std: [17], project: ['thrust']} # - {jobs: ['test'], ctk: *ctk_11_8, gpu: 'rtx2080', sm: 'gpu', cxx: *gcc11, std: [17] } # libcudacxx tests fail: - {jobs: ['build'], ctk: *ctk_curr, gpu: 'rtxa6000', 
sm: 'gpu', cxx: *gcc7, std: [14], project: ['libcudacxx']} - {jobs: ['build'], ctk: *ctk_curr, gpu: 'l4', sm: 'gpu', cxx: *gcc12, std: 'all', project: ['libcudacxx']} - {jobs: ['build'], ctk: *ctk_curr, gpu: 'rtx4090', sm: 'gpu', cxx: *llvm9, std: [11], project: ['libcudacxx']} - {jobs: ['build'], ctk: *ctk_curr, gpu: 'h100', sm: 'gpu', cxx: *gcc12, std: [11, 20], project: ['libcudacxx']} - {jobs: ['build'], ctk: *ctk_curr, gpu: 'h100', sm: 'gpu', cxx: *llvm16, std: [17], project: ['libcudacxx']} - {jobs: ['test'], ctk: *ctk_curr, gpu: 'rtxa6000', sm: 'gpu', cxx: *gcc7, std: [14], project: ['cub', 'thrust']} - {jobs: ['test'], ctk: *ctk_curr, gpu: 'l4', sm: 'gpu', cxx: *gcc12, std: 'all', project: ['cub', 'thrust']} - {jobs: ['test'], ctk: *ctk_curr, gpu: 'rtx4090', sm: 'gpu', cxx: *llvm9, std: [11], project: ['cub', 'thrust']} - {jobs: ['test'], ctk: *ctk_curr, gpu: 'h100', sm: 'gpu', cxx: *gcc12, std: [11, 20], project: ['cub', 'thrust']} - {jobs: ['test'], ctk: *ctk_curr, gpu: 'h100', sm: 'gpu', cxx: *llvm16, std: [17], project: ['cub', 'thrust']} # - {jobs: ['test'], ctk: *ctk_curr, gpu: 'rtxa6000', sm: 'gpu', cxx: *gcc7, std: [14] } # - {jobs: ['test'], ctk: *ctk_curr, gpu: 'l4', sm: 'gpu', cxx: *gcc12, std: 'all' } # - {jobs: ['test'], ctk: *ctk_curr, gpu: 'rtx4090', sm: 'gpu', cxx: *llvm9, std: [11] } # - {jobs: ['test'], ctk: *ctk_curr, gpu: 'h100', sm: 'gpu', cxx: *gcc12, std: [11, 20] } # - {jobs: ['test'], ctk: *ctk_curr, gpu: 'h100', sm: 'gpu', cxx: *llvm16, std: [17] } # nvrtc: - {jobs: ['nvrtc'], ctk: *ctk_curr, gpu: 't4', sm: 'gpu', cxx: *gcc12, std: [20], project: ['libcudacxx']} - {jobs: ['nvrtc'], ctk: *ctk_curr, gpu: 'rtxa6000', sm: 'gpu', cxx: *gcc12, std: [20], project: ['libcudacxx']} - {jobs: ['nvrtc'], ctk: *ctk_curr, gpu: 'l4', sm: 'gpu', cxx: *gcc12, std: 'all', project: ['libcudacxx']} # Fails on h100: # - {jobs: ['nvrtc'], ctk: *ctk_curr, gpu: 'h100', sm: 'gpu', cxx: *gcc12, std: [11, 20], project: ['libcudacxx']} # Any generated jobs that match the entries in `exclude` will be removed from the final matrix for all workflows. exclude: # Ubuntu 18.04 is EOL and we only use it to get access to CTK 11.1 containers for CUDA testing. # Disable non-CUDA tests on this platform. - {jobs: ['test_cpu'], os: 'ubuntu18.04'} # GPU runners are not available on Windows. - {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], os: 'windows2022'} # # Resources for compute_matrix.py. These can be modified to add new jobs, etc. # # Jobs are executed by running scripts: # - Linux: 'ci/_.sh` # - Windows: `ci/windows/_.ps1` # A matrix entry must have the following tag. required_tags: - 'jobs' # A list of job types to run (e.g. 'build', 'test', 'nvrtc', 'infra', 'verify_codegen', ...) for # the specified configuration(s). # If a matrix entry omits these tags, a default value (defined later in `default_`) is used. defaulted_tags: - 'ctk' # CUDA ToolKit version. Will be exploded if a list. - 'cpu' # CPU architecture. Will be exploded if a list. - 'gpu' # GPU model. Will be exploded if a list. - 'cxx' # Host compiler {name, version, exe}. Will be exploded if a list. - 'cudacxx' # Device compiler as {name, version, exe} or 'nvcc' to use nvcc from the specified `ctk`. # Will be exploded if a list. - 'project' # Project name (e.g. libcudacxx, cub, thrust, cccl). Will be exploded if a list. - 'os' # Operating system. Will be exploded if a list. # These tags will only exist if needed: optional_tags: - 'std' # C++ standard. Passed to script with `-std `. 
Will be exploded if a list. # If set to 'all', all stds supported by the host/device compiler are used. - 'sm' # `CMAKE_CUDA_ARCHITECTURES` Passed to script with `-arch `. # Defaults to use the settings in the CMakePresets.json file. # Set to 'gpu' to only target the GPU in the `gpu` tag. # Can pass multiple architectures via "60;70-real;80-virtual" # Will be exploded if a list (e.g. `sm: ['60;70;80;90', '90a']` creates two jobs) - 'cmake_options' # Additional CMake options to pass to the build. Passed to script with `-cmake_options ""`. # Will be exploded if a list. # `default_`: Used when the tag is omitted. default_ctk: *ctk_curr default_cudacxx: 'nvcc' default_cxx: *gcc-newest default_cpu: 'amd64' default_gpu: 'v100' default_project: - 'libcudacxx' - 'cub' - 'thrust' # Special handling: lookup os from ctk/cxx info # See `matrix.yml` at https://github.com/rapidsai/devcontainers default_os_lookup: 'ctk11.1-gcc6': 'ubuntu18.04' 'ctk11.1-gcc7': 'ubuntu18.04' 'ctk11.1-gcc8': 'ubuntu18.04' 'ctk11.1-gcc9': 'ubuntu18.04' 'ctk11.1-llvm9': 'ubuntu18.04' 'ctk11.1-cl14.16': 'windows2022' 'ctk11.8-gcc11': 'ubuntu22.04' 'ctk12.0-gcc7': 'ubuntu20.04' 'ctk12.0-gcc8': 'ubuntu20.04' 'ctk12.0-gcc9': 'ubuntu20.04' 'ctk12.0-gcc10': 'ubuntu20.04' 'ctk12.0-gcc11': 'ubuntu22.04' 'ctk12.0-gcc12': 'ubuntu22.04' 'ctk12.0-llvm9': 'ubuntu20.04' 'ctk12.0-llvm10': 'ubuntu20.04' 'ctk12.0-llvm11': 'ubuntu20.04' 'ctk12.0-llvm12': 'ubuntu20.04' 'ctk12.0-llvm13': 'ubuntu20.04' 'ctk12.0-llvm14': 'ubuntu20.04' 'ctk12.0-llvm15': 'ubuntu22.04' 'ctk12.0-llvm16': 'ubuntu22.04' 'ctk12.0-cl14.29': 'windows2022' 'ctk12.0-cl14.36': 'windows2022' 'ctk12.0-cl14.39': 'windows2022' 'ctk12.0-oneapi2023.2.0': 'ubuntu22.04' 'ctk12.4-gcc7': 'ubuntu20.04' 'ctk12.4-gcc8': 'ubuntu20.04' 'ctk12.4-gcc9': 'ubuntu20.04' 'ctk12.4-gcc10': 'ubuntu20.04' 'ctk12.4-gcc11': 'ubuntu22.04' 'ctk12.4-gcc12': 'ubuntu22.04' 'ctk12.4-gcc13': 'ubuntu22.04' 'ctk12.4-llvm9': 'ubuntu20.04' 'ctk12.4-llvm10': 'ubuntu20.04' 'ctk12.4-llvm11': 'ubuntu20.04' 'ctk12.4-llvm12': 'ubuntu20.04' 'ctk12.4-llvm13': 'ubuntu20.04' 'ctk12.4-llvm14': 'ubuntu20.04' 'ctk12.4-llvm15': 'ubuntu22.04' 'ctk12.4-llvm16': 'ubuntu22.04' 'ctk12.4-llvm17': 'ubuntu22.04' 'ctk12.4-cl14.29': 'windows2022' 'ctk12.4-cl14.36': 'windows2022' 'ctk12.4-cl14.39': 'windows2022' 'ctk12.4-oneapi2023.2.0': 'ubuntu22.04' # Lookup supported C++ standards for a given compiler when `std: 'all'`. all_stds: [11, 14, 17, 20] lookup_cxx_supported_stds: 'gcc6': [11, 14 ] 'gcc7': [11, 14, 17 ] 'gcc8': [11, 14, 17 ] 'gcc9': [11, 14, 17 ] 'gcc10': [11, 14, 17, 20] 'gcc11': [11, 14, 17, 20] 'gcc12': [11, 14, 17, 20] 'gcc13': [11, 14, 17, 20] 'llvm9': [11, 14, 17 ] 'llvm10': [11, 14, 17 ] 'llvm11': [11, 14, 17, 20] 'llvm12': [11, 14, 17, 20] 'llvm13': [11, 14, 17, 20] 'llvm14': [11, 14, 17, 20] 'llvm15': [11, 14, 17, 20] 'llvm16': [11, 14, 17, 20] 'llvm17': [11, 14, 17, 20] 'cl14.16': [ 14 ] 'cl14.29': [ 14, 17 ] 'cl14.36': [ 14, 17, 20] 'cl14.39': [ 14, 17, 20] 'oneapi2023.2.0': [11, 14, 17 ] lookup_cudacxx_supported_stds: 'nvcc11.1': [11, 14, 17 ] 'nvcc11.8': [11, 14, 17 ] 'nvcc12.0': [11, 14, 17, 20] 'nvcc12.4': [11, 14, 17, 20] 'llvm16': [11, 14, 17, 20] lookup_project_supported_stds: 'cccl': [11, 14, 17, 20] 'libcudacxx': [11, 14, 17, 20] 'cub': [11, 14, 17, 20] 'thrust': [11, 14, 17, 20] 'cudax': [ 17, 20] # Tags that aren't exploded: non_exploded_tags: - 'jobs' # Keeping jobs as a list allows for dependency handling of build->test steps. 
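# The entry below is a purely illustrative, hypothetical example (it is not used by any workflow);
# it only shows how the tags documented above combine in practice. Because list-valued tags are
# exploded, this single entry would expand into one job per (cxx, std) combination, and the 'test'
# job implies a prerequisite 'build' job (see build_required_jobs below):
#   - {jobs: ['test'], project: ['cub'], ctk: *ctk_curr, cxx: [*gcc12, *llvm16], std: [17, 20], sm: '70;80-virtual'}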
# Jobs that have an implied prerequisite 'build' job: build_required_jobs: - 'test' - 'test_gpu' - 'test_cpu' - 'test_nolid' - 'test_lid0' - 'test_lid1' - 'test_lid2' # Jobs that require a GPU gpu_required_jobs: - 'test' - 'test_gpu' - 'test_nolid' - 'test_lid0' - 'test_lid1' - 'test_lid2' - 'nvrtc' - 'infra' # cccl infra's example project test launches a kernel # When --skip-tests is given to compute-matrix.py, these jobs are ignored. skip_test_jobs: - 'test' - 'test_cpu' - 'test_gpu' - 'test_nolid' - 'test_lid0' - 'test_lid1' - 'test_lid2' - 'nvrtc' - 'infra' # Map the job type to the script invocation spec: # The script is invoked as `ci/_.sh `. # 'prefix' is required. 'args' is optional. # If a job is not specified explicitly, the default is { 'prefix': '' }. job_invoke: 'test_cpu' : { 'prefix': 'test', 'args': '-cpu-only' } 'test_gpu' : { 'prefix': 'test', 'args': '-gpu-only' } 'test_nolid' : { 'prefix': 'test', 'args': '-no-lid' } 'test_lid0' : { 'prefix': 'test', 'args': '-lid0' } 'test_lid1' : { 'prefix': 'test', 'args': '-lid1' } 'test_lid2' : { 'prefix': 'test', 'args': '-lid2' } # When a listed project has a `test` job, it will be replaced with the specified list of finer-grain jobs. project_expanded_tests: 'thrust' : ['test_gpu', 'test_cpu'] 'cub' : ['test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'] # Human readable name for jobs. Default behavior is to capitalize the first letter. formatted_jobs: 'nvrtc': 'NVRTC' 'verify_codegen': 'VerifyCodegen' 'test_cpu': 'TestCPU' 'test_gpu': 'TestGPU' 'test_nolid': 'TestGPU' 'test_lid0': 'HostLaunch' 'test_lid1': 'DeviceLaunch' 'test_lid2': 'GraphCapture' # Human readable name for projects. Default behavior uses the project name as-is. formatted_project_names: 'libcudacxx': 'libcu++' 'cub': 'CUB' 'thrust': 'Thrust' 'cccl': 'CCCL' # Human readable name for compilers. Default behavior uses the "compiler.name" tag as-is. formatted_cxx_names: 'llvm': 'clang' 'oneapi': 'Intel' 'cl': 'MSVC' # All known GPUs gpus: - 'v100' # 40 runners - 't4' # 8 runners - 'rtx2080' # 8 runners - 'rtxa6000' # 12 runners - 'l4' # 48 runners - 'rtx4090' # 10 runners - 'h100' # 16 runners # SM versions of GPUs gpu_sm: 'v100': '70' 't4': '75' 'rtx2080': '75' 'rtxa6000': '86' 'l4': '89' 'rtx4090': '89' 'h100': '90' # Memory size of GPUs gpu_mem_gb: 'v100': '32' 't4': '16' 'rtx2080': '8' 'rtxa6000': '48' 'l4': '24' 'rtx4090': '24' 'h100': '80' # GPUs that require `-testing` at the end of the runner pool name. testing_pool_gpus: - 't4' - 'rtx2080' - 'rtxa6000' - 'l4' - 'rtx4090' cccl-2.5.0/ci/ninja_summary.py000077500000000000000000000400511463375617100162630ustar00rootroot00000000000000#!/usr/bin/env python3 # Copyright (c) 2018 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. r"""Summarize the last ninja build, invoked with ninja's -C syntax. This script is designed to be automatically run after each ninja build in order to summarize the build's performance. Making build performance information more visible should make it easier to notice anomalies and opportunities. To use this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat. 
On Linux you can get autoninja to invoke this script using this syntax: $ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome You can also call this script directly using ninja's syntax to specify the output directory of interest: > python3 post_build_ninja_summary.py -C out/Default Typical output looks like this: >ninja -C out\debug_component base ninja.exe -C out\debug_component base -j 960 -l 48 -d keeprsp ninja: Entering directory `out\debug_component' [1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files Longest build steps: 0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time) 0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time) 0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time) 1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time) Time by build-step type: 0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum) 0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum) 0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum) 1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed time sum) 23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum) 26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism) 839 build steps completed, average of 32.17/s If no gn clean has been done then results will be for the last non-NULL invocation of ninja. Ideas for future statistics, and implementations are appreciated. The "weighted" time is the elapsed time of each build step divided by the number of tasks that were running in parallel. This makes it an excellent approximation of how "important" a slow step was. A link that is entirely or mostly serialized will have a weighted time that is the same or similar to its elapsed time. A compile that runs in parallel with 999 other compiles will have a weighted time that is tiny.""" import argparse import errno import fnmatch import os import subprocess import sys # The number of long build times to report: long_count = 10 # The number of long times by extension to report long_ext_count = 10 class Target: """Represents a single line read for a .ninja_log file.""" def __init__(self, start, end): """Creates a target object by passing in the start/end times in seconds as a float.""" self.start = start self.end = end # A list of targets, appended to by the owner of this object. self.targets = [] self.weighted_duration = 0.0 def Duration(self): """Returns the task duration in seconds as a float.""" return self.end - self.start def SetWeightedDuration(self, weighted_duration): """Sets the duration, in seconds, passed in as a float.""" self.weighted_duration = weighted_duration def WeightedDuration(self): """Returns the task's weighted duration in seconds as a float. Weighted_duration takes the elapsed time of the task and divides it by how many other tasks were running at the same time. Thus, it represents the approximate impact of this task on the total build time, with serialized or serializing steps typically ending up with much longer weighted durations. weighted_duration should always be the same or shorter than duration. """ # Allow for modest floating-point errors epsilon = 0.000002 if (self.weighted_duration > self.Duration() + epsilon): print('%s > %s?' 
% (self.weighted_duration, self.Duration())) assert (self.weighted_duration <= self.Duration() + epsilon) return self.weighted_duration def DescribeTargets(self): """Returns a printable string that summarizes the targets.""" # Some build steps generate dozens of outputs - handle them sanely. # The max_length was chosen so that it can fit most of the long # single-target names, while minimizing word wrapping. result = ', '.join(self.targets) max_length = 65 if len(result) > max_length: result = result[:max_length] + '...' return result # Copied with some modifications from ninjatracing def ReadTargets(log, show_all): """Reads all targets from .ninja_log file |log_file|, sorted by duration. The result is a list of Target objects.""" header = log.readline() # Handle empty ninja_log gracefully by silently returning an empty list of # targets. if not header: return [] assert header == '# ninja log v5\n', \ 'unrecognized ninja log version %r' % header targets_dict = {} last_end_seen = 0.0 for line in log: parts = line.strip().split('\t') if len(parts) != 5: # If ninja.exe is rudely halted then the .ninja_log file may be # corrupt. Silently continue. continue start, end, _, name, cmdhash = parts # Ignore restat. # Convert from integral milliseconds to float seconds. start = int(start) / 1000.0 end = int(end) / 1000.0 if not show_all and end < last_end_seen: # An earlier time stamp means that this step is the first in a new # build, possibly an incremental build. Throw away the previous # data so that this new build will be displayed independently. # This has to be done by comparing end times because records are # written to the .ninja_log file when commands complete, so end # times are guaranteed to be in order, but start times are not. targets_dict = {} target = None if cmdhash in targets_dict: target = targets_dict[cmdhash] if not show_all and (target.start != start or target.end != end): # If several builds in a row just run one or two build steps # then the end times may not go backwards so the last build may # not be detected as such. However in many cases there will be a # build step repeated in the two builds and the changed # start/stop points for that command, identified by the hash, # can be used to detect and reset the target dictionary. targets_dict = {} target = None if not target: targets_dict[cmdhash] = target = Target(start, end) last_end_seen = end target.targets.append(name) return list(targets_dict.values()) def GetExtension(target, extra_patterns): """Return the file extension that best represents a target. For targets that generate multiple outputs it is important to return a consistent 'canonical' extension. Ultimately the goal is to group build steps by type.""" for output in target.targets: if extra_patterns: for fn_pattern in extra_patterns.split(';'): if fnmatch.fnmatch(output, '*' + fn_pattern + '*'): return fn_pattern # Not a true extension, but a good grouping. if output.endswith('type_mappings'): extension = 'type_mappings' break # Capture two extensions if present. For example: file.javac.jar should # be distinguished from file.interface.jar. root, ext1 = os.path.splitext(output) _, ext2 = os.path.splitext(root) extension = ext2 + ext1 # Preserve the order in the file name. 
if len(extension) == 0: extension = '(no extension found)' if ext1 in ['.pdb', '.dll', '.exe']: extension = 'PEFile (linking)' # Make sure that .dll and .exe are grouped together and that the # .dll.lib files don't cause these to be listed as libraries break if ext1 in ['.so', '.TOC']: extension = '.so (linking)' # Attempt to identify linking, avoid identifying as '.TOC' break # Make sure .obj files don't get categorized as mojo files if ext1 in ['.obj', '.o']: break # Jars are the canonical output of java targets. if ext1 == '.jar': break # Normalize all mojo related outputs to 'mojo'. if output.count('.mojom') > 0: extension = 'mojo' break return extension def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting): """Print a summary of the passed in list of Target objects.""" # Create a list that is in order by time stamp and has entries for the # beginning and ending of each build step (one time stamp may have multiple # entries due to multiple steps starting/stopping at exactly the same time). # Iterate through this list, keeping track of which tasks are running at all # times. At each time step calculate a running total for weighted time so # that when each task ends its own weighted time can easily be calculated. task_start_stop_times = [] earliest = -1 latest = 0 total_cpu_time = 0 for target in entries: if earliest < 0 or target.start < earliest: earliest = target.start if target.end > latest: latest = target.end total_cpu_time += target.Duration() task_start_stop_times.append((target.start, 'start', target)) task_start_stop_times.append((target.end, 'stop', target)) length = latest - earliest weighted_total = 0.0 # Sort by the time/type records and ignore |target| task_start_stop_times.sort(key=lambda times: times[:2]) # Now we have all task start/stop times sorted by when they happen. If a # task starts and stops on the same time stamp then the start will come # first because of the alphabet, which is important for making this work # correctly. # Track the tasks which are currently running. running_tasks = {} # Record the time we have processed up to so we know how to calculate time # deltas. last_time = task_start_stop_times[0][0] # Track the accumulated weighted time so that it can efficiently be added # to individual tasks. last_weighted_time = 0.0 # Scan all start/stop events. for event in task_start_stop_times: time, action_name, target = event # Accumulate weighted time up to now. num_running = len(running_tasks) if num_running > 0: # Update the total weighted time up to this moment. last_weighted_time += (time - last_time) / float(num_running) if action_name == 'start': # Record the total weighted task time when this task starts. running_tasks[target] = last_weighted_time if action_name == 'stop': # Record the change in the total weighted task time while this task # ran. weighted_duration = last_weighted_time - running_tasks[target] target.SetWeightedDuration(weighted_duration) weighted_total += weighted_duration del running_tasks[target] last_time = time assert (len(running_tasks) == 0) # Warn if the sum of weighted times is off by more than half a second. if abs(length - weighted_total) > 500: print('Warning: Possible corrupt ninja log, results may be ' 'untrustworthy. 
Length = %.3f, weighted total = %.3f' % (length, weighted_total)) # Print the slowest build steps: print(' Longest build steps:') if elapsed_time_sorting: entries.sort(key=lambda x: x.Duration()) else: entries.sort(key=lambda x: x.WeightedDuration()) for target in entries[-long_count:]: print(' %8.1f weighted s to build %s (%.1f s elapsed time)' % (target.WeightedDuration(), target.DescribeTargets(), target.Duration())) # Sum up the time by file extension/type of the output file count_by_ext = {} time_by_ext = {} weighted_time_by_ext = {} # Scan through all of the targets to build up per-extension statistics. for target in entries: extension = GetExtension(target, extra_step_types) time_by_ext[extension] = time_by_ext.get(extension, 0) + target.Duration() weighted_time_by_ext[extension] = weighted_time_by_ext.get( extension, 0) + target.WeightedDuration() count_by_ext[extension] = count_by_ext.get(extension, 0) + 1 print(' Time by build-step type:') # Copy to a list with extension name and total time swapped, to (time, ext) if elapsed_time_sorting: weighted_time_by_ext_sorted = sorted( (y, x) for (x, y) in time_by_ext.items()) else: weighted_time_by_ext_sorted = sorted( (y, x) for (x, y) in weighted_time_by_ext.items()) # Print the slowest build target types: for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]: print( ' %8.1f s weighted time to generate %d %s files ' '(%1.1f s elapsed time sum)' % (time, count_by_ext[extension], extension, time_by_ext[extension])) print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx ' 'parallelism)' % (length, total_cpu_time, total_cpu_time * 1.0 / length)) print(' %d build steps completed, average of %1.2f/s' % (len(entries), len(entries) / (length))) def main(): log_file = '.ninja_log' metrics_file = 'siso_metrics.json' parser = argparse.ArgumentParser() parser.add_argument('-C', dest='build_directory', help='Build directory.') parser.add_argument( '-s', '--step-types', help='semicolon separated fnmatch patterns for build-step grouping') parser.add_argument( '-e', '--elapsed_time_sorting', default=False, action='store_true', help='Sort output by elapsed time instead of weighted time') parser.add_argument('--log-file', help="specific ninja log file to analyze.") args, _extra_args = parser.parse_known_args() if args.build_directory: log_file = os.path.join(args.build_directory, log_file) metrics_file = os.path.join(args.build_directory, metrics_file) if args.log_file: log_file = args.log_file if not args.step_types: # Offer a convenient way to add extra step types automatically, # including when this script is run by autoninja. get() returns None if # the variable isn't set. args.step_types = os.environ.get('chromium_step_types') if args.step_types: # Make room for the extra build types. global long_ext_count long_ext_count += len(args.step_types.split(';')) if os.path.exists(metrics_file): # Automatically handle summarizing siso builds. cmd = ['siso.bat' if 'win32' in sys.platform else 'siso'] cmd.extend(['metrics', 'summary']) if args.build_directory: cmd.extend(['-C', args.build_directory]) if args.step_types: cmd.extend(['--step_types', args.step_types]) if args.elapsed_time_sorting: cmd.append('--elapsed_time_sorting') subprocess.run(cmd) else: try: with open(log_file, 'r') as log: entries = ReadTargets(log, False) if entries: SummarizeEntries(entries, args.step_types, args.elapsed_time_sorting) except IOError: print('Log file %r not found, no build summary created.' 
% log_file) return errno.ENOENT if __name__ == '__main__': sys.exit(main()) cccl-2.5.0/ci/nvrtc_libcudacxx.sh000077500000000000000000000005151463375617100167340ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details PRESET="libcudacxx-nvrtc-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" configure_and_build_preset "libcudacxx NVRTC" "$PRESET" "$CMAKE_OPTIONS" source "./sccache_stats.sh" "start" test_preset "libcudacxx NVRTC" "${PRESET}" source "./sccache_stats.sh" "end" cccl-2.5.0/ci/pretty_printing.sh000066400000000000000000000065361463375617100166410ustar00rootroot00000000000000# Print "ARG=${ARG}" for all args. function print_var_values() { # Iterate through the arguments for var_name in "$@"; do if [ -z "$var_name" ]; then echo "Usage: print_var_values ..." return 1 fi # Dereference the variable and print the result echo "$var_name=${!var_name:-(undefined)}" done } # begin_group: Start a named section of log output, possibly with color. # Usage: begin_group "Group Name" [Color] # Group Name: A string specifying the name of the group. # Color (optional): ANSI color code to set text color. Default is blue (1;34). function begin_group() { # See options for colors here: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124 local blue="34" local name="${1:-}" local color="${2:-$blue}" if [ -n "${GITHUB_ACTIONS:-}" ]; then echo -e "::group::\e[${color}m${name}\e[0m" else echo -e "\e[${color}m================== ${name} ======================\e[0m" fi } # end_group: End a named section of log output and print status based on exit status. # Usage: end_group "Group Name" [Exit Status] # Group Name: A string specifying the name of the group. # Exit Status (optional): The exit status of the command run within the group. Default is 0. function end_group() { local name="${1:-}" local build_status="${2:-0}" local duration="${3:-}" local red="31" local blue="34" if [ -n "${GITHUB_ACTIONS:-}" ]; then echo "::endgroup::" if [ "$build_status" -ne 0 ]; then echo -e "::error::\e[${red}m ${name} - Failed (â¬†ï¸ click above for full log ⬆ï¸)\e[0m" fi else if [ "$build_status" -ne 0 ]; then echo -e "\e[${red}m================== End ${name} - Failed${duration:+ - Duration: ${duration}s} ==================\e[0m" else echo -e "\e[${blue}m================== End ${name} - Success${duration:+ - Duration: ${duration}s} ==================\n\e[0m" fi fi } declare -A command_durations # Runs a command within a named group, handles the exit status, and prints appropriate messages based on the result. # Usage: run_command "Group Name" command [arguments...] function run_command() { local group_name="${1:-}" shift local command=("$@") local status begin_group "$group_name" echo "Running command: ${command[*]}" set +e local start_time=$(date +%s) "${command[@]}" status=$? 
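  # Note: the surrounding `set +e` / `set -e` pair exists because this helper is sourced from
  # scripts running with `set -eo pipefail`; temporarily relaxing -e lets the wrapped command
  # fail while its exit status (captured above) and duration (computed below) are still
  # recorded and forwarded to end_group.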
local end_time=$(date +%s) set -e local duration=$((end_time - start_time)) end_group "$group_name" $status $duration command_durations["$group_name"]=$duration return $status } function string_width() { local str="$1" echo "$str" | awk '{print length}' } function print_time_summary() { local max_length=0 local group # Find the longest group name for formatting for group in "${!command_durations[@]}"; do local group_length=$(echo "$group" | awk '{print length}') if [ "$group_length" -gt "$max_length" ]; then max_length=$group_length fi done echo "Time Summary:" for group in "${!command_durations[@]}"; do printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}" done # Clear the array of timing info declare -gA command_durations=() } cccl-2.5.0/ci/sccache_hit_rate.sh000077500000000000000000000024411463375617100166420ustar00rootroot00000000000000#!/bin/bash set -euo pipefail # Ensure two arguments are provided if [ $# -ne 2 ]; then echo "Usage: $0 " >&2 exit 1 fi # Print the contents of the before file echo "=== Contents of $1 ===" >&2 cat $1 >&2 echo "=== End of $1 ===" >&2 # Print the contents of the after file echo "=== Contents of $2 ===" >&2 cat $2 >&2 echo "=== End of $2 ===" >&2 # Extract compile requests and cache hits from the before and after files requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1") hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1") requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2") hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2") # Calculate the differences to find out how many new requests and hits requests_diff=$((requests_after - requests_before)) hits_diff=$((hits_after - hits_before)) echo "New Compile Requests: $requests_diff" >&2 echo "New Hits: $hits_diff" >&2 # Calculate and print the hit rate if [ $requests_diff -eq 0 ]; then echo "No new compile requests, hit rate is not applicable" else hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}') echo "sccache hit rate: $hit_rate%" >&2 echo "$hit_rate" fi cccl-2.5.0/ci/sccache_stats.sh000077500000000000000000000033611463375617100162030ustar00rootroot00000000000000#!/bin/bash # This script prints the sccache hit rate between two calls to sccache --show-stats. # It should be sourced in your script before and after the operations you want to profile, # with the 'start' or 'end' argument respectively. mode=$1 if [[ "$mode" != "start" && "$mode" != "end" ]]; then echo "Invalid mode: $mode" echo "Usage: $0 {start|end}" exit 1 fi # Check if sccache is available if ! command -v sccache &> /dev/null; then echo "Notice: sccache is not available. Skipping..." exit 0 fi case $mode in start) export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') export SCCACHE_START_MISSES=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') ;; end) if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then echo "Error: start stats not collected. Did you call this script with 'start' before your operations?" 
exit 1 fi final_hits=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') final_misses=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') hits=$((final_hits - SCCACHE_START_HITS)) misses=$((final_misses - SCCACHE_START_MISSES)) total=$((hits + misses)) prefix="" if [ ${GITHUB_ACTIONS:-false} = "true" ]; then prefix="::notice::" fi if (( total > 0 )); then hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }') echo ${prefix}"sccache hits: $hits | misses: $misses | hit rate: $hit_rate%" else echo ${prefix}"sccache stats: N/A No new compilation requests" fi unset SCCACHE_START_HITS unset SCCACHE_START_MISSES ;; esac cccl-2.5.0/ci/test_cub.sh000077500000000000000000000017011463375617100152000ustar00rootroot00000000000000#!/bin/bash set -euo pipefail NO_LID=false LID0=false LID1=false LID2=false ci_dir=$(dirname "$0") new_args=$("${ci_dir}/util/extract_switches.sh" -no-lid -lid0 -lid1 -lid2 -- "$@") eval set -- ${new_args} while true; do case "$1" in -no-lid) NO_LID=true shift ;; -lid0) LID0=true shift ;; -lid1) LID1=true shift ;; -lid2) LID2=true shift ;; --) shift break ;; *) echo "Unknown argument: $1" exit 1 ;; esac done source "${ci_dir}/build_common.sh" print_environment_details ./build_cub.sh "$@" if $NO_LID; then PRESETS=("cub-nolid-cpp$CXX_STANDARD") elif $LID0; then PRESETS=("cub-lid0-cpp$CXX_STANDARD") elif $LID1; then PRESETS=("cub-lid1-cpp$CXX_STANDARD") elif $LID2; then PRESETS=("cub-lid2-cpp$CXX_STANDARD") else PRESETS=("cub-cpp$CXX_STANDARD") fi for PRESET in ${PRESETS[@]}; do test_preset "CUB (${PRESET})" ${PRESET} done print_time_summary cccl-2.5.0/ci/test_cudax.sh000077500000000000000000000003101463375617100155260ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details ./build_cudax.sh "$@" PRESET="cudax-cpp$CXX_STANDARD" test_preset "CUDA Experimental" ${PRESET} print_time_summary cccl-2.5.0/ci/test_libcudacxx.sh000077500000000000000000000012101463375617100165500ustar00rootroot00000000000000#!/bin/bash source "$(dirname "$0")/build_common.sh" print_environment_details PRESET="libcudacxx-cpp${CXX_STANDARD}" CMAKE_OPTIONS="" configure_preset libcudacxx "$PRESET" "$CMAKE_OPTIONS" # The libcudacxx tests are split into two presets, one for # regular ctest tests and another that invokes the lit tests # harness with extra options for verbosity, etc: CTEST_PRESET="libcudacxx-ctest-cpp${CXX_STANDARD}" LIT_PRESET="libcudacxx-lit-cpp${CXX_STANDARD}" test_preset "libcudacxx (CTest)" ${CTEST_PRESET} source "./sccache_stats.sh" "start" test_preset "libcudacxx (lit)" ${LIT_PRESET} source "./sccache_stats.sh" "end" print_time_summary cccl-2.5.0/ci/test_thrust.sh000077500000000000000000000015241463375617100157630ustar00rootroot00000000000000#!/bin/bash set -euo pipefail CPU_ONLY=false GPU_ONLY=false ci_dir=$(dirname "$0") new_args=$("${ci_dir}/util/extract_switches.sh" -cpu-only -gpu-only -- "$@") eval set -- ${new_args} while true; do case "$1" in -cpu-only) CPU_ONLY=true shift ;; -gpu-only) GPU_ONLY=true shift ;; --) shift break ;; *) echo "Unknown argument: $1" exit 1 ;; esac done source "${ci_dir}/build_common.sh" print_environment_details ./build_thrust.sh "$@" if $CPU_ONLY; then PRESETS=("thrust-cpu-cpp$CXX_STANDARD") GPU_REQUIRED=false elif $GPU_ONLY; then PRESETS=("thrust-gpu-cpp$CXX_STANDARD") GPU_REQUIRED=true else PRESETS=("thrust-cpp$CXX_STANDARD") GPU_REQUIRED=true fi for PRESET in ${PRESETS[@]}; do test_preset "Thrust (${PRESET})" 
${PRESET} ${GPU_REQUIRED} done print_time_summary cccl-2.5.0/ci/update_version.sh000077500000000000000000000113361463375617100164240ustar00rootroot00000000000000#!/bin/bash # Usage: ./update_version.sh [--dry-run] # Example: ./update_version.sh --dry-run 2 2 1 # Run in root cccl/ cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit DRY_RUN=false while [[ "$#" -gt 0 ]]; do case "$1" in --dry-run) DRY_RUN=true; ;; *) break ;; esac shift done major="$1" minor="$2" patch="$3" if [ -z "$major" ] || [ -z "$minor" ] || [ -z "$patch" ]; then echo "Usage: $0 [--dry-run] " exit 1 fi # Version file paths CCCL_VERSION_FILE="libcudacxx/include/cuda/std/__cccl/version.h" THRUST_VERSION_FILE="thrust/thrust/version.h" CUB_VERSION_FILE="cub/cub/version.cuh" CCCL_CMAKE_VERSION_FILE="lib/cmake/cccl/cccl-config-version.cmake" CUB_CMAKE_VERSION_FILE="cub/cub/cmake/cub-config-version.cmake" LIBCUDACXX_CMAKE_VERSION_FILE="libcudacxx/lib/cmake/libcudacxx/libcudacxx-config-version.cmake" THRUST_CMAKE_VERSION_FILE="thrust/thrust/cmake/thrust-config-version.cmake" CUDAX_CMAKE_VERSION_FILE="cudax/lib/cmake/cudax/cudax-config-version.cmake" # Calculated version codes new_cccl_version=$((major * 1000000 + minor * 1000 + patch)) # MMMmmmppp new_thrust_cub_version=$((major * 100000 + minor * 100 + patch)) # MMMmmmpp # Fetch current version from file current_cccl_version=$(grep -oP "define CCCL_VERSION \K[0-9]+" "$CCCL_VERSION_FILE") # Fetch the latest tag from git and strip the 'v' prefix if present latest_tag=$(git tag --sort=-v:refname | head -n 1 | sed 's/^v//') # Since the tags and versions are numerically comparable, we cast them to integers latest_tag_version=$(echo "$latest_tag" | awk -F. '{ printf("%d%03d%03d", $1,$2,$3) }') echo "Running in $(pwd)" echo "New MMMmmmppp version: $new_cccl_version" echo "New MMMmmmpp version: $new_thrust_cub_version" echo "Current CCCL version: $current_cccl_version" echo "Latest git tag: $latest_tag" # Check if new version is less than or equal to current or the latest tag if (( new_cccl_version < current_cccl_version )) || (( new_cccl_version < latest_tag_version )); then echo "Error: New version $new_cccl_version is less than current version $current_cccl_version or latest git tag version $latest_tag_version." 
exit 1 fi update_file () { local file=$1 local pattern=$2 local new_value=$3 if [ "$DRY_RUN" = true ]; then local temp_file=$(mktemp) sed "s/$pattern/$new_value/g" "$file" > "$temp_file" diff --color=auto -U 0 "$file" "$temp_file" || true rm "$temp_file" else sed -i "s/$pattern/$new_value/" "$file" fi } # Update version information in files update_file "$CCCL_VERSION_FILE" "^#define CCCL_VERSION \([0-9]\+\)" "#define CCCL_VERSION $new_cccl_version" update_file "$THRUST_VERSION_FILE" "^#define THRUST_VERSION \([0-9]\+\)" "#define THRUST_VERSION $new_thrust_cub_version" update_file "$CUB_VERSION_FILE" "^#define CUB_VERSION \([0-9]\+\)" "#define CUB_VERSION $new_thrust_cub_version" update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_MAJOR \([0-9]\+\))" "set(CUB_VERSION_MAJOR $major)" update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_MINOR \([0-9]\+\))" "set(CUB_VERSION_MINOR $minor)" update_file "$CUB_CMAKE_VERSION_FILE" "set(CUB_VERSION_PATCH \([0-9]\+\))" "set(CUB_VERSION_PATCH $patch)" update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_MAJOR \([0-9]\+\))" "set(libcudacxx_VERSION_MAJOR $major)" update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_MINOR \([0-9]\+\))" "set(libcudacxx_VERSION_MINOR $minor)" update_file "$LIBCUDACXX_CMAKE_VERSION_FILE" "set(libcudacxx_VERSION_PATCH \([0-9]\+\))" "set(libcudacxx_VERSION_PATCH $patch)" update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_MAJOR \([0-9]\+\))" "set(THRUST_VERSION_MAJOR $major)" update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_MINOR \([0-9]\+\))" "set(THRUST_VERSION_MINOR $minor)" update_file "$THRUST_CMAKE_VERSION_FILE" "set(THRUST_VERSION_PATCH \([0-9]\+\))" "set(THRUST_VERSION_PATCH $patch)" update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_MAJOR \([0-9]\+\))" "set(CCCL_VERSION_MAJOR $major)" update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_MINOR \([0-9]\+\))" "set(CCCL_VERSION_MINOR $minor)" update_file "$CCCL_CMAKE_VERSION_FILE" "set(CCCL_VERSION_PATCH \([0-9]\+\))" "set(CCCL_VERSION_PATCH $patch)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" "set(cudax_VERSION_MAJOR $major)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)" update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)" if [ "$DRY_RUN" = true ]; then echo "Dry run completed. No changes made." else echo "Version updated to $major.$minor.$patch" fi cccl-2.5.0/ci/util/000077500000000000000000000000001463375617100140075ustar00rootroot00000000000000cccl-2.5.0/ci/util/extract_switches.sh000077500000000000000000000027511463375617100177360ustar00rootroot00000000000000#!/bin/bash # Similar to getopt, but only extracts recognized switches and leaves all other arguments in place. # # Example Usage: # new_args=$(extract_switches.sh -cpu-only -gpu-only -- "$@") # eval set -- ${new_args} # while true; do # case "$1" in # -cpu-only) CPU_ONLY=true; shift;; # -gpu-only) GPU_ONLY=true; shift;; # --) shift; break;; # *) echo "Unknown argument: $1"; exit 1;; # esac # done # # This leaves all unrecognized arguments in $@ for later parsing. # Parse switches switches=() for arg in "$@"; do case "$arg" in --help | -h) cat <<-EOF | cut -c 5- Usage: extract_switches.sh [ ...] -- Sorts any recognized switches in argv to the front and returns the result. Unrecognized switches are left in place. 
Example Usage: new_args=\$(extract_switches.sh -cpu-only -gpu-only eval set -- \${new_args} while true; do case "\$1" in -cpu-only) CPU_ONLY=true; shift;; -gpu-only) GPU_ONLY=true; shift;; --) shift; break;; *) echo "Unknown argument: \$1"; exit 1;; esac done EOF exit ;; --) shift break ;; *) switches+=("$arg") shift ;; esac done found_switches=() other_args=() for arg in "$@"; do for switch in "${switches[@]}"; do if [ "$arg" = "$switch" ]; then found_switches+=("$arg") continue 2 fi done other_args+=("$arg") done echo "${found_switches[@]} -- ${other_args[@]}" cccl-2.5.0/ci/verify_codegen_libcudacxx.sh000077500000000000000000000012711463375617100205700ustar00rootroot00000000000000#!/bin/bash set -eo pipefail # Ensure the script is being executed in its containing directory cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; # TBD: verbose? any extra args? source ./pretty_printing.sh pushd .. > /dev/null GROUP_NAME="ðŸ› ï¸ CMake Configure Libcudacxx Codegen" run_command "$GROUP_NAME" cmake --preset libcudacxx-codegen status=$? popd > /dev/null pushd .. > /dev/null GROUP_NAME="ðŸ—ï¸ Build Libcudacxx Codegen" run_command "$GROUP_NAME" cmake --build --preset libcudacxx-codegen status=$? popd > /dev/null pushd .. > /dev/null GROUP_NAME="🚀 Test Libcudacxx Codegen" run_command "$GROUP_NAME" ctest --preset libcudacxx-codegen status=$? popd > /dev/null cccl-2.5.0/ci/windows/000077500000000000000000000000001463375617100145245ustar00rootroot00000000000000cccl-2.5.0/ci/windows/build_common.psm1000066400000000000000000000156211463375617100200020ustar00rootroot00000000000000Param( [Parameter(Mandatory = $false)] [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17 ) $ErrorActionPreference = "Stop" # We need the full path to cl because otherwise cmake will replace CMAKE_CXX_COMPILER with the full path # and keep CMAKE_CUDA_HOST_COMPILER at "cl" which breaks our cmake script $script:HOST_COMPILER = (Get-Command "cl").source -replace '\\','/' $script:PARALLEL_LEVEL = (Get-WmiObject -class Win32_processor).NumberOfLogicalProcessors # Extract the CL version for export to build scripts: $script:CL_VERSION_STRING = & cl.exe /? 
if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") { $CL_VERSION = [version]$matches[1] Write-Host "Detected cl.exe version: $CL_VERSION" } if (-not $env:CCCL_BUILD_INFIX) { $env:CCCL_BUILD_INFIX = "" } # Presets will be configured in this directory: $BUILD_DIR = "../build/$env:CCCL_BUILD_INFIX" If(!(test-path -PathType container "../build")) { New-Item -ItemType Directory -Path "../build" } # The most recent build will always be symlinked to cccl/build/latest New-Item -ItemType Directory -Path "$BUILD_DIR" -Force # Convert to an absolute path: $BUILD_DIR = (Get-Item -Path "$BUILD_DIR").FullName # Prepare environment for CMake: $env:CMAKE_BUILD_PARALLEL_LEVEL = $PARALLEL_LEVEL $env:CTEST_PARALLEL_LEVEL = 1 $env:CUDAHOSTCXX = $HOST_COMPILER.FullName $env:CXX = $HOST_COMPILER.FullName Write-Host "========================================" Write-Host "Begin build" Write-Host "pwd=$pwd" Write-Host "BUILD_DIR=$BUILD_DIR" Write-Host "CXX_STANDARD=$CXX_STANDARD" Write-Host "CXX=$env:CXX" Write-Host "CUDACXX=$env:CUDACXX" Write-Host "CUDAHOSTCXX=$env:CUDAHOSTCXX" Write-Host "TBB_ROOT=$env:TBB_ROOT" Write-Host "NVCC_VERSION=$NVCC_VERSION" Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL" Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL" Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX" Write-Host "Current commit is:" Write-Host "$(git log -1)" Write-Host "========================================" cmake --version ctest --version function configure_preset { Param( [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$BUILD_NAME, [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$PRESET, [Parameter(Mandatory = $true)] [AllowEmptyString()] [string]$CMAKE_OPTIONS ) $step = "$BUILD_NAME (configure)" # CMake must be invoked in the same directory as the presets file: pushd ".." # Echo and execute command to stdout: $configure_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE" Write-Host $configure_command Invoke-Expression $configure_command $test_result = $LastExitCode If ($test_result -ne 0) { throw "$step Failed" } popd Write-Host "$step complete." } function build_preset { Param( [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$BUILD_NAME, [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$PRESET ) $step = "$BUILD_NAME (build)" # CMake must be invoked in the same directory as the presets file: pushd ".." sccache_stats('Start') cmake --build --preset $PRESET -v $test_result = $LastExitCode $preset_dir = "${BUILD_DIR}/${PRESET}" $sccache_json = "${preset_dir}/sccache_stats.json" sccache --show-adv-stats --stats-format=json > "${sccache_json}" sccache_stats('Stop') echo "$step complete" If ($test_result -ne 0) { throw "$step Failed" } popd } function test_preset { Param( [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$BUILD_NAME, [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$PRESET ) $step = "$BUILD_NAME (test)" # CTest must be invoked in the same directory as the presets file: pushd ".." 
sccache_stats('Start') ctest --preset $PRESET $test_result = $LastExitCode sccache_stats('Stop') echo "$step complete" If ($test_result -ne 0) { throw "$step Failed" } popd } function configure_and_build_preset { Param( [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$BUILD_NAME, [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [string]$PRESET, [Parameter(Mandatory = $true)] [AllowEmptyString()] [string]$CMAKE_OPTIONS ) configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS" build_preset "$BUILD_NAME" "$PRESET" } function sccache_stats { Param ( [Parameter(Mandatory = $true)] [ValidateNotNullOrEmpty()] [ValidateSet('Start','Stop')] [string]$MODE ) $sccache_stats = sccache -s If($MODE -eq 'Start') { [int]$script:sccache_compile_requests = ($sccache_stats[0] -replace '[^\d]+') [int]$script:sccache_cache_hits_cpp = ($sccache_stats[2] -replace '[^\d]+') [int]$script:sccache_cache_hits_cuda = ($sccache_stats[3] -replace '[^\d]+') [int]$script:sccache_cache_miss_cpp = ($sccache_stats[5] -replace '[^\d]+') [int]$script:sccache_cache_miss_cuda = ($sccache_stats[6] -replace '[^\d]+') } else { [int]$final_sccache_compile_requests = ($sccache_stats[0] -replace '[^\d]+') [int]$final_sccache_cache_hits_cpp = ($sccache_stats[2] -replace '[^\d]+') [int]$final_sccache_cache_hits_cuda = ($sccache_stats[3] -replace '[^\d]+') [int]$final_sccache_cache_miss_cpp = ($sccache_stats[5] -replace '[^\d]+') [int]$final_sccache_cache_miss_cuda = ($sccache_stats[6] -replace '[^\d]+') [int]$total_requests = $final_sccache_compile_requests - $script:sccache_compile_requests [int]$total_hits_cpp = $final_sccache_cache_hits_cpp - $script:sccache_cache_hits_cpp [int]$total_hits_cuda = $final_sccache_cache_hits_cuda - $script:sccache_cache_hits_cuda [int]$total_miss_cpp = $final_sccache_cache_miss_cpp - $script:sccache_cache_miss_cpp [int]$total_miss_cuda = $final_sccache_cache_miss_cuda - $script:sccache_cache_miss_cuda If ( $total_requests -gt 0 ) { [int]$hit_rate_cpp = $total_hits_cpp / $total_requests * 100; [int]$hit_rate_cuda = $total_hits_cuda / $total_requests * 100; echo "sccache hits cpp: $total_hits_cpp `t| misses: $total_miss_cpp `t| hit rate: $hit_rate_cpp%" echo "sccache hits cuda: $total_hits_cuda `t| misses: $total_miss_cuda `t| hit rate: $hit_rate_cuda%" } else { echo "sccache stats: N/A No new compilation requests" } } } Export-ModuleMember -Function configure_preset, build_preset, test_preset, configure_and_build_preset, sccache_stats Export-ModuleMember -Variable BUILD_DIR, CL_VERSION cccl-2.5.0/ci/windows/build_cub.ps1000066400000000000000000000012251463375617100171010ustar00rootroot00000000000000Param( [Parameter(Mandatory = $false)] [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17 ) $ErrorActionPreference = "Stop" $CURRENT_PATH = Split-Path $pwd -leaf If($CURRENT_PATH -ne "ci") { Write-Host "Moving to ci folder" pushd "$PSScriptRoot/.." 
} Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD $PRESET = "cub-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" if ($CL_VERSION -lt [version]"19.20") { $CMAKE_OPTIONS += "-DCUB_IGNORE_DEPRECATED_COMPILER=ON " } configure_and_build_preset "CUB" "$PRESET" "$CMAKE_OPTIONS" If($CURRENT_PATH -ne "ci") { popd } cccl-2.5.0/ci/windows/build_cudax.ps1000066400000000000000000000010661463375617100174370ustar00rootroot00000000000000 Param( [Parameter(Mandatory = $false)] [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(17, 20)] [int]$CXX_STANDARD = 17 ) $CURRENT_PATH = Split-Path $pwd -leaf If($CURRENT_PATH -ne "ci") { Write-Host "Moving to ci folder" pushd "$PSScriptRoot/.." } Remove-Module -Name build_common Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD $PRESET = "cudax-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" configure_and_build_preset "CUDA Experimental" "$PRESET" "$CMAKE_OPTIONS" If($CURRENT_PATH -ne "ci") { popd } cccl-2.5.0/ci/windows/build_libcudacxx.ps1000066400000000000000000000011111463375617100204500ustar00rootroot00000000000000Param( [Parameter(Mandatory = $false)] [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17 ) $ErrorActionPreference = "Stop" $CURRENT_PATH = Split-Path $pwd -leaf If($CURRENT_PATH -ne "ci") { Write-Host "Moving to ci folder" pushd "$PSScriptRoot/.." } Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD, $GPU_ARCHS $PRESET = "libcudacxx-cpp${CXX_STANDARD}" $CMAKE_OPTIONS = "" configure_and_build_preset "libcudacxx" "$PRESET" "$CMAKE_OPTIONS" If($CURRENT_PATH -ne "ci") { popd } cccl-2.5.0/ci/windows/build_thrust.ps1000066400000000000000000000012361463375617100176630ustar00rootroot00000000000000Param( [Parameter(Mandatory = $false)] [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17 ) $ErrorActionPreference = "Stop" $CURRENT_PATH = Split-Path $pwd -leaf If($CURRENT_PATH -ne "ci") { Write-Host "Moving to ci folder" pushd "$PSScriptRoot/.." } Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD $PRESET = "thrust-cpp$CXX_STANDARD" $CMAKE_OPTIONS = "" if ($CL_VERSION -lt [version]"19.20") { $CMAKE_OPTIONS += "-DTHRUST_IGNORE_DEPRECATED_COMPILER=ON " } configure_and_build_preset "Thrust" "$PRESET" "$CMAKE_OPTIONS" If($CURRENT_PATH -ne "ci") { popd } cccl-2.5.0/ci/windows/test_thrust.ps1000066400000000000000000000016511463375617100175440ustar00rootroot00000000000000Param( [Parameter(Mandatory = $true)] [Alias("std")] [ValidateNotNullOrEmpty()] [ValidateSet(11, 14, 17, 20)] [int]$CXX_STANDARD = 17, [Parameter(Mandatory = $false)] [Alias("cpu-only")] [switch]$CPU_ONLY = $false ) $ErrorActionPreference = "Stop" # if not cpu-only, emit an error. GPU tests are not yet supported. if (-not $CPU_ONLY) { Write-Error "Thrust tests require the -cpu-only flag" exit 1 } $CURRENT_PATH = Split-Path $pwd -leaf If($CURRENT_PATH -ne "ci") { Write-Host "Moving to ci folder" pushd "$PSScriptRoot/.." 
} # Execute the build script: $build_command = "$PSScriptRoot/build_thrust.ps1 -std $CXX_STANDARD" Write-Host "Executing: $build_command" Invoke-Expression $build_command Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD $PRESET = "thrust-cpu-cpp$CXX_STANDARD" test_preset "Thrust" "$PRESET" If($CURRENT_PATH -ne "ci") { popd } cccl-2.5.0/cmake/000077500000000000000000000000001463375617100135175ustar00rootroot00000000000000cccl-2.5.0/cmake/CCCLAddSubdir.cmake000066400000000000000000000003271463375617100170110ustar00rootroot00000000000000find_package(CCCL REQUIRED CONFIG NO_DEFAULT_PATH # Only check the explicit path in HINTS: HINTS "${CCCL_SOURCE_DIR}" COMPONENTS ${CCCL_REQUIRED_COMPONENTS} OPTIONAL_COMPONENTS ${CCCL_OPTIONAL_COMPONENTS} ) cccl-2.5.0/cmake/CCCLClangdCompileInfo.cmake000066400000000000000000000026341463375617100204700ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Tell cmake to generate a json file of compile commands for clangd: set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Symlink the compile command output to the source dir, where clangd will find it. set(compile_commands_file "${CMAKE_BINARY_DIR}/compile_commands.json") set(compile_commands_link "${CMAKE_SOURCE_DIR}/compile_commands.json") message(STATUS "Creating symlink from ${compile_commands_link} to ${compile_commands_file}...") cccl_execute_non_fatal_process(COMMAND "${CMAKE_COMMAND}" -E rm -f "${compile_commands_link}") cccl_execute_non_fatal_process(COMMAND "${CMAKE_COMMAND}" -E touch "${compile_commands_file}") cccl_execute_non_fatal_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink "${compile_commands_file}" "${compile_commands_link}") cccl-2.5.0/cmake/CCCLHideThirdPartyOptions.cmake000066400000000000000000000017651463375617100214170ustar00rootroot00000000000000mark_as_advanced( BUILD_TESTING CATCH_BUILD_EXAMPLES CATCH_BUILD_EXTRA_TESTS CATCH_BUILD_STATIC_LIBRARY CATCH_BUILD_TESTING CATCH_ENABLE_COVERAGE CATCH_ENABLE_WERROR CATCH_INSTALL_DOCS CATCH_INSTALL_HELPERS CATCH_USE_VALGRIND CLANG_FORMAT CLANG_TIDY CPM_DONT_CREATE_PACKAGE_LOCK CPM_DONT_UPDATE_MODULE_PATH CPM_DOWNLOAD_ALL CPM_INCLUDE_ALL_IN_PACKAGE_LOCK CPM_LOCAL_PACKAGES_ONLY CPM_SOURCE_CACHE CPM_USE_LOCAL_PACKAGES CPM_USE_NAMED_CACHE_DIRECTORIES CPPCHECK CUB_DIR FETCHCONTENT_BASE_DIR FETCHCONTENT_FULLY_DISCONNECTED FETCHCONTENT_QUIET FETCHCONTENT_SOURCE_DIR_CATCH2 FETCHCONTENT_SOURCE_DIR_METAL FETCHCONTENT_UPDATES_DISCONNECTED FETCHCONTENT_UPDATES_DISCONNECTED_CATCH2 FETCHCONTENT_UPDATES_DISCONNECTED_METAL LIBCXX_CXX_ABI LIT_EXTRA_ARGS LLVM_DEFAULT_EXTERNAL_LIT LLVM_DEFAULT_TARGET_TRIPLE LLVM_EXTERNAL_LIT LLVM_HOST_TRIPLE LLVM_PATH METAL_BUILD_DOC METAL_BUILD_EXAMPLES METAL_BUILD_TESTS Thrust_DIR libcudacxx_DIR ) cccl-2.5.0/cmake/CCCLInstallRules.cmake000066400000000000000000000004251463375617100175700ustar00rootroot00000000000000# Bring in CMAKE_INSTALL_LIBDIR 
include(GNUInstallDirs) # CCCL has no installable binaries, no need to build before installing: set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) install(DIRECTORY "${CCCL_SOURCE_DIR}/lib/cmake/cccl" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/" ) cccl-2.5.0/cmake/CCCLUtilities.cmake000066400000000000000000000051761463375617100171320ustar00rootroot00000000000000# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Passes all args directly to execute_process while setting up the following # results variables and propogating them to the caller's scope: # # - cccl_process_exit_code # - cccl_process_stdout # - cccl_process_stderr # # If the command # is not successful (e.g. the last command does not return zero), a non-fatal # warning is printed. function(cccl_execute_non_fatal_process) execute_process(${ARGN} RESULT_VARIABLE cccl_process_exit_code OUTPUT_VARIABLE cccl_process_stdout ERROR_VARIABLE cccl_process_stderr ) if (NOT cccl_process_exit_code EQUAL 0) message(WARNING "execute_process failed with non-zero exit code: ${cccl_process_exit_code}\n" "${ARGN}\n" "stdout:\n${cccl_process_stdout}\n" "stderr:\n${cccl_process_stderr}\n" ) endif() set(cccl_process_exit_code "${cccl_process_exit_code}" PARENT_SCOPE) set(cccl_process_stdout "${cccl_process_stdout}" PARENT_SCOPE) set(cccl_process_stderr "${cccl_process_stderr}" PARENT_SCOPE) endfunction() # Add a build-and-test CTest. # - full_test_name_var will be set to the full name of the test. # - name_prefix is the prefix of the test's name (e.g. `cccl.test.cmake`) # - subdir is the relative path to the test project directory. # - test_id is used to generate a unique name for this test, allowing the # subdir to be reused. # - Any additional args will be passed to the project configure step. function(cccl_add_compile_test full_test_name_var name_prefix subdir test_id) set(test_name ${name_prefix}.${subdir}.${test_id}) set(src_dir "${CMAKE_CURRENT_SOURCE_DIR}/${subdir}") set(build_dir "${CMAKE_CURRENT_BINARY_DIR}/${subdir}/${test_id}") add_test(NAME ${test_name} COMMAND "${CMAKE_CTEST_COMMAND}" --build-and-test "${src_dir}" "${build_dir}" --build-generator "${CMAKE_GENERATOR}" --build-options ${ARGN} --test-command "${CMAKE_CTEST_COMMAND}" --output-on-failure ) set(${full_test_name_var} ${test_name} PARENT_SCOPE) endfunction() cccl-2.5.0/cmake/PrintCTestRunTimes.cmake000066400000000000000000000067211463375617100202550ustar00rootroot00000000000000## This CMake script parses the output of ctest and prints a formatted list ## of individual test runtimes, sorted longest first. 
## ## ctest > ctest_log ## cmake -DLOGFILE=ctest_log \ ## -DMINSEC=10 \ ## -P PrintCTestRunTimes.cmake ## ################################################################################ cmake_minimum_required(VERSION 3.15) # Prepend the string with "0" until the string length equals the specified width function(pad_string_with_zeros string_var width) set(local_string "${${string_var}}") string(LENGTH "${local_string}" size) while(size LESS width) string(PREPEND local_string "0") string(LENGTH "${local_string}" size) endwhile() set(${string_var} "${local_string}" PARENT_SCOPE) endfunction() ################################################################################ if (NOT LOGFILE) message(FATAL_ERROR "Missing -DLOGFILE= argument.") endif() if (NOT DEFINED MINSEC) set(MINSEC 10) endif() set(num_below_thresh 0) # Check if logfile exists if (NOT EXISTS "${LOGFILE}") message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').") endif() string(JOIN "" regex "[0-9]+/[0-9]+[ ]+Test[ ]+#" "([0-9]+)" # Test ID ":[ ]+" "([^ ]+)" # Test Name "[ ]*\\.+[ ]*\\**[ ]*" "([^ ]+)" # Result "[ ]+" "([0-9]+)" # Seconds "\\.[0-9]+[ ]+sec" ) message(DEBUG "LOGFILE: ${LOGFILE}") message(DEBUG "MINSEC: ${MINSEC}") message(DEBUG "regex: ${regex}") # Read the logfile and generate a map / keylist set(keys) file(STRINGS "${LOGFILE}" lines) foreach(line ${lines}) # Parse each build time string(REGEX MATCH "${regex}" _DUMMY "${line}") if (CMAKE_MATCH_COUNT EQUAL 4) set(test_id "${CMAKE_MATCH_1}") set(test_name "${CMAKE_MATCH_2}") set(test_result "${CMAKE_MATCH_3}") set(tmp "${CMAKE_MATCH_4}") # floor(runtime_seconds) if (tmp LESS MINSEC) math(EXPR num_below_thresh "${num_below_thresh} + 1") continue() endif() # Compute human readable time math(EXPR days "${tmp} / (60 * 60 * 24)") math(EXPR tmp "${tmp} - (${days} * 60 * 60 * 24)") math(EXPR hours "${tmp} / (60 * 60)") math(EXPR tmp "${tmp} - (${hours} * 60 * 60)") math(EXPR minutes "${tmp} / (60)") math(EXPR tmp "${tmp} - (${minutes} * 60)") math(EXPR seconds "${tmp}") # Format time components pad_string_with_zeros(days 3) pad_string_with_zeros(hours 2) pad_string_with_zeros(minutes 2) pad_string_with_zeros(seconds 2) # Construct table entry # Later values in the file for the same command overwrite earlier entries string(MAKE_C_IDENTIFIER "${test_id}" key) string(JOIN " | " ENTRY_${key} "${days}d ${hours}h ${minutes}m ${seconds}s" "${test_result}" "${test_id}: ${test_name}" ) # Record the key: list(APPEND keys "${key}") endif() endforeach() list(REMOVE_DUPLICATES keys) # Build the entry list: set(entries) foreach(key ${keys}) list(APPEND entries "${ENTRY_${key}}") endforeach() if (NOT entries) message(STATUS "LOGFILE contained no test times ('${LOGFILE}').") endif() # Sort in descending order: list(SORT entries ORDER DESCENDING) # Dump table: foreach(entry ${entries}) message(STATUS ${entry}) endforeach() if (num_below_thresh GREATER 0) message(STATUS "${num_below_thresh} additional tests took < ${MINSEC}s each.") endif() cccl-2.5.0/cmake/PrintNinjaBuildTimes.cmake000066400000000000000000000062171463375617100205650ustar00rootroot00000000000000## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of ## build/link times, sorted longest first. ## ## cmake -DLOGFILE=<.ninja_log file> \ ## -P PrintNinjaBuildTimes.cmake ## ## If LOGFILE is omitted, the current directory's .ninja_log file is used. 
################################################################################ cmake_minimum_required(VERSION 3.15) # Prepend the string with "0" until the string length equals the specified width function(pad_string_with_zeros string_var width) set(local_string "${${string_var}}") string(LENGTH "${local_string}" size) while(size LESS width) string(PREPEND local_string "0") string(LENGTH "${local_string}" size) endwhile() set(${string_var} "${local_string}" PARENT_SCOPE) endfunction() ################################################################################ if (NOT LOGFILE) set(LOGFILE ".ninja_log") endif() # Check if logfile exists if (NOT EXISTS "${LOGFILE}") message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').") endif() # Read the logfile and generate a map / keylist set(keys) file(STRINGS "${LOGFILE}" lines) foreach(line ${lines}) # Parse each build time string(REGEX MATCH "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}") if (CMAKE_MATCH_COUNT EQUAL 3) set(start_ms ${CMAKE_MATCH_1}) set(end_ms ${CMAKE_MATCH_2}) set(command "${CMAKE_MATCH_3}") math(EXPR runtime_ms "${end_ms} - ${start_ms}") # Compute human readable time math(EXPR days "${runtime_ms} / (1000 * 60 * 60 * 24)") math(EXPR runtime_ms "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)") math(EXPR hours "${runtime_ms} / (1000 * 60 * 60)") math(EXPR runtime_ms "${runtime_ms} - (${hours} * 1000 * 60 * 60)") math(EXPR minutes "${runtime_ms} / (1000 * 60)") math(EXPR runtime_ms "${runtime_ms} - (${minutes} * 1000 * 60)") math(EXPR seconds "${runtime_ms} / 1000") math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)") # Format time components pad_string_with_zeros(days 3) pad_string_with_zeros(hours 2) pad_string_with_zeros(minutes 2) pad_string_with_zeros(seconds 2) pad_string_with_zeros(milliseconds 3) # Construct table entry # Later values in the file for the same command overwrite earlier entries string(MAKE_C_IDENTIFIER "${command}" key) set(ENTRY_${key} "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}" ) # Record the key: list(APPEND keys "${key}") endif() endforeach() list(REMOVE_DUPLICATES keys) # Build the entry list: set(entries) foreach(key ${keys}) list(APPEND entries "${ENTRY_${key}}") endforeach() if (NOT entries) message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').") endif() # Sort in descending order: list(SORT entries) list(REVERSE entries) # Dump table: message(STATUS "-----------------------+----------------------------") message(STATUS "Time | Command ") message(STATUS "-----------------------+----------------------------") foreach(entry ${entries}) message(STATUS ${entry}) endforeach() cccl-2.5.0/cub/000077500000000000000000000000001463375617100132105ustar00rootroot00000000000000cccl-2.5.0/cub/.clang-tidy000066400000000000000000000016521463375617100152500ustar00rootroot00000000000000--- Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, -modernize-use-trailing-return-type' # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) # -modernize-use-trailing-return-type # just a preference WarningsAsErrors: '' HeaderFilterRegex: '' AnalyzeTemporaryDtors: false FormatStyle: none CheckOptions: - key: modernize-loop-convert.MaxCopySize value: '16' - key: modernize-loop-convert.MinConfidence value: reasonable - key: modernize-pass-by-value.IncludeStyle value: llvm - key: 
modernize-replace-auto-ptr.IncludeStyle value: llvm - key: modernize-use-nullptr.NullMacros value: 'NULL' ... cccl-2.5.0/cub/.gitignore000066400000000000000000000000471463375617100152010ustar00rootroot00000000000000.p4config *~ \#* /build .cache .vscode cccl-2.5.0/cub/CHANGELOG.md000066400000000000000000002001271463375617100150230ustar00rootroot00000000000000# CUB 2.1.0 ## Breaking Changes - NVIDIA/cub#553: Deprecate the `CUB_USE_COOPERATIVE_GROUPS` macro, as all supported CTK distributions provide CG. This macro will be removed in a future version of CUB. ## New Features - NVIDIA/cub#359: Add new `DeviceBatchMemcpy` algorithm. - NVIDIA/cub#565: Add `DeviceMergeSort::StableSortKeysCopy` API. Thanks to David Wendt (@davidwendt) for this contribution. - NVIDIA/cub#585: Add SM90 tuning policy for `DeviceRadixSort`. Thanks to Andy Adinets (@canonizer) for this contribution. - NVIDIA/cub#586: Introduce a new mechanism to opt-out of compiling CDP support in CUB algorithms by defining `CUB_DISABLE_CDP`. - NVIDIA/cub#589: Support 64-bit indexing in `DeviceReduce`. - NVIDIA/cub#607: Support 128-bit integers in radix sort. ## Bug Fixes - NVIDIA/cub#547: Resolve several long-running issues resulting from using multiple versions of CUB within the same process. Adds an inline namespace that encodes CUB version and targeted PTX architectures. - NVIDIA/cub#562: Fix bug in `BlockShuffle` resulting from an invalid thread offset. Thanks to @sjfeng1999 for this contribution. - NVIDIA/cub#564: Fix bug in `BlockRadixRank` when used with blocks that are not a multiple of 32 threads. - NVIDIA/cub#579: Ensure that all threads in the logical warp participate in the index-shuffle for `BlockRadixRank`. Thanks to Andy Adinets (@canonizer) for this contribution. - NVIDIA/cub#582: Fix reordering in CUB member initializer lists. - NVIDIA/cub#589: Fix `DeviceSegmentedSort` when used with `bool` keys. - NVIDIA/cub#590: Fix CUB's CMake install rules. Thanks to Robert Maynard (@robertmaynard) for this contribution. - NVIDIA/cub#592: Fix overflow in `DeviceReduce`. - NVIDIA/cub#598: Fix `DeviceRunLengthEncode` when the first item is a `NaN`. - NVIDIA/cub#611: Fix `WarpScanExclusive` for vector types. ## Other Enhancements - NVIDIA/cub#537: Add detailed and expanded version of a [CUB developer overview](https://github.com/NVIDIA/cub/blob/main/docs/developer_overview.md). - NVIDIA/cub#549: Fix `BlockReduceRaking` docs for non-commutative operations. Thanks to Tobias Ribizel (@upsj) for this contribution. - NVIDIA/cub#606: Optimize CUB's decoupled-lookback implementation. # CUB 2.0.1 ## Other Enhancements - Skip device-side synchronization on SM90+. These syncs are a debugging-only feature and not required for correctness, and a warning will be emitted if this happens. # CUB 2.0.0 ## Summary The CUB 2.0.0 major release adds a dependency on libcu++ and contains several breaking changes. These include new diagnostics when inspecting device-only lambdas from the host, an updated method of determining accumulator types for algorithms like Reduce and Scan, and a compile-time replacement for the runtime `debug_synchronous` debugging flags. This release also includes several new features. `DeviceHistogram` now supports `__half` and better handles various edge cases. `WarpReduce` now performs correctly when restricted to a single-thread “warpâ€, and will use the `__reduce_add_sync` accelerated intrinsic (introduced with Ampere) when appropriate. `DeviceRadixSort` learned to handle the case where `begin_bit == end_bit`. 
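The device-lambda diagnostics and accumulator-type changes described above are easiest to see in code. The following sketch is illustrative only (it is not taken from these release notes, the function and variable names are made up, and it assumes nvcc's `--extended-lambda` flag): it shows the recommended work-around of proclaiming the return type of a `__device__`-only lambda so that host code can deduce the accumulator type for `DeviceReduce`:

```cpp
#include <cub/cub.cuh>
#include <cuda/functional> // cuda::proclaim_return_type (libcu++ 1.9.0+)
#include <cstddef>

// Sum num_items floats from d_in into d_out[0].
void sum_floats(const float* d_in, float* d_out, int num_items)
{
  // The lambda is __device__-only, so its return type is proclaimed explicitly;
  // a __host__ __device__ lambda or a named functor would also work.
  auto plus = cuda::proclaim_return_type<float>(
      [] __device__ (float a, float b) { return a + b; });

  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;

  // First call only queries the required temporary storage size.
  cub::DeviceReduce::Reduce(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, plus, 0.0f);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Second call performs the reduction; the accumulator type is deduced
  // from the operator's (proclaimed) result type.
  cub::DeviceReduce::Reduce(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, plus, 0.0f);
  cudaFree(d_temp_storage);
}
```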
Several algorithms also have updated documentation, with a particular focus on clarifying which operations can and cannot be performed in-place. ## Breaking Changes - NVIDIA/cub#448 Add libcu++ dependency (v1.8.0+). - NVIDIA/cub#448: The following macros are no longer defined by default. They can be re-enabled by defining `CUB_PROVIDE_LEGACY_ARCH_MACROS`. These will be completely removed in a future release. - `CUB_IS_HOST_CODE`: Replace with `NV_IF_TARGET`. - `CUB_IS_DEVICE_CODE`: Replace with `NV_IF_TARGET`. - `CUB_INCLUDE_HOST_CODE`: Replace with `NV_IF_TARGET`. - `CUB_INCLUDE_DEVICE_CODE`: Replace with `NV_IF_TARGET`. - NVIDIA/cub#486: CUB's CUDA Runtime support macros have been updated to support `NV_IF_TARGET`. They are now defined consistently across all host/device compilation passes. This should not affect most usages of these macros, but may require changes for some edge cases. - `CUB_RUNTIME_FUNCTION`: Execution space annotations for functions that invoke CUDA Runtime APIs. - Old behavior: - RDC enabled: Defined to `__host__ __device__` - RDC not enabled: - NVCC host pass: Defined to `__host__ __device__` - NVCC device pass: Defined to `__host__` - New behavior: - RDC enabled: Defined to `__host__ __device__` - RDC not enabled: Defined to `__host__` - `CUB_RUNTIME_ENABLED`: No change in behavior, but no longer used in CUB. Provided for legacy support only. Legacy behavior: - RDC enabled: Macro is defined. - RDC not enabled: - NVCC host pass: Macro is defined. - NVCC device pass: Macro is not defined. - `CUB_RDC_ENABLED`: New macro, may be combined with `NV_IF_TARGET` to replace most usages of `CUB_RUNTIME_ENABLED`. Behavior: - RDC enabled: Macro is defined. - RDC not enabled: Macro is not defined. - NVIDIA/cub#509: A compile-time error is now emitted when a `__device__`-only lambda's return type is queried from host code (requires libcu++ ≥ 1.9.0). - Due to limitations in the CUDA programming model, the result of this query is unreliable, and will silently return an incorrect result. This leads to difficult to debug errors. - When using libcu++ 1.9.0, an error will be emitted with information about work-arounds: - Use a named function object with a `__device__`-only implementation of `operator()`. - Use a `__host__ __device__` lambda. - Use `cuda::proclaim_return_type` (Added in libcu++ 1.9.0) - NVIDIA/cub#509: Use the result type of the binary reduction operator for accumulating intermediate results in the `DeviceReduce` algorithm, following guidance from http://wg21.link/P2322R6. - This change requires host-side introspection of the binary operator's signature, and device-only extended lambda functions can no longer be used. - In addition to the behavioral changes, the interfaces for the `Dispatch*Reduce` layer have changed: - `DispatchReduce`: - Now accepts accumulator type as last parameter. - Now accepts initializer type instead of output iterator value type. - Constructor now accepts `init` as initial type instead of output iterator value type. - `DispatchSegmentedReduce`: - Accepts accumulator type as last parameter. - Accepts initializer type instead of output iterator value type. - Thread operators now accept parameters using different types: `Equality` , `Inequality`, `InequalityWrapper`, `Sum`, `Difference`, `Division`, `Max` , `ArgMax`, `Min`, `ArgMin`. - `ThreadReduce` now accepts accumulator type and uses a different type for `prefix`. 
- NVIDIA/cub#511: Use the result type of the binary operator for accumulating intermediate results in the `DeviceScan`, `DeviceScanByKey`, and `DeviceReduceByKey` algorithms, following guidance from http://wg21.link/P2322R6. - This change requires host-side introspection of the binary operator's signature, and device-only extended lambda functions can no longer be used. - In addition to the behavioral changes, the interfaces for the `Dispatch` layer have changed: - `DispatchScan`now accepts accumulator type as a template parameter. - `DispatchScanByKey`now accepts accumulator type as a template parameter. - `DispatchReduceByKey`now accepts accumulator type as the last template parameter. - NVIDIA/cub#527: Deprecate the `debug_synchronous` flags on device algorithms. - This flag no longer has any effect. Define `CUB_DEBUG_SYNC` during compilation to enable these checks. - Moving this option from run-time to compile-time avoids the compilation overhead of unused debugging paths in production code. ## New Features - NVIDIA/cub#514: Support `__half` in `DeviceHistogram`. - NVIDIA/cub#516: Add support for single-threaded invocations of `WarpReduce`. - NVIDIA/cub#516: Use `__reduce_add_sync` hardware acceleration for `WarpReduce` on supported architectures. ## Bug Fixes - NVIDIA/cub#481: Fix the device-wide radix sort implementations to simply copy the input to the output when `begin_bit == end_bit`. - NVIDIA/cub#487: Fix `DeviceHistogram::Even` for a variety of edge cases: - Bin ids are now correctly computed when mixing different types for `SampleT` and `LevelT`. - Bin ids are now correctly computed when `LevelT` is an integral type and the number of levels does not evenly divide the level range. - NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in the `AdjacentDifferenceCopy` device algorithms. - NVIDIA/cub#508: Remove excessive calls to the binary operator given to the `AdjacentDifferenceCopy` device algorithms. - NVIDIA/cub#533: Fix debugging utilities when RDC is disabled. ## Other Enhancements - NVIDIA/cub#448: Removed special case code for unsupported CUDA architectures. - NVIDIA/cub#448: Replace several usages of `__CUDA_ARCH__` with `` to handle host/device code divergence. - NVIDIA/cub#448: Mark unused PTX arch parameters as legacy. - NVIDIA/cub#476: Enabled additional debug logging for the onesweep radix sort implementation. Thanks to @canonizer for this contribution. - NVIDIA/cub#480: Add `CUB_DISABLE_BF16_SUPPORT` to avoid including the `cuda_bf16.h` header or using the `__nv_bfloat16` type. - NVIDIA/cub#486: Add debug log messages for post-kernel debug synchronizations. - NVIDIA/cub#490: Clarify documentation for in-place usage of `DeviceScan` algorithms. - NVIDIA/cub#494: Clarify documentation for in-place usage of `DeviceHistogram` algorithms. - NVIDIA/cub#495: Clarify documentation for in-place usage of `DevicePartition` algorithms. - NVIDIA/cub#499: Clarify documentation for in-place usage of `Device*Sort` algorithms. - NVIDIA/cub#500: Clarify documentation for in-place usage of `DeviceReduce` algorithms. - NVIDIA/cub#501: Clarify documentation for in-place usage of `DeviceRunLengthEncode` algorithms. - NVIDIA/cub#503: Clarify documentation for in-place usage of `DeviceSelect` algorithms. - NVIDIA/cub#518: Fix typo in `WarpMergeSort` documentation. - NVIDIA/cub#519: Clarify segmented sort documentation regarding the handling of elements that are not included in any segment. # CUB 1.17.2 ## Summary CUB 1.17.2 is a minor bugfix release. 
- NVIDIA/cub#547: Introduce an annotated inline namespace to prevent issues with collisions and mismatched kernel configurations across libraries. The new namespace encodes the CUB version and target SM architectures. # CUB 1.17.1 ## Summary CUB 1.17.1 is a minor bugfix release. - NVIDIA/cub#508: Ensure that `temp_storage_bytes` is properly set in the `AdjacentDifferenceCopy` device algorithms. - NVIDIA/cub#508: Remove excessive calls to the binary operator given to the `AdjacentDifferenceCopy` device algorithms. - Fix device-side debug synchronous behavior in `DeviceSegmentedSort`. # CUB 1.17.0 ## Summary CUB 1.17.0 is the final minor release of the 1.X series. It provides a variety of bug fixes and miscellaneous enhancements, detailed below. ## Known Issues ### "Run-to-run" Determinism Broken Several CUB device algorithms are documented to provide deterministic results (per device) for non-associative reduction operators (e.g. floating-point addition). Unfortunately, the implementations of these algorithms contain performance optimizations that violate this guarantee. The `DeviceReduce::ReduceByKey` and `DeviceScan` algorithms are known to be affected. We're currently evaluating the scope and impact of correcting this in a future CUB release. See NVIDIA/cub#471 for details. ## Bug Fixes - NVIDIA/cub#444: Fixed `DeviceSelect` to work with discard iterators and mixed input/output types. - NVIDIA/cub#452: Fixed install issue when `CMAKE_INSTALL_LIBDIR` contained nested directories. Thanks to @robertmaynard for this contribution. - NVIDIA/cub#462: Fixed bug that produced incorrect results from `DeviceSegmentedSort` on sm_61 and sm_70. - NVIDIA/cub#464: Fixed `DeviceSelect::Flagged` so that flags are normalized to 0 or 1. - NVIDIA/cub#468: Fixed overflow issues in `DeviceRadixSort` given `num_items` close to 2^32. Thanks to @canonizer for this contribution. - NVIDIA/cub#498: Fixed compiler regression in `BlockAdjacentDifference`. Thanks to @MKKnorr for this contribution. ## Other Enhancements - NVIDIA/cub#445: Remove device-sync in `DeviceSegmentedSort` when launched via CDP. - NVIDIA/cub#449: Fixed invalid link in documentation. Thanks to @kshitij12345 for this contribution. - NVIDIA/cub#450: `BlockDiscontinuity`: Replaced recursive-template loop unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this contribution. - NVIDIA/cub#451: Replaced the deprecated `TexRefInputIterator` implementation with an alias to `TexObjInputIterator`. This fully removes all usages of the deprecated CUDA texture reference APIs from CUB. - NVIDIA/cub#456: `BlockAdjacentDifference`: Replaced recursive-template loop unrolling with `#pragma unroll`. Thanks to @kshitij12345 for this contribution. - NVIDIA/cub#466: `cub::DeviceAdjacentDifference` API has been updated to use the new `OffsetT` deduction approach described in NVIDIA/cub#212. - NVIDIA/cub#470: Fix several doxygen-related warnings. Thanks to @karthikeyann for this contribution. # CUB 1.16.0 ## Summary CUB 1.16.0 is a major release providing several improvements to the device scope algorithms. `DeviceRadixSort` now supports large (64-bit indexed) input data. A new `UniqueByKey` algorithm has been added to `DeviceSelect`. `DeviceAdjacentDifference` provides new `SubtractLeft` and `SubtractRight` functionality. This release also deprecates several obsolete APIs, including type traits and `BlockAdjacentDifference` algorithms. Many bugfixes and documentation updates are also included. 
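The new `DeviceAdjacentDifference` interface mentioned above follows the usual two-call CUB pattern (size query, then execution). Below is a minimal sketch using `SubtractLeftCopy`; it is not part of the original notes and the names are illustrative:

```cpp
#include <cub/cub.cuh>
#include <cstddef>

// d_out[0] = d_in[0]; d_out[i] = d_in[i] - d_in[i - 1] for i > 0.
void left_differences(const int* d_in, int* d_out, int num_items)
{
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;

  // First call sizes the temporary storage; second call runs the algorithm
  // (the default difference operator is cub::Difference, i.e. subtraction).
  cub::DeviceAdjacentDifference::SubtractLeftCopy(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceAdjacentDifference::SubtractLeftCopy(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaFree(d_temp_storage);
}
```

`SubtractLeft` performs the same computation in-place, while `SubtractRight` and `SubtractRightCopy` difference against the right-hand neighbor instead.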
### 64-bit Offsets in `DeviceRadixSort` Public APIs Users frequently want to process large datasets using CUB's device-scope algorithms, but the current public APIs limit input data sizes to those that can be indexed by a 32-bit integer. Beginning with this release, CUB is updating these APIs to support 64-bit offsets, as discussed in NVIDIA/cub#212. The device-scope algorithms will be updated with 64-bit offset support incrementally, starting with the `cub::DeviceRadixSort` family of algorithms. Thanks to @canonizer for contributing this functionality. ### New `DeviceSelect::UniqueByKey` Algorithm `cub::DeviceSelect` now provides a `UniqueByKey` algorithm, which has been ported from Thrust. Thanks to @zasdfgbnm for this contribution. ### New `DeviceAdjacentDifference` Algorithms The new `cub::DeviceAdjacentDifference` interface, also ported from Thrust, provides `SubtractLeft` and `SubtractRight` algorithms as CUB kernels. ## Deprecation Notices ### Synchronous CUDA Dynamic Parallelism Support **A future version of CUB will change the `debug_synchronous` behavior of device-scope algorithms when invoked via CUDA Dynamic Parallelism (CDP).** This will only affect calls to CUB device-scope algorithms launched from device-side code with `debug_synchronous = true`. Such invocations will continue to print extra debugging information, but they will no longer synchronize after kernel launches. ### Deprecated Traits CUB provided a variety of metaprogramming type traits in order to support C++03. Since C++14 is now required, these traits have been deprecated in favor of their STL equivalents, as shown below: | Deprecated CUB Trait | Replacement STL Trait | |-----------------------|-----------------------| | cub::If | std::conditional | | cub::Equals | std::is_same | | cub::IsPointer | std::is_pointer | | cub::IsVolatile | std::is_volatile | | cub::RemoveQualifiers | std::remove_cv | | cub::EnableIf | std::enable_if | CUB now uses the STL traits internally, resulting in a ~6% improvement in compile time. ### Misnamed `cub::BlockAdjacentDifference` APIs The algorithms in `cub::BlockAdjacentDifference` have been deprecated, as their names did not clearly describe their intent. The `FlagHeads` method is now `SubtractLeft`, and `FlagTails` has been replaced by `SubtractRight`. ## Breaking Changes - NVIDIA/cub#331: Deprecate the misnamed `BlockAdjacentDifference::FlagHeads` and `FlagTails` methods. Use the new `SubtractLeft` and `SubtractRight` methods instead. - NVIDIA/cub#364: Deprecate some obsolete type traits. These should be replaced by the equivalent traits in `` as described above. ## New Features - NVIDIA/cub#331: Port the `thrust::adjacent_difference` kernel and expose it as `cub::DeviceAdjacentDifference`. - NVIDIA/cub#405: Port the `thrust::unique_by_key` kernel and expose it as `cub::DeviceSelect::UniqueByKey`. Thanks to @zasdfgbnm for this contribution. ## Enhancements - NVIDIA/cub#340: Allow 64-bit offsets in `DeviceRadixSort` public APIs. Thanks to @canonizer for this contribution. - NVIDIA/cub#400: Implement a significant reduction in `DeviceMergeSort` compilation time. - NVIDIA/cub#415: Support user-defined `CMAKE_INSTALL_INCLUDEDIR` values in Thrust's CMake install rules. Thanks for @robertmaynard for this contribution. ## Bug Fixes - NVIDIA/cub#381: Fix shared memory alignment in `dyn_smem` example. - NVIDIA/cub#393: Fix some collisions with the `min`/`max` macros defined in `windows.h`. - NVIDIA/cub#404: Fix bad cast in `util_device`. 
- NVIDIA/cub#410: Fix CDP issues in `DeviceSegmentedSort`. - NVIDIA/cub#411: Ensure that the `nv_exec_check_disable` pragma is only used on nvcc. - NVIDIA/cub#418: Fix `-Wsizeof-array-div` warning on gcc 11. Thanks to @robertmaynard for this contribution. - NVIDIA/cub#420: Fix new uninitialized variable warning in `DiscardIterator` on gcc 10. - NVIDIA/cub#423: Fix some collisions with the `small` macro defined in `windows.h`. - NVIDIA/cub#426: Fix some issues with version handling in CUB's CMake packages. - NVIDIA/cub#430: Remove documentation for `DeviceSpmv` parameters that are absent from public APIs. - NVIDIA/cub#432: Remove incorrect documentation for `DeviceScan` algorithms that guaranteed run-to-run deterministic results for floating-point addition. # CUB 1.15.0 (NVIDIA HPC SDK 22.1, CUDA Toolkit 11.6) ## Summary CUB 1.15.0 includes a new `cub::DeviceSegmentedSort` algorithm, which demonstrates up to 5000x speedup compared to `cub::DeviceSegmentedRadixSort` when sorting a large number of small segments. A new `cub::FutureValue` helper allows the `cub::DeviceScan` algorithms to lazily load the `initial_value` from a pointer. `cub::DeviceScan` also added `ScanByKey` functionality. The new `DeviceSegmentedSort` algorithm partitions segments into size groups. Each group is processed with specialized kernels using a variety of sorting algorithms. This approach varies the number of threads allocated for sorting each segment and utilizes the GPU more efficiently. `cub::FutureValue` provides the ability to use the result of a previous kernel as a scalar input to a CUB device-scope algorithm without unnecessary synchronization: ```cpp int *d_intermediate_result = ...; intermediate_kernel<<>>(d_intermediate_result, // output arg1, // input arg2); // input // Wrap the intermediate pointer in a FutureValue -- no need to explicitly // sync when both kernels are stream-ordered. The pointer is read after // the ExclusiveScan kernel starts executing. cub::FutureValue init_value(d_intermediate_result); cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, cub::Sum(), init_value, num_items); ``` Previously, an explicit synchronization would have been necessary to obtain the intermediate result, which was passed by value into ExclusiveScan. This new feature enables better performance in workflows that use cub::DeviceScan. ## Deprecation Notices **A future version of CUB will change the `debug_synchronous` behavior of device-scope algorithms when invoked via CUDA Dynamic Parallelism (CDP).** This will only affect calls to CUB device-scope algorithms launched from device-side code with `debug_synchronous = true`. These algorithms will continue to print extra debugging information, but they will no longer synchronize after kernel launches. ## Breaking Changes - NVIDIA/cub#305: The template parameters of `cub::DispatchScan` have changed to support the new `cub::FutureValue` helper. More details under "New Features". - NVIDIA/cub#377: Remove broken `operator->()` from `cub::TransformInputIterator`, since this cannot be implemented without returning a temporary object's address. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. ## New Features - NVIDIA/cub#305: Add overloads to `cub::DeviceScan` algorithms that allow the output of a previous kernel to be used as `initial_value` without explicit synchronization. See the new `cub::FutureValue` helper for details. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. - NVIDIA/cub#354: Add `cub::BlockRunLengthDecode` algorithm. 
Thanks to Elias Stehle (@elstehle) for this contribution. - NVIDIA/cub#357: Add `cub::DeviceSegmentedSort`, an optimized version of `cub::DeviceSegmentedSort` with improved load balancing and small array performance. - NVIDIA/cub#376: Add "by key" overloads to `cub::DeviceScan`. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. ## Bug Fixes - NVIDIA/cub#349: Doxygen and unused variable fixes. - NVIDIA/cub#363: Maintenance updates for the new `cub::DeviceMergeSort` algorithms. - NVIDIA/cub#382: Fix several `-Wconversion` warnings. Thanks to Matt Stack (@matt-stack) for this contribution. - NVIDIA/cub#388: Fix debug assertion on MSVC when using `cub::CachingDeviceAllocator`. - NVIDIA/cub#395: Support building with `__CUDA_NO_HALF_CONVERSIONS__`. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. # CUB 1.14.0 (NVIDIA HPC SDK 21.9) ## Summary CUB 1.14.0 is a major release accompanying the NVIDIA HPC SDK 21.9. This release provides the often-requested merge sort algorithm, ported from the `thrust::sort` implementation. Merge sort provides more flexibility than the existing radix sort by supporting arbitrary data types and comparators, though radix sorting is still faster for supported inputs. This functionality is provided through the new `cub::DeviceMergeSort` and `cub::BlockMergeSort` algorithms. The namespace wrapping mechanism has been overhauled for 1.14. The existing macros (`CUB_NS_PREFIX`/`CUB_NS_POSTFIX`) can now be replaced by a single macro, `CUB_WRAPPED_NAMESPACE`, which is set to the name of the desired wrapped namespace. Defining a similar `THRUST_CUB_WRAPPED_NAMESPACE` macro will embed both `thrust::` and `cub::` symbols in the same external namespace. The prefix/postfix macros are still supported, but now require a new `CUB_NS_QUALIFIER` macro to be defined, which provides the fully qualified CUB namespace (e.g. `::foo::cub`). See `cub/util_namespace.cuh` for details. ## Breaking Changes - NVIDIA/cub#350: When the `CUB_NS_[PRE|POST]FIX` macros are set, `CUB_NS_QUALIFIER` must also be defined to the fully qualified CUB namespace (e.g. `#define CUB_NS_QUALIFIER ::foo::cub`). Note that this is handled automatically when using the new `[THRUST_]CUB_WRAPPED_NAMESPACE` mechanism. ## New Features - NVIDIA/cub#322: Ported the merge sort algorithm from Thrust: `cub::BlockMergeSort` and `cub::DeviceMergeSort` are now available. - NVIDIA/cub#326: Simplify the namespace wrapper macros, and detect when Thrust's symbols are in a wrapped namespace. ## Bug Fixes - NVIDIA/cub#160, NVIDIA/cub#163, NVIDIA/cub#352: Fixed several bugs in `cub::DeviceSpmv` and added basic tests for this algorithm. Thanks to James Wyles and Seunghwa Kang for their contributions. - NVIDIA/cub#328: Fixed error handling bug and incorrect debugging output in `cub::CachingDeviceAllocator`. Thanks to Felix Kallenborn for this contribution. - NVIDIA/cub#335: Fixed a compile error affecting clang and NVRTC. Thanks to Jiading Guo for this contribution. - NVIDIA/cub#351: Fixed some errors in the `cub::DeviceHistogram` documentation. ## Enhancements - NVIDIA/cub#348: Add an example that demonstrates how to use dynamic shared memory with a CUB block algorithm. Thanks to Matthias Jouanneaux for this contribution. # CUB 1.13.1 (CUDA Toolkit 11.5) CUB 1.13.1 is a minor release accompanying the CUDA Toolkit 11.5. This release provides a new hook for embedding the `cub::` namespace inside a custom namespace. This is intended to work around various issues related to linking multiple shared libraries that use CUB. 
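As a hedged illustration of the new hook (the macro and its behavior are described in more detail below; the namespace name `libA_detail` is just an example), a library can compile its own copy of CUB inside a private namespace:

```cpp
// Must be defined before any Thrust/CUB header is included, e.g. on the
// compiler command line: -DTHRUST_CUB_WRAPPED_NAMESPACE=libA_detail
#define THRUST_CUB_WRAPPED_NAMESPACE libA_detail
#include <cub/cub.cuh>

// CUB symbols are now nested inside the wrapped namespace instead of ::cub.
using BlockReduceT = libA_detail::cub::BlockReduce<int, 128>;
```

Thrust symbols are wrapped the same way, so `thrust::` becomes `libA_detail::thrust::` for code built with this definition.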
The existing `CUB_NS_PREFIX` and `CUB_NS_POSTFIX` macros already provided this capability; this update provides a simpler mechanism that is extended to and integrated with Thrust. Simply define `THRUST_CUB_WRAPPED_NAMESPACE` to a namespace name, and both `thrust::` and `cub::` will be placed inside the new namespace. Using different wrapped namespaces for each shared library will prevent issues like those reported in NVIDIA/thrust#1401. ## New Features - NVIDIA/cub#326: Add `THRUST_CUB_WRAPPED_NAMESPACE` hooks. # CUB 1.13.0 (NVIDIA HPC SDK 21.7) CUB 1.13.0 is the major release accompanying the NVIDIA HPC SDK 21.7 release. Notable new features include support for striped data arrangements in block load/store utilities, `bfloat16` radix sort support, and fewer restrictions on offset iterators in segmented device algorithms. Several bugs in `cub::BlockShuffle`, `cub::BlockDiscontinuity`, and `cub::DeviceHistogram` have been addressed. The amount of code generated in `cub::DeviceScan` has been greatly reduced, leading to significant compile-time improvements when targeting multiple PTX architectures. This release also includes several user-contributed documentation fixes that will be reflected in CUB's online documentation in the coming weeks. ## Breaking Changes - NVIDIA/cub#320: Deprecated `cub::TexRefInputIterator`. Use `cub::TexObjInputIterator` as a replacement. ## New Features - NVIDIA/cub#274: Add `BLOCK_LOAD_STRIPED` and `BLOCK_STORE_STRIPED` functionality to `cub::BlockLoadAlgorithm` and `cub::BlockStoreAlgorithm`. Thanks to Matthew Nicely (@mnicely) for this contribution. - NVIDIA/cub#291: `cub::DeviceSegmentedRadixSort` and `cub::DeviceSegmentedReduce` now support different types for begin/end offset iterators. Thanks to Sergey Pavlov (@psvvsp) for this contribution. - NVIDIA/cub#306: Add `bfloat16` support to `cub::DeviceRadixSort`. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. - NVIDIA/cub#320: Introduce a new `CUB_IGNORE_DEPRECATED_API` macro that disables deprecation warnings on Thrust and CUB APIs. ## Bug Fixes - NVIDIA/cub#277: Fixed sanitizer warnings in `RadixSortScanBinsKernels`. Thanks to Andy Adinets (@canonizer) for this contribution. - NVIDIA/cub#287: `cub::DeviceHistogram` now correctly handles cases where `OffsetT` is not an `int`. Thanks to Dominique LaSalle (@nv-dlasalle) for this contribution. - NVIDIA/cub#311: Fixed several bugs and added tests for the `cub::BlockShuffle` collective operations. - NVIDIA/cub#312: Eliminate unnecessary kernel instantiations when compiling `cub::DeviceScan`. Thanks to Elias Stehle (@elstehle) for this contribution. - NVIDIA/cub#319: Fixed out-of-bounds memory access on debugging builds of `cub::BlockDiscontinuity::FlagHeadsAndTails`. - NVIDIA/cub#320: Fixed harmless missing return statement warning in unreachable `cub::TexObjInputIterator` code path. ## Other Enhancements - Several documentation fixes are included in this release. - NVIDIA/cub#275: Fixed comments describing the `cub::If` and `cub::Equals` utilities. Thanks to Rukshan Jayasekara (@rukshan99) for this contribution. - NVIDIA/cub#290: Documented that `cub::DeviceSegmentedReduce` will produce consistent results run-to-run on the same device for pseudo-associated reduction operators. Thanks to Himanshu (@himanshu007-creator) for this contribution. - NVIDIA/cub#298: `CONTRIBUTING.md` now refers to Thrust's build instructions for developer builds, which is the preferred way to build the CUB test harness. Thanks to Xiang Gao (@zasdfgbnm) for contributing. 
- NVIDIA/cub#301: Expand `cub::DeviceScan` documentation to include in-place support and add tests. Thanks to Xiang Gao (@zasdfgbnm) for this contribution. - NVIDIA/cub#307: Expand `cub::DeviceRadixSort` and `cub::BlockRadixSort` documentation to clarify stability, in-place support, and type-specific bitwise transformations. Thanks to Himanshu (@himanshu007-creator) for contributing. - NVIDIA/cub#316: Move `WARP_TIME_SLICING` documentation to the correct location. Thanks to Peter Han (@peter9606) for this contribution. - NVIDIA/cub#321: Update URLs from deprecated github.com to preferred github.io. Thanks to Lilo Huang (@lilohuang) for this contribution. # CUB 1.12.1 (CUDA Toolkit 11.4) CUB 1.12.1 is a trivial patch release that slightly changes the phrasing of a deprecation message. # CUB 1.12.0 (NVIDIA HPC SDK 21.3) ## Summary CUB 1.12.0 is a bugfix release accompanying the NVIDIA HPC SDK 21.3 and the CUDA Toolkit 11.4. Radix sort is now stable when both +0.0 and -0.0 are present in the input (they are treated as equivalent). Many compilation warnings and subtle overflow bugs were fixed in the device algorithms, including a long-standing bug that returned invalid temporary storage requirements when `num_items` was close to (but not exceeding) `INT32_MAX`. Support for Clang < 7.0 and MSVC < 2019 (aka 19.20/16.0/14.20) is now deprecated. ## Breaking Changes - NVIDIA/cub#256: Deprecate Clang < 7 and MSVC < 2019. ## New Features - NVIDIA/cub#218: Radix sort now treats -0.0 and +0.0 as equivalent for floating point types, which is required for the sort to be stable. Thanks to Andy Adinets for this contribution. ## Bug Fixes - NVIDIA/cub#247: Suppress newly triggered warnings in Clang. Thanks to Andrew Corrigan for this contribution. - NVIDIA/cub#249: Enable stricter warning flags. This fixes a number of outstanding issues: - NVIDIA/cub#221: Overflow in `temp_storage_bytes` when `num_items` close to (but not over) `INT32_MAX`. - NVIDIA/cub#228: CUB uses non-standard C++ extensions that break strict compilers. - NVIDIA/cub#257: Warning when compiling `GridEvenShare` with unsigned offsets. - NVIDIA/cub#258: Use correct `OffsetT` in `DispatchRadixSort::InitPassConfig`. Thanks to Felix Kallenborn for this contribution. - NVIDIA/cub#259: Remove some problematic `__forceinline__` annotations. ## Other Enhancements - NVIDIA/cub#123: Fix incorrect issue number in changelog. Thanks to Peet Whittaker for this contribution. # CUB 1.11.0 (CUDA Toolkit 11.3) ## Summary CUB 1.11.0 is a major release accompanying the CUDA Toolkit 11.3 release, providing bugfixes and performance enhancements. It includes a new `DeviceRadixSort` backend that improves performance by up to 2x on supported keys and hardware. Our CMake package and build system continue to see improvements with `add_subdirectory` support, installation rules, status messages, and other features that make CUB easier to use from CMake projects. The release includes several other bugfixes and modernizations, and received updates from 11 contributors. ## Breaking Changes - NVIDIA/cub#201: The intermediate accumulator type used when `DeviceScan` is invoked with different input/output types is now consistent with [P0571](https://wg21.link/P0571). This may produce different results for some edge cases when compared with earlier releases of CUB. ## New Features - NVIDIA/cub#204: Faster `DeviceRadixSort`, up to 2x performance increase for 32/64-bit keys on Pascal and up (SM60+). Thanks to Andy Adinets for this contribution. 
- Unroll loops in `BlockRadixRank` to improve performance for 32-bit keys by 1.5-2x on Clang CUDA. Thanks to Justin Lebar for this contribution. - NVIDIA/cub#200: Allow CUB to be added to CMake projects via `add_subdirectory`. - NVIDIA/cub#214: Optionally add install rules when included with CMake's `add_subdirectory`. Thanks to Kai Germaschewski for this contribution. ## Bug Fixes - NVIDIA/cub#215: Fix integer truncation in `AgentReduceByKey`, `AgentScan`, and `AgentSegmentFixup`. Thanks to Rory Mitchell for this contribution. - NVIDIA/cub#225: Fix compile-time regression when defining `CUB_NS_PREFIX` /`CUB_NS_POSTFIX` macro. Thanks to Elias Stehle for this contribution. - NVIDIA/cub#210: Fix some edge cases in `DeviceScan`: - Use values from the input when padding temporary buffers. This prevents custom functors from getting unexpected values. - Prevent integer truncation when using large indices via the `DispatchScan` layer. - Use timesliced reads/writes for types > 128 bytes. - NVIDIA/cub#217: Fix and add test for cmake package install rules. Thanks to Keith Kraus and Kai Germaschewski for testing and discussion. - NVIDIA/cub#170, NVIDIA/cub#233: Update CUDA version checks to behave on Clang CUDA and `nvc++`. Thanks to Artem Belevich, Andrew Corrigan, and David Olsen for these contributions. - NVIDIA/cub#220, NVIDIA/cub#216: Various fixes for Clang CUDA. Thanks to Andrew Corrigan for these contributions. - NVIDIA/cub#231: Fix signedness mismatch warnings in unit tests. - NVIDIA/cub#231: Suppress GPU deprecation warnings. - NVIDIA/cub#214: Use semantic versioning rules for our CMake package's compatibility checks. Thanks to Kai Germaschewski for this contribution. - NVIDIA/cub#214: Use `FindPackageHandleStandardArgs` to print standard status messages when our CMake package is found. Thanks to Kai Germaschewski for this contribution. - NVIDIA/cub#207: Fix `CubDebug` usage in `CachingDeviceAllocator::DeviceAllocate`. Thanks to Andreas Hehn for this contribution. - Fix documentation for `DevicePartition`. Thanks to ByteHamster for this contribution. - Clean up unused code in `DispatchScan`. Thanks to ByteHamster for this contribution. ## Other Enhancements - NVIDIA/cub#213: Remove tuning policies for unsupported hardware (`: `CUB_VERSION`, `CUB_VERSION_MAJOR`, `CUB_VERSION_MINOR`, `CUB_VERSION_SUBMINOR`, and `CUB_PATCH_NUMBER`. - Platform detection machinery: - ``: Detects the C++ standard dialect. - ``: host and device compiler detection. - ``: `CUB_DEPRECATED`. - `: Includes ``, ``, ``, ``, ``, `` - `cub::DeviceCount` and `cub::DeviceCountUncached`, caching abstractions for `cudaGetDeviceCount`. ## Other Enhancements - Lazily initialize the per-device CUDAattribute caches, because CUDA context creation is expensive and adds up with large CUDA binaries on machines with many GPUs. Thanks to the NVIDIA PyTorch team for bringing this to our attention. - Make `cub::SwitchDevice` avoid setting/resetting the device if the current device is the same as the target device. ## Bug Fixes - Add explicit failure parameter to CAS in the CUB attribute cache to workaround a GCC 4.8 bug. - Revert a change in reductions that changed the signedness of the `lane_id` variable to suppress a warning, as this introduces a bug in optimized device code. - Fix initialization in `cub::ExclusiveSum`. Thanks to Conor Hoekstra for this contribution. - Fix initialization of the `std::array` in the CUB attribute cache. - Fix `-Wsign-compare` warnings. Thanks to Elias Stehle for this contribution. 
- Fix `test_block_reduce.cu` to build without parameters. Thanks to Francis Lemaire for this contribution. - Add missing includes to `grid_even_share.cuh`. Thanks to Francis Lemaire for this contribution. - Add missing includes to `thread_search.cuh`. Thanks to Francis Lemaire for this contribution. - Add missing includes to `cub.cuh`. Thanks to Felix Kallenborn for this contribution. # CUB 1.9.8-1 (NVIDIA HPC SDK 20.3) ## Summary CUB 1.9.8-1 is a variant of 1.9.8 accompanying the NVIDIA HPC SDK 20.3 release. It contains modifications necessary to serve as the implementation of NVC++'s GPU-accelerated C++17 Parallel Algorithms. # CUB 1.9.8 (CUDA 11.0 Early Access) ## Summary CUB 1.9.8 is the first release of CUB to be officially supported and included in the CUDA Toolkit. When compiling CUB in C++11 mode, CUB now caches calls to CUDA attribute query APIs, which improves performance of these queries by 20x to 50x when they are called concurrently by multiple host threads. ## Enhancements - (C++11 or later) Cache calls to `cudaFuncGetAttributes` and `cudaDeviceGetAttribute` within `cub::PtxVersion` and `cub::SmVersion`. These CUDA APIs acquire locks to CUDA driver/runtime mutex and perform poorly under contention; with the caching, they are 20 to 50x faster when called concurrently. Thanks to Bilge Acun for bringing this issue to our attention. - `DispatchReduce` now takes an `OutputT` template parameter so that users can specify the intermediate type explicitly. - Radix sort tuning policies updates to fix performance issues for element types smaller than 4 bytes. ## Bug Fixes - Change initialization style from copy initialization to direct initialization (which is more permissive) in `AgentReduce` to allow a wider range of types to be used with it. - Fix bad signed/unsigned comparisons in `WarpReduce`. - Fix computation of valid lanes in warp-level reduction primitive to correctly handle the case where there are 0 input items per warp. # CUB 1.8.0 ## Summary CUB 1.8.0 introduces changes to the `cub::Shuffle*` interfaces. ## Breaking Changes - The interfaces of `cub::ShuffleIndex`, `cub::ShuffleUp`, and `cub::ShuffleDown` have been changed to allow for better computation of the PTX SHFL control constant for logical warps smaller than 32 threads. ## Bug Fixes - #112: Fix `cub::WarpScan`'s broadcast of warp-wide aggregate for logical warps smaller than 32 threads. # CUB 1.7.5 ## Summary CUB 1.7.5 adds support for radix sorting `__half` keys and improved sorting performance for 1 byte keys. It was incorporated into Thrust 1.9.2. ## Enhancements - Radix sort support for `__half` keys. - Radix sort tuning policy updates to improve 1 byte key performance. ## Bug Fixes - Syntax tweaks to mollify Clang. - #127: `cub::DeviceRunLengthEncode::Encode` returns incorrect results. - #128: 7-bit sorting passes fail for SM61 with large values. # CUB 1.7.4 ## Summary CUB 1.7.4 is a minor release that was incorporated into Thrust 1.9.1-2. ## Bug Fixes - #114: Can't pair non-trivially-constructible values in radix sort. - #115: `cub::WarpReduce` segmented reduction is broken in CUDA 9 for logical warp sizes smaller than 32. # CUB 1.7.3 ## Summary CUB 1.7.3 is a minor release. ## Bug Fixes - #110: `cub::DeviceHistogram` null-pointer exception bug for iterator inputs. # CUB 1.7.2 ## Summary CUB 1.7.2 is a minor release. ## Bug Fixes - #108: Device-wide reduction is now "run-to-run" deterministic for pseudo-associative reduction operators (like floating point addition). 
# CUB 1.7.1 ## Summary CUB 1.7.1 delivers improved radix sort performance on SM7x (Volta) GPUs and a number of bug fixes. ## Enhancements - Radix sort tuning policies updated for SM7x (Volta). ## Bug Fixes - #104: `uint64_t` `cub::WarpReduce` broken for CUB 1.7.0 on CUDA 8 and older. - #103: Can't mix Thrust from CUDA 9.0 and CUB. - #102: CUB pulls in `windows.h` which defines `min`/`max` macros that conflict with `std::min`/`std::max`. - #99: Radix sorting crashes NVCC on Windows 10 for SM52. - #98: cuda-memcheck: --tool initcheck failed with lineOfSight. - #94: Git clone size. - #93: Accept iterators for segment offsets. - #87: CUB uses anonymous unions which is not valid C++. - #44: Check for C++11 is incorrect for Visual Studio 2013. # CUB 1.7.0 ## Summary CUB 1.7.0 brings support for CUDA 9.0 and SM7x (Volta) GPUs. It is compatible with independent thread scheduling. It was incorporated into Thrust 1.9.0-5. ## Breaking Changes - Remove `cub::WarpAll` and `cub::WarpAny`. These functions served to emulate `__all` and `__any` functionality for SM1x devices, which did not have those operations. However, SM1x devices are now deprecated in CUDA, and the interfaces of these two functions are now lacking the lane-mask needed for collectives to run on SM7x and newer GPUs which have independent thread scheduling. ## Other Enhancements - Remove any assumptions of implicit warp synchronization to be compatible with SM7x's (Volta) independent thread scheduling. ## Bug Fixes - #86: Incorrect results with reduce-by-key. # CUB 1.6.4 ## Summary CUB 1.6.4 improves radix sorting performance for SM5x (Maxwell) and SM6x (Pascal) GPUs. ## Enhancements - Radix sort tuning policies updated for SM5x (Maxwell) and SM6x (Pascal) - 3.5B and 3.4B 32 byte keys/s on TitanX and GTX 1080, respectively. ## Bug Fixes - Restore fence work-around for scan (reduce-by-key, etc.) hangs in CUDA 8.5. - #65: `cub::DeviceSegmentedRadixSort` should allow inputs to have pointer-to-const type. - Mollify Clang device-side warnings. - Remove out-dated MSVC project files. # CUB 1.6.3 ## Summary CUB 1.6.3 improves support for Windows, changes `cub::BlockLoad`/`cub::BlockStore` interface to take the local data type, and enhances radix sort performance for SM6x (Pascal) GPUs. ## Breaking Changes - `cub::BlockLoad` and `cub::BlockStore` are now templated by the local data type, instead of the `Iterator` type. This allows for output iterators having `void` as their `value_type` (e.g. discard iterators). ## Other Enhancements - Radix sort tuning policies updated for SM6x (Pascal) GPUs - 6.2B 4 byte keys/s on GP100. - Improved support for Windows (warnings, alignment, etc). ## Bug Fixes - #74: `cub::WarpReduce` executes reduction operator for out-of-bounds items. - #72: `cub:InequalityWrapper::operator` should be non-const. - #71: `cub::KeyValuePair` won't work if `Key` has non-trivial constructor. - #69: cub::BlockStore::Store` doesn't compile if `OutputIteratorT::value_type` isn't `T`. - #68: `cub::TilePrefixCallbackOp::WarpReduce` doesn't permit PTX arch specialization. # CUB 1.6.2 (previously 1.5.5) ## Summary CUB 1.6.2 (previously 1.5.5) improves radix sort performance for SM6x (Pascal) GPUs. ## Enhancements - Radix sort tuning policies updated for SM6x (Pascal) GPUs. ## Bug Fixes - Fix AArch64 compilation of `cub::CachingDeviceAllocator`. # CUB 1.6.1 (previously 1.5.4) ## Summary CUB 1.6.1 (previously 1.5.4) is a minor release. ## Bug Fixes - Fix radix sorting bug introduced by scan refactorization. 
# CUB 1.6.0 (previously 1.5.3) ## Summary CUB 1.6.0 changes the scan and reduce interfaces. Exclusive scans now accept an "initial value" instead of an "identity value". Scans and reductions now support differing input and output sequence types. Additionally, many bugs have been fixed. ## Breaking Changes - Device/block/warp-wide exclusive scans have been revised to now accept an "initial value" (instead of an "identity value") for seeding the computation with an arbitrary prefix. - Device-wide reductions and scans can now have input sequence types that are different from output sequence types (as long as they are convertible). ## Other Enhancements - Reduce repository size by moving the doxygen binary to doc repository. - Minor reduction in `cub::BlockScan` instruction counts. ## Bug Fixes - Issue #55: Warning in `cub/device/dispatch/dispatch_reduce_by_key.cuh`. - Issue #59: `cub::DeviceScan::ExclusiveSum` can't prefix sum of float into double. - Issue #58: Infinite loop in `cub::CachingDeviceAllocator::NearestPowerOf`. - Issue #47: `cub::CachingDeviceAllocator` needs to clean up CUDA global error state upon successful retry. - Issue #46: Very high amount of needed memory from the `cub::DeviceHistogram::HistogramEven`. - Issue #45: `cub::CachingDeviceAllocator` fails with debug output enabled # CUB 1.5.2 ## Summary CUB 1.5.2 enhances `cub::CachingDeviceAllocator` and improves scan performance for SM5x (Maxwell). ## Enhancements - Improved medium-size scan performance on SM5x (Maxwell). - Refactored `cub::CachingDeviceAllocator`: - Now spends less time locked. - Uses C++11's `std::mutex` when available. - Failure to allocate a block from the runtime will retry once after freeing cached allocations. - Now respects max-bin, fixing an issue where blocks in excess of max-bin were still being retained in the free cache. ## Bug fixes: - Fix for generic-type reduce-by-key `cub::WarpScan` for SM3x and newer GPUs. # CUB 1.5.1 ## Summary CUB 1.5.1 is a minor release. ## Bug Fixes - Fix for incorrect `cub::DeviceRadixSort` output for some small problems on SM52 (Mawell) GPUs. - Fix for macro redefinition warnings when compiling `thrust::sort`. # CUB 1.5.0 CUB 1.5.0 introduces segmented sort and reduction primitives. ## New Features: - Segmented device-wide operations for device-wide sort and reduction primitives. ## Bug Fixes: - #36: `cub::ThreadLoad` generates compiler errors when loading from pointer-to-const. - #29: `cub::DeviceRadixSort::SortKeys` yields compiler errors. - #26: Misaligned address after `cub::DeviceRadixSort::SortKeys`. - #25: Fix for incorrect results and crashes when radix sorting 0-length problems. - Fix CUDA 7.5 issues on SM52 GPUs with SHFL-based warp-scan and warp-reduction on non-primitive data types (e.g. user-defined structs). - Fix small radix sorting problems where 0 temporary bytes were required and users code was invoking `malloc(0)` on some systems where that returns `NULL`. CUB assumed the user was asking for the size again and not running the sort. # CUB 1.4.1 ## Summary CUB 1.4.1 is a minor release. ## Enhancements - Allow `cub::DeviceRadixSort` and `cub::BlockRadixSort` on bool types. ## Bug Fixes - Fix minor CUDA 7.0 performance regressions in `cub::DeviceScan` and `cub::DeviceReduceByKey`. - Remove requirement for callers to define the `CUB_CDP` macro when invoking CUB device-wide rountines using CUDA dynamic parallelism. - Fix headers not being included in the proper order (or missing includes) for some block-wide functions. 
# CUB 1.4.0 ## Summary CUB 1.4.0 adds `cub::DeviceSpmv`, `cub::DeviceRunLength::NonTrivialRuns`, improves `cub::DeviceHistogram`, and introduces support for SM5x (Maxwell) GPUs. ## New Features: - `cub::DeviceSpmv` methods for multiplying sparse matrices by dense vectors, load-balanced using a merge-based parallel decomposition. - `cub::DeviceRadixSort` sorting entry-points that always return the sorted output into the specified buffer, as opposed to the `cub::DoubleBuffer` in which it could end up in either buffer. - `cub::DeviceRunLengthEncode::NonTrivialRuns` for finding the starting offsets and lengths of all non-trivial runs (i.e., length > 1) of keys in a given sequence. Useful for top-down partitioning algorithms like MSD sorting of very-large keys. ## Other Enhancements - Support and performance tuning for SM5x (Maxwell) GPUs. - Updated cub::DeviceHistogram implementation that provides the same "histogram-even" and "histogram-range" functionality as IPP/NPP. Provides extremely fast and, perhaps more importantly, very uniform performance response across diverse real-world datasets, including pathological (homogeneous) sample distributions. # CUB 1.3.2 ## Summary CUB 1.3.2 is a minor release. ## Bug Fixes - Fix `cub::DeviceReduce` where reductions of small problems (small enough to only dispatch a single thread block) would run in the default stream (stream zero) regardless of whether an alternate stream was specified. # CUB 1.3.1 ## Summary CUB 1.3.1 is a minor release. ## Bug Fixes - Workaround for a benign WAW race warning reported by cuda-memcheck in `cub::BlockScan` specialized for `BLOCK_SCAN_WARP_SCANS` algorithm. - Fix bug in `cub::DeviceRadixSort` where the algorithm may sort more key bits than the caller specified (up to the nearest radix digit). - Fix for ~3% `cub::DeviceRadixSort` performance regression on SM2x (Fermi) and SM3x (Kepler) GPUs. # CUB 1.3.0 ## Summary CUB 1.3.0 improves how thread blocks are expressed in block- and warp-wide primitives and adds an enhanced version of `cub::WarpScan`. ## Breaking Changes - CUB's collective (block-wide, warp-wide) primitives underwent a minor interface refactoring: - To provide the appropriate support for multidimensional thread blocks, The interfaces for collective classes are now template-parameterized by X, Y, and Z block dimensions (with `BLOCK_DIM_Y` and `BLOCK_DIM_Z` being optional, and `BLOCK_DIM_X` replacing `BLOCK_THREADS`). Furthermore, the constructors that accept remapped linear thread-identifiers have been removed: all primitives now assume a row-major thread-ranking for multidimensional thread blocks. - To allow the host program (compiled by the host-pass) to accurately determine the device-specific storage requirements for a given collective (compiled for each device-pass), the interfaces for collective classes are now (optionally) template-parameterized by the desired PTX compute capability. This is useful when aliasing collective storage to shared memory that has been allocated dynamically by the host at the kernel call site. - Most CUB programs having typical 1D usage should not require any changes to accomodate these updates. ## New Features - Added "combination" `cub::WarpScan` methods for efficiently computing both inclusive and exclusive prefix scans (and sums). ## Bug Fixes - Fix for bug in `cub::WarpScan` (which affected `cub::BlockScan` and `cub::DeviceScan`) where incorrect results (e.g., NAN) would often be returned when parameterized for floating-point types (fp32, fp64). 
- Workaround for ptxas error when compiling with with -G flag on Linux (for debug instrumentation). - Fixes for certain scan scenarios using custom scan operators where code compiled for SM1x is run on newer GPUs of higher compute-capability: the compiler could not tell which memory space was being used collective operations and was mistakenly using global ops instead of shared ops. # CUB 1.2.3 ## Summary CUB 1.2.3 is a minor release. ## Bug Fixes - Fixed access violation bug in `cub::DeviceReduce::ReduceByKey` for non-primitive value types. - Fixed code-snippet bug in `ArgIndexInputIteratorT` documentation. # CUB 1.2.2 ## Summary CUB 1.2.2 adds a new variant of `cub::BlockReduce` and MSVC project solections for examples. ## New Features - MSVC project solutions for device-wide and block-wide examples - New algorithmic variant of cub::BlockReduce for improved performance when using commutative operators (e.g., numeric addition). ## Bug Fixes - Inclusion of Thrust headers in a certain order prevented CUB device-wide primitives from working properly. # CUB 1.2.0 ## Summary CUB 1.2.0 adds `cub::DeviceReduce::ReduceByKey` and `cub::DeviceReduce::RunLengthEncode` and support for CUDA 6.0. ## New Features - `cub::DeviceReduce::ReduceByKey`. - `cub::DeviceReduce::RunLengthEncode`. ## Other Enhancements - Improved `cub::DeviceScan`, `cub::DeviceSelect`, `cub::DevicePartition` performance. - Documentation and testing: - Added performance-portability plots for many device-wide primitives. - Explain that iterator (in)compatibilities with CUDA 5.0 (and older) and Thrust 1.6 (and older). - Revised the operation of temporary tile status bookkeeping for `cub::DeviceScan` (and similar) to be safe for current code run on future platforms (now uses proper fences). ## Bug Fixes - Fix `cub::DeviceScan` bug where Windows alignment disagreements between host and device regarding user-defined data types would corrupt tile status. - Fix `cub::BlockScan` bug where certain exclusive scans on custom data types for the `BLOCK_SCAN_WARP_SCANS` variant would return incorrect results for the first thread in the block. - Added workaround to make `cub::TexRefInputIteratorT` work with CUDA 6.0. # CUB 1.1.1 ## Summary CUB 1.1.1 introduces texture and cache modifier iterators, descending sorting, `cub::DeviceSelect`, `cub::DevicePartition`, `cub::Shuffle*`, and `cub::MaxSMOccupancy`. Additionally, scan and sort performance for older GPUs has been improved and many bugs have been fixed. ## Breaking Changes - Refactored block-wide I/O (`cub::BlockLoad` and `cub::BlockStore`), removing cache-modifiers from their interfaces. `cub::CacheModifiedInputIterator` and `cub::CacheModifiedOutputIterator` should now be used with `cub::BlockLoad` and `cub::BlockStore` to effect that behavior. ## New Features - `cub::TexObjInputIterator`, `cub::TexRefInputIterator`, `cub::CacheModifiedInputIterator`, and `cub::CacheModifiedOutputIterator` types for loading & storing arbitrary types through the cache hierarchy. They are compatible with Thrust. - Descending sorting for `cub::DeviceRadixSort` and `cub::BlockRadixSort`. - Min, max, arg-min, and arg-max operators for `cub::DeviceReduce`. - `cub::DeviceSelect` (select-unique, select-if, and select-flagged). - `cub::DevicePartition` (partition-if, partition-flagged). - Generic `cub::ShuffleUp`, `cub::ShuffleDown`, and `cub::ShuffleIndex` for warp-wide communication of arbitrary data types (SM3x and up). 
- `cub::MaxSmOccupancy` for accurately determining SM occupancy for any given kernel function pointer. ## Other Enhancements - Improved `cub::DeviceScan` and `cub::DeviceRadixSort` performance for older GPUs (SM1x to SM3x). - Renamed device-wide `stream_synchronous` param to `debug_synchronous` to avoid confusion about usage. - Documentation improvements: - Added simple examples of device-wide methods. - Improved doxygen documentation and example snippets. - Improved test coverege to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform). ## Bug Fixes - Fix misc `cub::DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM1x. - SHFL-based scans and reductions produced incorrect results for multi-word types (size > 4B) on Linux. - For `cub::WarpScan`-based scans, not all threads in the first warp were entering the prefix callback functor. - `cub::DeviceRadixSort` had a race condition with key-value pairs for pre-SM35 architectures. - `cub::DeviceRadixSor` bitfield-extract behavior with long keys on 64-bit Linux was incorrect. - `cub::BlockDiscontinuity` failed to compile for types other than `int32_t`/`uint32_t`. - CUDA Dynamic Parallelism (CDP, e.g. device-callable) versions of device-wide methods now report the same temporary storage allocation size requirement as their host-callable counterparts. # CUB 1.0.2 ## Summary CUB 1.0.2 is a minor release. ## Bug Fixes - Corrections to code snippet examples for `cub::BlockLoad`, `cub::BlockStore`, and `cub::BlockDiscontinuity`. - Cleaned up unnecessary/missing header includes. You can now safely include a specific .cuh (instead of `cub.cuh`). - Bug/compilation fixes for `cub::BlockHistogram`. # CUB 1.0.1 ## Summary CUB 1.0.1 adds `cub::DeviceRadixSort` and `cub::DeviceScan`. Numerous other performance and correctness fixes and included. ## Breaking Changes - New collective interface idiom (specialize/construct/invoke). ## New Features - `cub::DeviceRadixSort`. Implements short-circuiting for homogenous digit passes. - `cub::DeviceScan`. Implements single-pass "adaptive-lookback" strategy. ## Other Enhancements - Significantly improved documentation (with example code snippets). - More extensive regression test suit for aggressively testing collective variants. - Allow non-trially-constructed types (previously unions had prevented aliasing temporary storage of those types). - Improved support for SM3x SHFL (collective ops now use SHFL for types larger than 32 bits). - Better code generation for 64-bit addressing within `cub::BlockLoad`/`cub::BlockStore`. - `cub::DeviceHistogram` now supports histograms of arbitrary bins. - Updates to accommodate CUDA 5.5 dynamic parallelism. ## Bug Fixes - Workarounds for SM10 codegen issues in uncommonly-used `cub::WarpScan`/`cub::WarpReduce` specializations. # CUB 0.9.4 ## Summary CUB 0.9.3 is a minor release. ## Enhancements - Various documentation updates and corrections. ## Bug Fixes - Fixed compilation errors for SM1x. - Fixed compilation errors for some WarpScan entrypoints on SM3x and up. # CUB 0.9.3 ## Summary CUB 0.9.3 adds histogram algorithms and work management utility descriptors. ## New Features - `cub::DevicHistogram256`. - `cub::BlockHistogram256`. - `cub::BlockScan` algorithm variant `BLOCK_SCAN_RAKING_MEMOIZE`, which trades more register consumption for less shared memory I/O. - `cub::GridQueue`, `cub::GridEvenShare`, work management utility descriptors. 
## Other Enhancements - Updates to `cub::BlockRadixRank` to use `cub::BlockScan`, which improves performance on SM3x by using SHFL. - Allow types other than builtin types to be used in `cub::WarpScan::*Sum` methods if they only have `operator+` overloaded. Previously they also required to support assignment from `int(0)`. - Update `cub::BlockReduce`'s `BLOCK_REDUCE_WARP_REDUCTIONS` algorithm to work even when block size is not an even multiple of warp size. - Refactoring of `cub::DeviceAllocator` interface and `cub::CachingDeviceAllocator` implementation. # CUB 0.9.2 ## Summary CUB 0.9.2 adds `cub::WarpReduce`. ## New Features - `cub::WarpReduce`, which uses the SHFL instruction when applicable. `cub::BlockReduce` now uses this `cub::WarpReduce` instead of implementing its own. ## Enhancements - Documentation updates and corrections. ## Bug Fixes - Fixes for 64-bit Linux compilation warnings and errors. # CUB 0.9.1 ## Summary CUB 0.9.1 is a minor release. ## Bug Fixes - Fix for ambiguity in `cub::BlockScan::Reduce` between generic reduction and summation. Summation entrypoints are now called `::Sum()`, similar to the convention in `cub::BlockScan`. - Small edits to documentation and download tracking. # CUB 0.9.0 ## Summary Initial preview release. CUB is the first durable, high-performance library of cooperative block-level, warp-level, and thread-level primitives for CUDA kernel programming. cccl-2.5.0/cub/CMakeLists.txt000066400000000000000000000070741463375617100157600ustar00rootroot00000000000000# 3.15 is the minimum for including the project with add_subdirectory. # 3.21 is the minimum for the developer build. # 3.27.5 is the minimum for MSVC build with RDC=true. cmake_minimum_required(VERSION 3.15) # CXX is only needed for AppendOptionIfAvailable. project(CUB LANGUAGES NONE) # Determine whether CUB is the top-level project or included into # another project via add_subdirectory(). if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") set(CUB_TOPLEVEL_PROJECT ON) endif () # This must be done before any languages are enabled: if (CUB_TOPLEVEL_PROJECT) cmake_minimum_required(VERSION 3.21) endif() # This must appear after our Compiler Hacks or else CMake will delete the cache # and reconfigure from scratch. # This must also appear before the installation rules, as it is required by the # GNUInstallDirs CMake module. enable_language(CXX) option(CUB_ENABLE_INSTALL_RULES "Enable installation of CUB" ${CUB_TOPLEVEL_PROJECT}) if (CUB_ENABLE_INSTALL_RULES) include(cmake/CubInstallRules.cmake) endif() # Support adding CUB to a parent project via add_subdirectory. # See examples/cmake/add_subdir/CMakeLists.txt for details. if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST) include(cmake/CubAddSubdir.cmake) return() endif() option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) option(CUB_ENABLE_BENCHMARKS "Build CUB benchmarking suite." "${CCCL_ENABLE_BENCHMARKS}") option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF) option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) # This is needed for NVCXX QA, which requires a static set of executable names. # Only a single dialect may be enabled when this is off. option(CUB_ENABLE_CPP_DIALECT_IN_NAMES "Include C++ dialect information in target/object/etc names." ON ) mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES) # This option is only used when CUB is built stand-alone; otherwise the Thrust # option has the same effect. 
if (NOT CUB_IN_THRUST) option(CUB_IGNORE_DEPRECATED_API "Suppress warnings about deprecated Thrust/CUB API." OFF ) endif() # Check if we're actually building anything before continuing. If not, no need # to search for deps, etc. This is a common approach for packagers that just # need the install rules. See GH issue NVIDIA/thrust#1211. if (NOT (CUB_ENABLE_HEADER_TESTING OR CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES)) return() endif() include(cmake/AppendOptionIfAvailable.cmake) include(cmake/CubBuildCompilerTargets.cmake) include(cmake/CubBuildTargetList.cmake) include(cmake/CubCudaConfig.cmake) include(cmake/CubUtilities.cmake) if ("" STREQUAL "${CMAKE_BUILD_TYPE}") set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE) set_property( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel ) endif () set(CMAKE_CXX_EXTENSIONS OFF) # Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up alongside # Thrust targets when building as part of Thrust. set(CUB_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib") set(CUB_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin") cub_build_target_list() if (CUB_ENABLE_HEADER_TESTING) include(cmake/CubHeaderTesting.cmake) endif() # Both testing and examples use ctest if (CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES) include(CTest) enable_testing() endif() if (CUB_ENABLE_TESTING) add_subdirectory(test) endif() if (CUB_ENABLE_EXAMPLES) add_subdirectory(examples) endif() if (CUB_ENABLE_BENCHMARKS OR CUB_ENABLE_TUNING) add_subdirectory(benchmarks) endif() cccl-2.5.0/cub/CODE_OF_CONDUCT.md000066400000000000000000000073461463375617100160210ustar00rootroot00000000000000 # Code of Conduct ## Overview This document defines the Code of Conduct followed and enforced for NVIDIA C++ Core Compute Libraries. ### Intended Audience * Community * Developers * Project Leads ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: - Using welcoming and inclusive language. - Being respectful of differing viewpoints and experiences. - Gracefully accepting constructive criticism. - Focusing on what is best for the community. - Showing empathy towards other community members. Examples of unacceptable behavior by participants include: - The use of sexualized language or imagery and unwelcome sexual attention or advances. - Trolling, insulting/derogatory comments, and personal or political attacks. - Public or private harassment. - Publishing others’ private information, such as a physical or electronic address, without explicit permission. - Other conduct which could reasonably be considered inappropriate. ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project’s leadership. ## Attribution This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was adapted from the [Contributor Covenant version 1.4]. Please see this [FAQ] for answers to common questions about this Code of Conduct. ## Contact Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters. [cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com [FAQ]: https://www.contributor-covenant.org/faq [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/ [Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html cccl-2.5.0/cub/CONTRIBUTING.md000066400000000000000000000054771463375617100154560ustar00rootroot00000000000000# Table of Contents 1. [Contributing to CUB](#contributing-to-cub) 1. [CMake Options](#cmake-options) 1. [Development Model](#development-model) # Contributing to CUB CUB uses Github to manage all open-source development, including bug tracking, pull requests, and design discussions. CUB is tightly coupled to the Thrust project, and a compatible version of Thrust is required when working on the development version of CUB. To setup a CUB development branch, it is recommended to recursively clone the Thrust repository and use the CUB submodule at `dependencies/cub` to stage changes. CUB's tests and examples can be built by configuring Thrust with the CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`. This process is described in more detail in Thrust's [CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html). The CMake options in the following section may be used to customize CUB's build process. Note that some of these are controlled by Thrust for compatibility and may not have an effect when building CUB through the Thrust build system. This is pointed out in the documentation below where applicable. # CMake Options A CUB build is configured using CMake options. These may be passed to CMake using ``` cmake -D= [Thrust or CUB project source root] ``` or configured interactively with the `ccmake` or `cmake-gui` interfaces. 
The configuration options for CUB are: - `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}` - Standard CMake build option. Default: `RelWithDebInfo` - `CUB_ENABLE_INSTALL_RULES={ON, OFF}` - If true, installation rules will be generated for CUB. Default is `ON` when building CUB alone, and `OFF` when CUB is a subproject added via CMake's `add_subdirectory`. - `CUB_ENABLE_HEADER_TESTING={ON, OFF}` - Whether to test compile public headers. Default is `ON`. - `CUB_ENABLE_TESTING={ON, OFF}` - Whether to build unit tests. Default is `ON`. - `CUB_ENABLE_EXAMPLES={ON, OFF}` - Whether to build examples. Default is `ON`. - `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}` - Setting this has no effect when building CUB as a component of Thrust. See Thrust's dialect options, which CUB will inherit. - Toggle whether a specific C++ dialect will be targeted. - Multiple dialects may be targeted in a single build. - Possible values of `XX` are `{11, 14, 17}`. - By default, only C++14 is enabled. - `CUB_ENABLE_RDC_TESTS={ON, OFF}` - Enable tests that require separable compilation. - Default is `ON`. - `CUB_FORCE_RDC={ON, OFF}` - Enable separable compilation on all targets that are agnostic of RDC. - Targets that explicitly require RDC to be enabled or disabled will ignore this setting. - Default is `OFF`. # Development Model CUB follows the same development model as Thrust, described [here](https://nvidia.github.io/thrust/releases/versioning.html). cccl-2.5.0/cub/LICENSE.TXT000066400000000000000000000030521463375617100146730ustar00rootroot00000000000000Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cccl-2.5.0/cub/README.md000066400000000000000000000244341463375617100144760ustar00rootroot00000000000000

## About CUB

CUB provides state-of-the-art, reusable software components for every layer of the CUDA programming model:

- [Device-wide primitives](https://nvlabs.github.io/cub/group___device_module.html)
  - Sort, prefix scan, reduction, histogram, etc.
  - Compatible with CUDA dynamic parallelism
- [Block-wide "collective" primitives](https://nvlabs.github.io/cub/group___block_module.html)
  - I/O, sort, prefix scan, reduction, histogram, etc.
  - Compatible with arbitrary thread block sizes and types
- [Warp-wide "collective" primitives](https://nvlabs.github.io/cub/group___warp_module.html)
  - Warp-wide prefix scan, reduction, etc.
  - Safe and architecture-specific
- [Thread and resource utilities](https://nvlabs.github.io/cub/group___util_io.html)
  - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc.

![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.io/cub/cub_overview.png)

CUB is included in the NVIDIA HPC SDK and the CUDA Toolkit.

We recommend the [CUB Project Website](http://nvlabs.github.io/cub) for further information and examples.
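As a quick taste of the device-wide layer, the minimal sketch below shows CUB's usual two-phase idiom (query the temporary storage size, then run) with `cub::DeviceReduce::Sum`. The function name `SumExample` and the caller-provided pointers `d_in`, `d_out`, and `num_items` are illustrative placeholders, and error checking is omitted for brevity.

```cpp
#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Minimal sketch: sum `num_items` ints from d_in into *d_out on the device.
void SumExample(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
{
  void*  d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // First call: no work is performed, only the required temporary storage size is computed.
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

  // Allocate the temporary storage, then run the reduction.
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

  cudaFree(d_temp_storage);
}
```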

## A Simple Example

```cpp
#include <cub/cub.cuh>

// Block-sorting CUDA kernel
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
    using namespace cub;

    // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
    // owning 16 integer items each
    typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
    typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>    BlockLoad;
    typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE>  BlockStore;

    // Allocate shared memory
    __shared__ union {
        typename BlockRadixSort::TempStorage  sort;
        typename BlockLoad::TempStorage       load;
        typename BlockStore::TempStorage      store;
    } temp_storage;

    int block_offset = blockIdx.x * (128 * 16);  // OffsetT for this block's segment

    // Obtain a segment of 2048 consecutive keys that are blocked across threads
    int thread_keys[16];
    BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
    __syncthreads();

    // Collectively sort the keys
    BlockRadixSort(temp_storage.sort).Sort(thread_keys);
    __syncthreads();

    // Store the sorted segment
    BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
```

Each thread block uses `cub::BlockRadixSort` to collectively sort its own input segment. The class is specialized by the data type being sorted, by the number of threads per block, by the number of keys per thread, and implicitly by the targeted compilation architecture.

The `cub::BlockLoad` and `cub::BlockStore` classes are similarly specialized. Furthermore, to provide coalesced accesses to device memory, these primitives are configured to access memory using a striped access pattern (where consecutive threads simultaneously access consecutive items) and then transpose the keys into a [blocked arrangement](index.html#sec4sec3) of elements across threads.

Once specialized, these classes expose opaque `TempStorage` member types. The thread block uses these storage types to statically allocate the union of shared memory needed by the thread block. (Alternatively these storage types could be aliased to global memory allocations.)
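To run the kernel above, a host-side launch could look like the following sketch. The helper name `SortSegments`, the grid sizing, and the assumption that `num_items` is an exact multiple of the 2048-key tile (128 threads × 16 items per thread) are illustrative and not part of the original example.

```cpp
// Hypothetical host-side driver for BlockSortKernel. Assumes num_items is an
// exact multiple of the 2048-key tile each block sorts; error checking omitted.
void SortSegments(int* d_in, int* d_out, int num_items)
{
  const int tile_size  = 128 * 16;              // keys sorted per thread block
  const int num_blocks = num_items / tile_size; // one block per 2048-key segment

  BlockSortKernel<<<num_blocks, 128>>>(d_in, d_out);
}
```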

## Supported Compilers

CUB is regularly tested using the specified versions of the following compilers. Unsupported versions may emit deprecation warnings, which can be silenced by defining `CUB_IGNORE_DEPRECATED_COMPILER` during compilation.

- NVCC 11.0+
- GCC 5+
- Clang 7+
- MSVC 2019+ (19.20/16.0/14.20)
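If the deprecation warning needs to be silenced, one option (sketched below) is to define the macro before any CUB header is included; passing `-DCUB_IGNORE_DEPRECATED_COMPILER` on the compiler command line should have the same effect.

```cpp
// Sketch: suppress CUB's deprecated-compiler warning for this translation unit.
// Equivalent to compiling with -DCUB_IGNORE_DEPRECATED_COMPILER.
#define CUB_IGNORE_DEPRECATED_COMPILER
#include <cub/cub.cuh>
```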

## Releases

CUB is distributed with the NVIDIA HPC SDK and the CUDA Toolkit in addition to GitHub. See the [changelog](CHANGELOG.md) for details about specific releases.

| CUB Release               | Included In                              |
| ------------------------- | ---------------------------------------- |
| 2.0.1                     | CUDA Toolkit 12.0                        |
| 2.0.0                     | TBD                                      |
| 1.17.2                    | TBD                                      |
| 1.17.1                    | TBD                                      |
| 1.17.0                    | TBD                                      |
| 1.16.0                    | TBD                                      |
| 1.15.0                    | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6  |
| 1.14.0                    | NVIDIA HPC SDK 21.9                      |
| 1.13.1                    | CUDA Toolkit 11.5                        |
| 1.13.0                    | NVIDIA HPC SDK 21.7                      |
| 1.12.1                    | CUDA Toolkit 11.4                        |
| 1.12.0                    | NVIDIA HPC SDK 21.3                      |
| 1.11.0                    | CUDA Toolkit 11.3                        |
| 1.10.0                    | NVIDIA HPC SDK 20.9 & CUDA Toolkit 11.2  |
| 1.9.10-1                  | NVIDIA HPC SDK 20.7 & CUDA Toolkit 11.1  |
| 1.9.10                    | NVIDIA HPC SDK 20.5                      |
| 1.9.9                     | CUDA Toolkit 11.0                        |
| 1.9.8-1                   | NVIDIA HPC SDK 20.3                      |
| 1.9.8                     | CUDA Toolkit 11.0 Early Access           |
| 1.9.8                     | CUDA 11.0 Early Access                   |
| 1.8.0                     |                                          |
| 1.7.5                     | Thrust 1.9.2                             |
| 1.7.4                     | Thrust 1.9.1-2                           |
| 1.7.3                     |                                          |
| 1.7.2                     |                                          |
| 1.7.1                     |                                          |
| 1.7.0                     | Thrust 1.9.0-5                           |
| 1.6.4                     |                                          |
| 1.6.3                     |                                          |
| 1.6.2 (previously 1.5.5)  |                                          |
| 1.6.1 (previously 1.5.4)  |                                          |
| 1.6.0 (previously 1.5.3)  |                                          |
| 1.5.2                     |                                          |
| 1.5.1                     |                                          |
| 1.5.0                     |                                          |
| 1.4.1                     |                                          |
| 1.4.0                     |                                          |
| 1.3.2                     |                                          |
| 1.3.1                     |                                          |
| 1.3.0                     |                                          |
| 1.2.3                     |                                          |
| 1.2.2                     |                                          |
| 1.2.0                     |                                          |
| 1.1.1                     |                                          |
| 1.0.2                     |                                          |
| 1.0.1                     |                                          |
| 0.9.4                     |                                          |
| 0.9.2                     |                                          |
| 0.9.1                     |                                          |
| 0.9.0                     |                                          |

## Development Process

CUB and Thrust depend on each other. It is recommended to clone Thrust and build CUB as a component of Thrust.

CUB uses the [CMake build system](https://cmake.org/) to build unit tests, examples, and header tests. To build CUB as a developer, the following recipe should be followed:

```bash
# Clone Thrust and CUB from Github. CUB is located in Thrust's
# `dependencies/cub` submodule.
git clone --recursive https://github.com/NVIDIA/thrust.git
cd thrust

# Create build directory:
mkdir build
cd build

# Configure -- use one of the following:
cmake -DTHRUST_INCLUDE_CUB_CMAKE=ON ..   # Command line interface.
ccmake -DTHRUST_INCLUDE_CUB_CMAKE=ON ..  # ncurses GUI (Linux only)
cmake-gui  # Graphical UI, set source/build directories and options in the app

# Build:
cmake --build . -j  # invokes make (or ninja, etc)

# Run tests and examples:
ctest
```

By default, the C++14 standard is targeted, but this can be changed in CMake. More information on configuring your CUB build and creating a pull request is found in [CONTRIBUTING.md](CONTRIBUTING.md).

## Open Source License

CUB is available under the "New BSD" open-source license: ``` Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ``` cccl-2.5.0/cub/benchmarks/000077500000000000000000000000001463375617100153255ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/CMakeLists.txt000066400000000000000000000075431463375617100200760ustar00rootroot00000000000000include(${CMAKE_SOURCE_DIR}/benchmarks/cmake/CCCLBenchmarkRegistry.cmake) # Defer dependencies collection to nvbench helper add_subdirectory(nvbench_helper) set(benches_root "${CMAKE_CURRENT_LIST_DIR}") if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") message(FATAL_ERROR "CUB benchmarks must be built in release mode.") endif() if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) message(FATAL_ERROR "CMAKE_CUDA_ARCHITECTURES must be set to build CUB benchmarks.") endif() set(benches_meta_target cub.all.benches) add_custom_target(${benches_meta_target}) function(get_recursive_subdirs subdirs) set(dirs) file(GLOB_RECURSE contents CONFIGURE_DEPENDS LIST_DIRECTORIES ON "${CMAKE_CURRENT_LIST_DIR}/bench/*" ) foreach(test_dir IN LISTS contents) if(IS_DIRECTORY "${test_dir}") list(APPEND dirs "${test_dir}") endif() endforeach() set(${subdirs} "${dirs}" PARENT_SCOPE) endfunction() create_benchmark_registry() function(get_bench_ranges src bench_name) file(READ "${src}" file_data) set(param_regex "//[ ]+%RANGE%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)") string(REGEX MATCHALL "${param_regex}" matches "${file_data}") set(ranges "") foreach(match IN LISTS matches) string(REGEX MATCH "${param_regex}" unused "${match}") set(def ${CMAKE_MATCH_1}) set(label ${CMAKE_MATCH_2}) set(range ${CMAKE_MATCH_3}) set(ranges "${ranges}${def}|${label}=${range},") string(REPLACE ":" ";" range "${range}") list(LENGTH range range_len) if (NOT "${range_len}" STREQUAL 3) message(FATAL_ERROR "Range should be represented as 'start:end:step'") endif() endforeach() string(LENGTH "${ranges}" ranges_length) math(EXPR last_character_index "${ranges_length} - 1") string(SUBSTRING "${ranges}" 0 ${last_character_index} ranges) register_cccl_tuning("${bench_name}" 
"${ranges}") endfunction() function(add_bench target_name bench_name bench_src) set(bench_target ${bench_name}) set(${target_name} ${bench_target} PARENT_SCOPE) add_executable(${bench_target} "${bench_src}") target_link_libraries(${bench_target} PRIVATE nvbench_helper nvbench::main) set_target_properties(${bench_target} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" LIBRARY_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${CUB_EXECUTABLE_OUTPUT_DIR}" CUDA_STANDARD 17 CXX_STANDARD 17) endfunction() function(add_bench_dir bench_dir) file(GLOB bench_srcs CONFIGURE_DEPENDS "${bench_dir}/*.cu") file(RELATIVE_PATH bench_prefix "${benches_root}" "${bench_dir}") file(TO_CMAKE_PATH "${bench_prefix}" bench_prefix) string(REPLACE "/" "." bench_prefix "${bench_prefix}") foreach(bench_src IN LISTS bench_srcs) # base tuning get_filename_component(bench_name "${bench_src}" NAME_WLE) string(PREPEND bench_name "cub.${bench_prefix}.") set(base_bench_name "${bench_name}.base") add_bench(base_bench_target ${base_bench_name} "${bench_src}") add_dependencies(${benches_meta_target} ${base_bench_target}) target_compile_definitions(${base_bench_target} PRIVATE TUNE_BASE=1) if (CUB_ENABLE_TUNING) # tuning set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${bench_src}") get_bench_ranges("${bench_src}" "${bench_name}") set(tuning_name "${bench_name}.variant") set(tuning_path "${CMAKE_BINARY_DIR}/${tuning_name}.h") add_bench(bench_target ${tuning_name} "${bench_src}") # for convenience, make tuning variant buildable by default file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n") target_compile_options(${bench_target} PRIVATE "-include${tuning_path}") else() # benchmarking register_cccl_benchmark("${bench_name}" "") endif() endforeach() endfunction() get_recursive_subdirs(subdirs) foreach(subdir IN LISTS subdirs) add_bench_dir("${subdir}") endforeach() cccl-2.5.0/cub/benchmarks/bench/000077500000000000000000000000001463375617100164045ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/adjacent_difference/000077500000000000000000000000001463375617100223275ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/adjacent_difference/subtract_left.cu000066400000000000000000000104731463375617100255260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 #if !TUNE_BASE struct policy_hub_t { struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> { using AdjacentDifferencePolicy = cub::AgentAdjacentDifferencePolicy; }; using MaxPolicy = Policy350; }; #endif // !TUNE_BASE template void left(nvbench::state& state, nvbench::type_list) { constexpr bool may_alias = false; constexpr bool read_left = true; using input_it_t = const T*; using output_it_t = T*; using difference_op_t = cub::Difference; using offset_t = cub::detail::choose_offset_t; #if !TUNE_BASE using dispatch_t = cub:: DispatchAdjacentDifference; #else using dispatch_t = cub::DispatchAdjacentDifference; #endif // TUNE_BASE const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in = generate(elements); thrust::device_vector out(elements); input_it_t d_in = thrust::raw_pointer_cast(in.data()); output_it_t d_out = thrust::raw_pointer_cast(out.data()); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); std::size_t temp_storage_bytes{}; dispatch_t::Dispatch(nullptr, temp_storage_bytes, d_in, d_out, static_cast(elements), difference_op_t{}, 0); thrust::device_vector temp_storage(temp_storage_bytes); std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(elements), difference_op_t{}, launch.get_stream()); }); } using types = nvbench::type_list; NVBENCH_BENCH_TYPES(left, NVBENCH_TYPE_AXES(types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); cccl-2.5.0/cub/benchmarks/bench/copy/000077500000000000000000000000001463375617100173565ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/copy/memcpy.cu000066400000000000000000000270341463375617100212070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_BUFFERS_PER_THREAD bpt 1:18:1 // %RANGE% TUNE_TLEV_BYTES_PER_THREAD tlevbpt 2:16:2 // %RANGE% TUNE_LARGE_THREADS ltpb 128:1024:32 // %RANGE% TUNE_LARGE_BUFFER_BYTES_PER_THREAD lbbpt 4:128:4 // %RANGE% TUNE_PREFER_POW2_BITS ppb 0:1:1 // %RANGE% TUNE_WARP_LEVEL_THRESHOLD wlt 32:512:32 // %RANGE% TUNE_BLOCK_LEVEL_THRESHOLD blt 1024:16384:512 // %RANGE% TUNE_BLOCK_MAGIC_NS blns 0:2048:4 // %RANGE% TUNE_BLOCK_DELAY_CONSTRUCTOR_ID bldcid 0:7:1 // %RANGE% TUNE_BLOCK_L2_WRITE_LATENCY_NS bll2w 0:1200:5 // %RANGE% TUNE_BUFF_MAGIC_NS buns 0:2048:4 // %RANGE% TUNE_BUFF_DELAY_CONSTRUCTOR_ID budcid 0:7:1 // %RANGE% TUNE_BUFF_L2_WRITE_LATENCY_NS bul2w 0:1200:5 #include #include #include #include #include #include #include template struct offset_to_ptr_t { T* d_ptr; OffsetT* d_offsets; __device__ T* operator()(OffsetT i) const { return d_ptr + d_offsets[i]; } }; template struct reordered_offset_to_ptr_t { T* d_ptr; OffsetT* d_map; OffsetT* d_offsets; __device__ T* operator()(OffsetT i) const { return d_ptr + d_offsets[d_map[i]]; } }; template struct offset_to_bytes_t { OffsetT* d_offsets; __device__ OffsetT operator()(OffsetT i) const { return (d_offsets[i + 1] - d_offsets[i]) * sizeof(T); } }; template struct offset_to_size_t { OffsetT* d_offsets; __device__ OffsetT operator()(OffsetT i) const { return d_offsets[i + 1] - d_offsets[i]; } }; #if !TUNE_BASE template using delay_constructor_t = nvbench::tl::get, cub::detail::fixed_delay_constructor_t, cub::detail::exponential_backoff_constructor_t, cub::detail::exponential_backoff_jitter_constructor_t, cub::detail::exponential_backoff_jitter_window_constructor_t, cub::detail::exponential_backon_jitter_window_constructor_t, cub::detail::exponential_backon_jitter_constructor_t, cub::detail::exponential_backon_constructor_t>>; using buff_delay_constructor_t = delay_constructor_t; using block_delay_constructor_t = delay_constructor_t; struct policy_hub_t { struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> { using AgentSmallBufferPolicyT = cub::detail::AgentBatchMemcpyPolicy< TUNE_THREADS, TUNE_BUFFERS_PER_THREAD, TUNE_TLEV_BYTES_PER_THREAD, TUNE_PREFER_POW2_BITS, TUNE_LARGE_THREADS * TUNE_LARGE_BUFFER_BYTES_PER_THREAD, TUNE_WARP_LEVEL_THRESHOLD, TUNE_BLOCK_LEVEL_THRESHOLD, buff_delay_constructor_t, block_delay_constructor_t>; using AgentLargeBufferPolicyT = cub::detail::AgentBatchMemcpyLargeBuffersPolicy; }; using MaxPolicy = policy_t; }; #endif template void gen_it(T* d_buffer, thrust::device_vector& output, thrust::device_vector offsets, bool randomize, thrust::default_random_engine& rne) { OffsetT* d_offsets = thrust::raw_pointer_cast(offsets.data()); if 
(randomize) { const auto buffers = output.size(); thrust::device_vector map(buffers); thrust::sequence(map.begin(), map.end()); thrust::shuffle(map.begin(), map.end(), rne); thrust::device_vector sizes(buffers); thrust::tabulate(sizes.begin(), sizes.end(), offset_to_size_t{d_offsets}); thrust::scatter(sizes.begin(), sizes.end(), map.begin(), offsets.begin()); thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); OffsetT* d_map = thrust::raw_pointer_cast(map.data()); thrust::tabulate(output.begin(), output.end(), reordered_offset_to_ptr_t{d_buffer, d_map, d_offsets}); } else { thrust::tabulate(output.begin(), output.end(), offset_to_ptr_t{d_buffer, d_offsets}); } } template void copy(nvbench::state& state, nvbench::type_list, std::size_t elements, std::size_t min_buffer_size, std::size_t max_buffer_size, bool randomize_input, bool randomize_output) { using offset_t = OffsetT; using it_t = T*; using input_buffer_it_t = it_t*; using output_buffer_it_t = it_t*; using buffer_size_it_t = offset_t*; using buffer_offset_t = std::uint32_t; using block_offset_t = std::uint32_t; constexpr bool is_memcpy = true; #if !TUNE_BASE using policy_t = policy_hub_t; #else using policy_t = cub::detail::DeviceBatchMemcpyPolicy; #endif using dispatch_t = cub::detail::DispatchBatchMemcpy< input_buffer_it_t, output_buffer_it_t, buffer_size_it_t, buffer_offset_t, block_offset_t, policy_t, is_memcpy>; thrust::device_vector input_buffer = generate(elements); thrust::device_vector output_buffer(elements); thrust::device_vector offsets = generate.uniform.segment_offsets(elements, min_buffer_size, max_buffer_size); T* d_input_buffer = thrust::raw_pointer_cast(input_buffer.data()); T* d_output_buffer = thrust::raw_pointer_cast(output_buffer.data()); offset_t* d_offsets = thrust::raw_pointer_cast(offsets.data()); const auto buffers = offsets.size() - 1; thrust::device_vector input_buffers(buffers); thrust::device_vector output_buffers(buffers); thrust::device_vector buffer_sizes(buffers); thrust::tabulate(buffer_sizes.begin(), buffer_sizes.end(), offset_to_bytes_t{d_offsets}); thrust::default_random_engine rne; gen_it(d_input_buffer, input_buffers, offsets, randomize_input, rne); gen_it(d_output_buffer, output_buffers, offsets, randomize_output, rne); // Clear the offsets vector to free memory offsets.clear(); offsets.shrink_to_fit(); d_offsets = nullptr; input_buffer_it_t d_input_buffers = thrust::raw_pointer_cast(input_buffers.data()); output_buffer_it_t d_output_buffers = thrust::raw_pointer_cast(output_buffers.data()); buffer_size_it_t d_buffer_sizes = thrust::raw_pointer_cast(buffer_sizes.data()); state.add_element_count(elements); state.add_global_memory_writes(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(buffers); state.add_global_memory_reads(buffers); state.add_global_memory_reads(buffers); std::size_t temp_storage_bytes{}; std::uint8_t* d_temp_storage{}; dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_input_buffers, d_output_buffers, d_buffer_sizes, buffers, 0); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_input_buffers, d_output_buffers, d_buffer_sizes, buffers, launch.get_stream()); }); } template void uniform(nvbench::state& state, nvbench::type_list tl) { const auto elements = 
static_cast(state.get_int64("Elements{io}")); const auto max_buffer_size = static_cast(state.get_int64("MaxBufferSize")); const auto min_buffer_size_ratio = static_cast(state.get_int64("MinBufferSizeRatio")); const auto min_buffer_size = static_cast(static_cast(max_buffer_size) / 100.0) * min_buffer_size_ratio; copy( state, tl, elements, min_buffer_size, max_buffer_size, state.get_int64("Randomize"), state.get_int64("Randomize")); } template void large(nvbench::state& state, nvbench::type_list tl) { const auto elements = static_cast(state.get_int64("Elements{io}")); const auto max_buffer_size = elements; constexpr auto min_buffer_size_ratio = 99; const auto min_buffer_size = static_cast(static_cast(max_buffer_size) / 100.0) * min_buffer_size_ratio; // No need to randomize large buffers constexpr bool randomize_input = false; constexpr bool randomize_output = false; copy(state, tl, elements, min_buffer_size, max_buffer_size, randomize_input, randomize_output); } using types = nvbench::type_list; #ifdef TUNE_OffsetT using u_offset_types = nvbench::type_list; #else using u_offset_types = nvbench::type_list; #endif NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types)) .set_name("uniform") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2)) .add_int64_axis("MinBufferSizeRatio", {1, 99}) .add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024}) .add_int64_axis("Randomize", {0, 1}); NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types)) .set_name("large") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", {28, 29}); cccl-2.5.0/cub/benchmarks/bench/for_each/000077500000000000000000000000001463375617100201525ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/for_each/base.cu000066400000000000000000000063741463375617100214270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include template struct op_t { int* d_count{}; __device__ void operator()(T val) const { if (val == T{}) { atomicAdd(d_count, 1); } } }; template void for_each(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using output_it_t = int*; using offset_t = OffsetT; const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in(elements, T{42}); input_it_t d_in = thrust::raw_pointer_cast(in.data()); // `d_out` exists for visibility // All inputs are equal to `42`, while the operator is searching for `0`. // If the operator finds `0` in the input sequence, it's an issue leading to a segfault. output_it_t d_out = nullptr; state.add_element_count(elements); state.add_global_memory_reads(elements); op_t op{d_out}; std::size_t temp_size{}; cub::DeviceFor::ForEachN(nullptr, temp_size, d_in, elements, op); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { cub::DeviceFor::ForEachN(temp_storage, temp_size, d_in, elements, op, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(for_each, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); cccl-2.5.0/cub/benchmarks/bench/for_each/copy.cu000066400000000000000000000060741463375617100214640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include template struct op_t { int* d_count{}; __device__ void operator()(T val) const { if (val == T{}) { atomicAdd(d_count, 1); } } }; template void for_each(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using output_it_t = int*; using offset_t = OffsetT; const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in(elements, T{42}); input_it_t d_in = thrust::raw_pointer_cast(in.data()); output_it_t d_out = nullptr; state.add_element_count(elements); state.add_global_memory_reads(elements); op_t op{d_out}; std::size_t temp_size{}; cub::DeviceFor::ForEachCopyN(nullptr, temp_size, d_in, elements, op); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { cub::DeviceFor::ForEachCopyN(temp_storage, temp_size, d_in, elements, op, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(for_each, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); cccl-2.5.0/cub/benchmarks/bench/histogram/000077500000000000000000000000001463375617100204015ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/histogram/even.cu000066400000000000000000000131651463375617100216750ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include "histogram_common.cuh" #include // %RANGE% TUNE_ITEMS ipt 4:28:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_RLE_COMPRESS rle 0:1:1 // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 // %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 // %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void even(nvbench::state& state, nvbench::type_list) { constexpr int num_channels = 1; constexpr int num_active_channels = 1; using sample_iterator_t = SampleT*; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram; #else // TUNE_BASE using dispatch_t = cub::DispatchHistogram; #endif // TUNE_BASE const auto entropy = str_to_entropy(state.get_string("Entropy")); const auto elements = state.get_int64("Elements{io}"); const auto num_bins = state.get_int64("Bins"); const int num_levels = static_cast(num_bins) + 1; const SampleT lower_level = 0; const SampleT upper_level = get_upper_level(num_bins, elements); thrust::device_vector input = generate(elements, entropy, lower_level, upper_level); thrust::device_vector hist(num_bins); SampleT* d_input = thrust::raw_pointer_cast(input.data()); CounterT* d_histogram = thrust::raw_pointer_cast(hist.data()); CounterT* d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; SampleT lower_level1[1] = {lower_level}; SampleT upper_level1[1] = {upper_level}; std::uint8_t* d_temp_storage = nullptr; std::size_t temp_storage_bytes{}; cub::Int2Type is_byte_sample; OffsetT num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(num_bins); dispatch_t::DispatchEven( d_temp_storage, temp_storage_bytes, d_input, d_histogram1, num_levels1, lower_level1, upper_level1, num_row_pixels, num_rows, row_stride_samples, 0, is_byte_sample); thrust::device_vector tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::DispatchEven( d_temp_storage, temp_storage_bytes, d_input, d_histogram1, num_levels1, lower_level1, upper_level1, num_row_pixels, num_rows, row_stride_samples, launch.get_stream(), is_byte_sample); }); } using bin_types = nvbench::type_list; using some_offset_types = nvbench::type_list; #ifdef TUNE_SampleT using sample_types = nvbench::type_list; #else // !defined(TUNE_SampleT) using sample_types = nvbench::type_list; #endif // TUNE_SampleT NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset_types)) .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_axis("Bins", {32, 128, 2048, 2097152}) .add_string_axis("Entropy", {"0.201", "1.000"}); cccl-2.5.0/cub/benchmarks/bench/histogram/histogram_common.cuh000066400000000000000000000075571463375617100244650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #if !TUNE_BASE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_LDG # else // TUNE_LOAD == 2 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD # define TUNE_VEC_SIZE (1 << TUNE_VEC_SIZE_POW) # if TUNE_MEM_PREFERENCE == 0 constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::GMEM; # elif TUNE_MEM_PREFERENCE == 1 constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::SMEM; # else // TUNE_MEM_PREFERENCE == 2 constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND; # endif // TUNE_MEM_PREFERENCE # if TUNE_LOAD_ALGORITHM_ID == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # elif TUNE_LOAD_ALGORITHM_ID == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # else # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_STRIPED # endif // TUNE_LOAD_ALGORITHM_ID template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> { static constexpr cub::BlockLoadAlgorithm load_algorithm = (TUNE_LOAD_ALGORITHM == cub::BLOCK_LOAD_STRIPED) ? (NUM_CHANNELS == 1 ? 
cub::BLOCK_LOAD_STRIPED : cub::BLOCK_LOAD_DIRECT) : TUNE_LOAD_ALGORITHM; using AgentHistogramPolicyT = cub::AgentHistogramPolicy< TUNE_THREADS, TUNE_ITEMS, load_algorithm, TUNE_LOAD_MODIFIER, TUNE_RLE_COMPRESS, MEM_PREFERENCE, TUNE_WORK_STEALING, TUNE_VEC_SIZE>; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template SampleT get_upper_level(OffsetT bins, OffsetT elements) { if constexpr (cuda::std::is_integral_v) { if constexpr (sizeof(SampleT) < sizeof(OffsetT)) { const SampleT max_key = std::numeric_limits::max(); return static_cast(std::min(bins, static_cast(max_key))); } else { return static_cast(bins); } } return static_cast(elements); } cccl-2.5.0/cub/benchmarks/bench/histogram/multi/000077500000000000000000000000001463375617100215335ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/histogram/multi/even.cu000066400000000000000000000146051463375617100230270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include "../histogram_common.cuh" #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_RLE_COMPRESS rle 0:1:1 // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 // %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 // %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void even(nvbench::state& state, nvbench::type_list) { constexpr int num_channels = 4; constexpr int num_active_channels = 3; using sample_iterator_t = SampleT*; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram; #else // TUNE_BASE using dispatch_t = cub::DispatchHistogram; #endif // TUNE_BASE const auto entropy = str_to_entropy(state.get_string("Entropy")); const auto elements = state.get_int64("Elements{io}"); const auto num_bins = state.get_int64("Bins"); const int num_levels_r = static_cast(num_bins) + 1; const int num_levels_g = num_levels_r; const int num_levels_b = num_levels_g; const SampleT lower_level_r = 0; const SampleT upper_level_r = get_upper_level(num_bins, elements); const SampleT lower_level_g = lower_level_r; const SampleT upper_level_g = upper_level_r; const SampleT lower_level_b = lower_level_g; const SampleT upper_level_b = upper_level_g; thrust::device_vector hist_r(num_bins); thrust::device_vector hist_g(num_bins); thrust::device_vector hist_b(num_bins); thrust::device_vector input = generate(elements * num_channels, entropy, lower_level_r, upper_level_r); SampleT* d_input = thrust::raw_pointer_cast(input.data()); CounterT* d_histogram_r = thrust::raw_pointer_cast(hist_r.data()); CounterT* d_histogram_g = thrust::raw_pointer_cast(hist_g.data()); CounterT* d_histogram_b = thrust::raw_pointer_cast(hist_b.data()); CounterT* d_histogram[num_active_channels] = {d_histogram_r, d_histogram_g, d_histogram_b}; int num_levels[num_active_channels] = {num_levels_r, num_levels_g, num_levels_b}; SampleT lower_level[num_active_channels] = {lower_level_r, lower_level_g, lower_level_b}; SampleT upper_level[num_active_channels] = {upper_level_r, upper_level_g, upper_level_b}; std::uint8_t* d_temp_storage = nullptr; std::size_t temp_storage_bytes{}; cub::Int2Type is_byte_sample; OffsetT num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; state.add_element_count(elements); state.add_global_memory_reads(elements * num_active_channels); state.add_global_memory_writes(num_bins * num_active_channels); dispatch_t::DispatchEven( d_temp_storage, temp_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_samples, 0, is_byte_sample); thrust::device_vector tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::DispatchEven( d_temp_storage, temp_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_samples, launch.get_stream(), is_byte_sample); }); } using bin_types = nvbench::type_list; using some_offset_types = nvbench::type_list; #ifdef TUNE_SampleT using sample_types = nvbench::type_list; #else // !defined(TUNE_SampleT) using sample_types = nvbench::type_list; #endif // TUNE_SampleT NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset_types)) .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", 
"OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_axis("Bins", {32, 128, 2048, 2097152}) .add_string_axis("Entropy", {"0.201", "1.000"}); cccl-2.5.0/cub/benchmarks/bench/histogram/multi/range.cu000066400000000000000000000151611463375617100231640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include "../histogram_common.cuh" #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_RLE_COMPRESS rle 0:1:1 // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 // %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 // %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void range(nvbench::state& state, nvbench::type_list) { constexpr int num_channels = 4; constexpr int num_active_channels = 3; using sample_iterator_t = SampleT*; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram; #else // TUNE_BASE using dispatch_t = cub::DispatchHistogram; #endif // TUNE_BASE const auto entropy = str_to_entropy(state.get_string("Entropy")); const auto elements = state.get_int64("Elements{io}"); const auto num_bins = state.get_int64("Bins"); const int num_levels_r = static_cast(num_bins) + 1; const int num_levels_g = num_levels_r; const int num_levels_b = num_levels_g; const SampleT lower_level = 0; const SampleT upper_level = get_upper_level(num_bins, elements); SampleT step = (upper_level - lower_level) / num_bins; thrust::device_vector levels_r(num_bins + 1); // TODO Extract sequence to the helper TU thrust::sequence(levels_r.begin(), levels_r.end(), lower_level, step); thrust::device_vector levels_g = levels_r; thrust::device_vector levels_b = levels_g; SampleT* d_levels_r = thrust::raw_pointer_cast(levels_r.data()); SampleT* d_levels_g = thrust::raw_pointer_cast(levels_g.data()); SampleT* d_levels_b = thrust::raw_pointer_cast(levels_b.data()); thrust::device_vector hist_r(num_bins); thrust::device_vector hist_g(num_bins); thrust::device_vector hist_b(num_bins); thrust::device_vector input = generate(elements * num_channels, entropy, lower_level, upper_level); SampleT* d_input = thrust::raw_pointer_cast(input.data()); CounterT* d_histogram_r = thrust::raw_pointer_cast(hist_r.data()); CounterT* d_histogram_g = thrust::raw_pointer_cast(hist_g.data()); CounterT* d_histogram_b = thrust::raw_pointer_cast(hist_b.data()); CounterT* d_histogram[num_active_channels] = {d_histogram_r, d_histogram_g, d_histogram_b}; int num_levels[num_active_channels] = {num_levels_r, num_levels_g, num_levels_b}; SampleT* d_levels[num_active_channels] = {d_levels_r, d_levels_g, d_levels_b}; std::uint8_t* d_temp_storage = nullptr; std::size_t temp_storage_bytes{}; cub::Int2Type is_byte_sample; OffsetT num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; state.add_element_count(elements); state.add_global_memory_reads(elements * num_active_channels); state.add_global_memory_writes(num_bins * num_active_channels); dispatch_t::DispatchRange( d_temp_storage, temp_storage_bytes, d_input, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, 0, is_byte_sample); thrust::device_vector tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::DispatchRange( d_temp_storage, temp_storage_bytes, d_input, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, launch.get_stream(), is_byte_sample); }); } using bin_types = nvbench::type_list; using some_offset_types = nvbench::type_list; #ifdef TUNE_SampleT using sample_types = nvbench::type_list; #else // !defined(TUNE_SampleT) using sample_types = nvbench::type_list; 
#endif // TUNE_SampleT NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset_types)) .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_axis("Bins", {32, 128, 2048, 2097152}) .add_string_axis("Entropy", {"0.201", "1.000"}); cccl-2.5.0/cub/benchmarks/bench/histogram/range.cu000066400000000000000000000135351463375617100220350ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include "histogram_common.cuh" #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_RLE_COMPRESS rle 0:1:1 // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 // %RANGE% TUNE_LOAD_ALGORITHM_ID laid 0:2:1 // %RANGE% TUNE_VEC_SIZE_POW vec 0:2:1 template static void range(nvbench::state& state, nvbench::type_list) { constexpr int num_channels = 1; constexpr int num_active_channels = 1; using sample_iterator_t = SampleT*; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram; #else // TUNE_BASE using dispatch_t = cub::DispatchHistogram; #endif // TUNE_BASE const auto entropy = str_to_entropy(state.get_string("Entropy")); const auto elements = state.get_int64("Elements{io}"); const auto num_bins = state.get_int64("Bins"); const int num_levels = static_cast(num_bins) + 1; const SampleT lower_level = 0; const SampleT upper_level = get_upper_level(num_bins, elements); SampleT step = (upper_level - lower_level) / num_bins; thrust::device_vector levels(num_bins + 1); // TODO Extract sequence to the helper TU thrust::sequence(levels.begin(), levels.end(), lower_level, step); SampleT* d_levels = thrust::raw_pointer_cast(levels.data()); thrust::device_vector input = generate(elements, entropy, lower_level, upper_level); thrust::device_vector hist(num_bins); SampleT* d_input = thrust::raw_pointer_cast(input.data()); CounterT* d_histogram = thrust::raw_pointer_cast(hist.data()); CounterT* d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; SampleT* d_levels1[1] = {d_levels}; std::uint8_t* d_temp_storage = nullptr; std::size_t temp_storage_bytes{}; cub::Int2Type is_byte_sample; OffsetT num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(num_bins); dispatch_t::DispatchRange( d_temp_storage, temp_storage_bytes, d_input, d_histogram1, num_levels1, d_levels1, num_row_pixels, num_rows, row_stride_samples, 0, is_byte_sample); thrust::device_vector tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::DispatchRange( d_temp_storage, temp_storage_bytes, d_input, d_histogram1, num_levels1, d_levels1, num_row_pixels, num_rows, row_stride_samples, launch.get_stream(), is_byte_sample); }); } using bin_types = nvbench::type_list; using some_offset_types = nvbench::type_list; #ifdef TUNE_SampleT using sample_types = nvbench::type_list; #else // !defined(TUNE_SampleT) using sample_types = nvbench::type_list; #endif // TUNE_SampleT NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset_types)) .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_axis("Bins", {32, 128, 2048, 2097152}) .add_string_axis("Entropy", {"0.201", "1.000"}); cccl-2.5.0/cub/benchmarks/bench/merge_sort/000077500000000000000000000000001463375617100205525ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/merge_sort/keys.cu000066400000000000000000000126561463375617100220700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 
2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:2:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK_POW2 tpb 6:10:1 #ifndef TUNE_BASE # define TUNE_THREADS_PER_BLOCK (1 << TUNE_THREADS_PER_BLOCK_POW2) #endif // TUNE_BASE using value_t = cub::NullType; #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_LDG # else // TUNE_LOAD == 2 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { using MergeSortPolicy = cub::AgentMergeSortPolicy(TUNE_ITEMS_PER_THREAD), TUNE_LOAD_ALGORITHM, TUNE_LOAD_MODIFIER, TUNE_STORE_ALGORITHM>; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template void keys(nvbench::state& state, nvbench::type_list) { using key_t = T; using value_t = cub::NullType; using key_input_it_t = key_t*; using value_input_it_t = value_t*; using key_it_t = key_t*; using value_it_t = value_t*; using offset_t = OffsetT; using compare_op_t = less_t; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchMergeSort; #else // TUNE_BASE using dispatch_t = cub::DispatchMergeSort; #endif // TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); thrust::device_vector buffer_1 = generate(elements, entropy); thrust::device_vector buffer_2(elements); key_t* d_buffer_1 = thrust::raw_pointer_cast(buffer_1.data()); key_t* d_buffer_2 = 
thrust::raw_pointer_cast(buffer_2.data()); // Enable throughput calculations and add "Size" column to results. state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(elements); // Allocate temporary storage: std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_buffer_1, nullptr, d_buffer_2, nullptr, static_cast(elements), compare_op_t{}, 0 /* stream */); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_buffer_1, nullptr, d_buffer_2, nullptr, static_cast(elements), compare_op_t{}, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(keys, NVBENCH_TYPE_AXES(all_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.201"}); cccl-2.5.0/cub/benchmarks/bench/merge_sort/pairs.cu000066400000000000000000000147141463375617100222300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:2:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK_POW2 tpb 6:10:1 #ifndef TUNE_BASE # define TUNE_THREADS_PER_BLOCK (1 << TUNE_THREADS_PER_BLOCK_POW2) #endif #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_LDG # else // TUNE_LOAD == 2 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { using MergeSortPolicy = cub::AgentMergeSortPolicy(TUNE_ITEMS_PER_THREAD), TUNE_LOAD_ALGORITHM, TUNE_LOAD_MODIFIER, TUNE_STORE_ALGORITHM>; }; using MaxPolicy = policy_t; }; #endif // TUNE_BASE template void pairs(nvbench::state& state, nvbench::type_list) { using key_t = KeyT; using value_t = ValueT; using key_input_it_t = key_t*; using value_input_it_t = value_t*; using key_it_t = key_t*; using value_it_t = value_t*; using offset_t = OffsetT; using compare_op_t = less_t; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchMergeSort; #else // TUNE_BASE using dispatch_t = cub::DispatchMergeSort; #endif // TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); thrust::device_vector keys_buffer_1 = generate(elements, entropy); thrust::device_vector keys_buffer_2(elements); thrust::device_vector values_buffer_1(elements); thrust::device_vector values_buffer_2(elements); key_t* d_keys_buffer_1 = thrust::raw_pointer_cast(keys_buffer_1.data()); key_t* d_keys_buffer_2 = thrust::raw_pointer_cast(keys_buffer_2.data()); value_t* d_values_buffer_1 = thrust::raw_pointer_cast(values_buffer_1.data()); value_t* d_values_buffer_2 = thrust::raw_pointer_cast(values_buffer_2.data()); // Enable throughput calculations and add "Size" column to results. 
state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); state.add_global_memory_writes(elements); // Allocate temporary storage: std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_keys_buffer_1, d_values_buffer_1, d_keys_buffer_2, d_values_buffer_2, static_cast(elements), compare_op_t{}, 0 /* stream */); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_keys_buffer_1, d_values_buffer_1, d_keys_buffer_2, d_values_buffer_2, static_cast(elements), compare_op_t{}, launch.get_stream()); }); } #ifdef TUNE_KeyT using key_types = nvbench::type_list; #else // !defined(TUNE_KeyT) using key_types = all_types; #endif // TUNE_KeyT #ifdef TUNE_ValueT using value_types = nvbench::type_list; #else // !defined(TUNE_ValueT) using value_types = nvbench::type_list; #endif // TUNE_ValueT NVBENCH_BENCH_TYPES(pairs, NVBENCH_TYPE_AXES(key_types, value_types, offset_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.201"}); cccl-2.5.0/cub/benchmarks/bench/partition/000077500000000000000000000000001463375617100204155ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/partition/flagged.cu000066400000000000000000000137041463375617100223440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #include // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = true; constexpr bool may_alias = false; #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = cub::AgentSelectIfPolicy; }; using MaxPolicy = policy_t; }; #endif // TUNE_BASE template void flagged(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using flag_it_t = const bool*; using output_it_t = T*; using num_selected_it_t = OffsetT*; using select_op_t = cub::NullType; using equality_op_t = cub::NullType; using offset_t = OffsetT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias, policy_t>; #else // TUNE_BASE using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias>; #endif // TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); auto generator = generate(elements, entropy); thrust::device_vector in = generator; thrust::device_vector flags = generator; thrust::device_vector num_selected(1); thrust::device_vector out(elements); input_it_t d_in = thrust::raw_pointer_cast(in.data()); flag_it_t d_flags = thrust::raw_pointer_cast(flags.data()); output_it_t d_out = thrust::raw_pointer_cast(out.data()); num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); state.add_global_memory_writes(1); std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op_t{}, equality_op_t{}, elements, 0); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_flags, d_out, d_num_selected, select_op_t{}, equality_op_t{}, elements, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(flagged, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); 
cccl-2.5.0/cub/benchmarks/bench/partition/if.cu000066400000000000000000000146271463375617100213560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = true; constexpr bool may_alias = false; #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = cub::AgentSelectIfPolicy; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template struct less_then_t { T m_val; __device__ bool operator()(const T& val) const { return val < m_val; } }; template T value_from_entropy(double percentage) { if (percentage == 1) { return std::numeric_limits::max(); } const auto max_val = static_cast(std::numeric_limits::max()); const auto min_val = static_cast(std::numeric_limits::lowest()); const auto result = min_val + percentage * max_val - percentage * min_val; return static_cast(result); } template void partition(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using flag_it_t = 
cub::NullType*; using output_it_t = T*; using num_selected_it_t = OffsetT*; using select_op_t = less_then_t; using equality_op_t = cub::NullType; using offset_t = OffsetT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias, policy_t>; #else // TUNE_BASE using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias>; #endif // !TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); T val = value_from_entropy(entropy_to_probability(entropy)); select_op_t select_op{val}; thrust::device_vector in = generate(elements); thrust::device_vector num_selected(1); thrust::device_vector out(elements); input_it_t d_in = thrust::raw_pointer_cast(in.data()); flag_it_t d_flags = nullptr; output_it_t d_out = thrust::raw_pointer_cast(out.data()); num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); state.add_global_memory_writes(1); std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op, equality_op_t{}, elements, 0); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_flags, d_out, d_num_selected, select_op, equality_op_t{}, elements, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); cccl-2.5.0/cub/benchmarks/bench/partition/three_way.cu000066400000000000000000000135401463375617100227400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t> { using ThreeWayPartitionPolicy = // cub::AgentThreeWayPartitionPolicy; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template struct less_then_t { T m_val; __device__ bool operator()(const T& val) const { return val < m_val; } }; template void partition(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using output_it_t = T*; using num_selected_it_t = OffsetT*; using select_op_t = less_then_t; using offset_t = OffsetT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchThreeWayPartitionIf< input_it_t, output_it_t, output_it_t, output_it_t, num_selected_it_t, select_op_t, select_op_t, offset_t, policy_t>; #else // TUNE_BASE using dispatch_t = cub::DispatchThreeWayPartitionIf< input_it_t, output_it_t, output_it_t, output_it_t, num_selected_it_t, select_op_t, select_op_t, offset_t>; #endif // !TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); T min_val{}; T max_val = std::numeric_limits::max(); T left_border = max_val / 3; T right_border = left_border * 2; select_op_t select_op_1{left_border}; select_op_t select_op_2{right_border}; thrust::device_vector in = generate(elements, entropy, min_val, max_val); thrust::device_vector num_selected(1); thrust::device_vector out_1(elements); thrust::device_vector out_2(elements); thrust::device_vector out_3(elements); input_it_t d_in = thrust::raw_pointer_cast(in.data()); output_it_t d_out_1 = thrust::raw_pointer_cast(out_1.data()); output_it_t d_out_2 = thrust::raw_pointer_cast(out_2.data()); output_it_t d_out_3 = thrust::raw_pointer_cast(out_3.data()); num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); state.add_global_memory_writes(1); std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_out_1, d_out_2, d_out_3, d_num_selected, select_op_1, select_op_2, elements, 0); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_out_1, d_out_2, d_out_3, d_num_selected, 
select_op_1, select_op_2, elements, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(partition, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); cccl-2.5.0/cub/benchmarks/bench/radix_sort/000077500000000000000000000000001463375617100205625ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/radix_sort/keys.cu000066400000000000000000000200641463375617100220700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include // %//RANGE//% TUNE_RADIX_BITS bits 8:9:1 #define TUNE_RADIX_BITS 8 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 using value_t = cub::NullType; constexpr bool is_descending = false; constexpr bool is_overwrite_ok = false; #if !TUNE_BASE template struct policy_hub_t { static constexpr bool KEYS_ONLY = std::is_same::value; using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int ONESWEEP_RADIX_BITS = TUNE_RADIX_BITS; static constexpr bool ONESWEEP = true; static constexpr bool OFFSET_64BIT = sizeof(OffsetT) == 8; // Onesweep policy using OnesweepPolicy = cub::AgentRadixSortOnesweepPolicy< TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD, DominantT, 1, cub::RADIX_RANK_MATCH_EARLY_COUNTS_ANY, cub::BLOCK_SCAN_RAKING_MEMOIZE, cub::RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS>; // These kernels are launched once, no point in tuning at the moment using HistogramPolicy = cub::AgentRadixSortHistogramPolicy<128, 16, 1, KeyT, ONESWEEP_RADIX_BITS>; using ExclusiveSumPolicy = cub::AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS>; using ScanPolicy = cub::AgentScanPolicy<512, 23, OffsetT, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_STORE_WARP_TRANSPOSE, cub::BLOCK_SCAN_RAKING_MEMOIZE>; // No point in tuning static constexpr int SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5; // No point in tuning single-tile policy using SingleTilePolicy = cub::AgentRadixSortDownsweepPolicy< 256, 19, DominantT, cub::BLOCK_LOAD_DIRECT, cub::LOAD_LDG, cub::RADIX_RANK_MEMOIZE, cub::BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>; }; using MaxPolicy = policy_t; }; template constexpr std::size_t max_onesweep_temp_storage_size() { using portion_offset = int; using onesweep_policy = typename policy_hub_t::policy_t::OnesweepPolicy; using agent_radix_sort_onesweep_t = cub::AgentRadixSortOnesweep; using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); } template constexpr std::size_t max_temp_storage_size() { using policy_t = typename policy_hub_t::policy_t; static_assert(policy_t::ONESWEEP); return max_onesweep_temp_storage_size(); } template constexpr bool fits_in_default_shared_memory() { return max_temp_storage_size() < 48 * 1024; } #else // TUNE_BASE template constexpr bool fits_in_default_shared_memory() { return true; } #endif // TUNE_BASE template void radix_sort_keys(std::integral_constant, nvbench::state& state, nvbench::type_list) { using offset_t = cub::detail::choose_offset_t; using key_t = T; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchRadixSort; #else // TUNE_BASE using dispatch_t = cub::DispatchRadixSort; #endif // TUNE_BASE constexpr int begin_bit = 0; constexpr int end_bit = sizeof(key_t) * 8; // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); thrust::device_vector buffer_1 = generate(elements, entropy); thrust::device_vector buffer_2(elements); key_t* d_buffer_1 = thrust::raw_pointer_cast(buffer_1.data()); key_t* d_buffer_2 = thrust::raw_pointer_cast(buffer_2.data()); cub::DoubleBuffer 
d_keys(d_buffer_1, d_buffer_2); cub::DoubleBuffer d_values; // Enable throughput calculations and add "Size" column to results. state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(elements); // Allocate temporary storage: std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_keys, d_values, static_cast(elements), begin_bit, end_bit, is_overwrite_ok, 0 /* stream */); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; dispatch_t::Dispatch( temp_storage, temp_size, keys, values, static_cast(elements), begin_bit, end_bit, is_overwrite_ok, launch.get_stream()); }); } template void radix_sort_keys(std::integral_constant, nvbench::state&, nvbench::type_list) { (void) is_descending; (void) is_overwrite_ok; } template void radix_sort_keys(nvbench::state& state, nvbench::type_list tl) { using offset_t = cub::detail::choose_offset_t; radix_sort_keys(std::integral_constant()>{}, state, tl); } NVBENCH_BENCH_TYPES(radix_sort_keys, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.201"}); cccl-2.5.0/cub/benchmarks/bench/radix_sort/pairs.cu000066400000000000000000000222511463375617100222330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include // %//RANGE//% TUNE_RADIX_BITS bits 8:9:1 #define TUNE_RADIX_BITS 8 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 constexpr bool is_descending = false; constexpr bool is_overwrite_ok = false; #if !TUNE_BASE template struct policy_hub_t { static constexpr bool KEYS_ONLY = std::is_same::value; using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int ONESWEEP_RADIX_BITS = TUNE_RADIX_BITS; static constexpr bool ONESWEEP = true; static constexpr bool OFFSET_64BIT = sizeof(OffsetT) == 8; // Onesweep policy using OnesweepPolicy = cub::AgentRadixSortOnesweepPolicy< TUNE_THREADS_PER_BLOCK, TUNE_ITEMS_PER_THREAD, DominantT, 1, cub::RADIX_RANK_MATCH_EARLY_COUNTS_ANY, cub::BLOCK_SCAN_RAKING_MEMOIZE, cub::RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS>; // These kernels are launched once, no point in tuning at the moment using HistogramPolicy = cub::AgentRadixSortHistogramPolicy<128, 16, 1, KeyT, ONESWEEP_RADIX_BITS>; using ExclusiveSumPolicy = cub::AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS>; using ScanPolicy = cub::AgentScanPolicy<512, 23, OffsetT, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_STORE_WARP_TRANSPOSE, cub::BLOCK_SCAN_RAKING_MEMOIZE>; // No point in tuning static constexpr int SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5; // No point in tuning single-tile policy using SingleTilePolicy = cub::AgentRadixSortDownsweepPolicy< 256, 19, DominantT, cub::BLOCK_LOAD_DIRECT, cub::LOAD_LDG, cub::RADIX_RANK_MEMOIZE, cub::BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>; }; using MaxPolicy = policy_t; }; template constexpr std::size_t max_onesweep_temp_storage_size() { using portion_offset = int; using onesweep_policy = typename policy_hub_t::policy_t::OnesweepPolicy; using agent_radix_sort_onesweep_t = cub::AgentRadixSortOnesweep; using hist_policy = typename policy_hub_t::policy_t::HistogramPolicy; using hist_agent = cub::AgentRadixSortHistogram; return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage)); } template constexpr std::size_t max_temp_storage_size() { using policy_t = typename policy_hub_t::policy_t; static_assert(policy_t::ONESWEEP); return max_onesweep_temp_storage_size(); } template constexpr bool fits_in_default_shared_memory() { return max_temp_storage_size() < 48 * 1024; } #else // TUNE_BASE template constexpr bool fits_in_default_shared_memory() { return true; } #endif // TUNE_BASE template void radix_sort_values( std::integral_constant, nvbench::state& state, nvbench::type_list) { using offset_t = cub::detail::choose_offset_t; using key_t = KeyT; using value_t = ValueT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchRadixSort; #else // TUNE_BASE using dispatch_t = cub::DispatchRadixSort; #endif // TUNE_BASE constexpr int begin_bit = 0; constexpr int end_bit = sizeof(key_t) * 8; // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); thrust::device_vector keys_buffer_1 = generate(elements, entropy); thrust::device_vector values_buffer_1 = generate(elements); thrust::device_vector keys_buffer_2(elements); thrust::device_vector values_buffer_2(elements); key_t* d_keys_buffer_1 = 
thrust::raw_pointer_cast(keys_buffer_1.data()); key_t* d_keys_buffer_2 = thrust::raw_pointer_cast(keys_buffer_2.data()); value_t* d_values_buffer_1 = thrust::raw_pointer_cast(values_buffer_1.data()); value_t* d_values_buffer_2 = thrust::raw_pointer_cast(values_buffer_2.data()); cub::DoubleBuffer d_keys(d_keys_buffer_1, d_keys_buffer_2); cub::DoubleBuffer d_values(d_values_buffer_1, d_values_buffer_2); // Enable throughput calculations and add "Size" column to results. state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); state.add_global_memory_writes(elements); // Allocate temporary storage: std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_keys, d_values, static_cast(elements), begin_bit, end_bit, is_overwrite_ok, 0 /* stream */); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; dispatch_t::Dispatch( temp_storage, temp_size, keys, values, static_cast(elements), begin_bit, end_bit, is_overwrite_ok, launch.get_stream()); }); } template void radix_sort_values(std::integral_constant, nvbench::state&, nvbench::type_list) { (void) is_descending; (void) is_overwrite_ok; } template void radix_sort_values(nvbench::state& state, nvbench::type_list tl) { using offset_t = cub::detail::choose_offset_t; radix_sort_values(std::integral_constant()>{}, state, tl); } #ifdef TUNE_KeyT using key_types = nvbench::type_list; #else // !defined(TUNE_KeyT) using key_types = integral_types; #endif // TUNE_KeyT #ifdef TUNE_ValueT using value_types = nvbench::type_list; #else // !defined(Tune_ValueT) using value_types = nvbench::type_list; #endif // TUNE_ValueT NVBENCH_BENCH_TYPES(radix_sort_values, NVBENCH_TYPE_AXES(key_types, value_types, offset_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.201"}); cccl-2.5.0/cub/benchmarks/bench/reduce/000077500000000000000000000000001463375617100176535ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/reduce/base.cuh000066400000000000000000000111041463375617100212630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #ifndef TUNE_BASE # define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2) #endif #if !TUNE_BASE template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK; static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD; static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD; using ReducePolicy = cub::AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template void reduce(nvbench::state& state, nvbench::type_list) { using accum_t = T; using input_it_t = const T*; using output_it_t = T*; using offset_t = cub::detail::choose_offset_t; using output_t = T; using init_t = T; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchReduce; #else // TUNE_BASE using dispatch_t = cub::DispatchReduce; #endif // TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in = generate(elements); thrust::device_vector out(1); input_it_t d_in = thrust::raw_pointer_cast(in.data()); output_it_t d_out = thrust::raw_pointer_cast(out.data()); // Enable throughput calculations and add "Size" column to results. state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(1); // Allocate temporary storage: std::size_t temp_size; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_out, static_cast(elements), op_t{}, init_t{}, 0 /* stream */); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_out, static_cast(elements), op_t{}, init_t{}, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); cccl-2.5.0/cub/benchmarks/bench/reduce/by_key.cu000066400000000000000000000162141463375617100214720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD struct device_reduce_by_key_policy_hub { struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> { using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy; }; using MaxPolicy = Policy350; }; #endif // !TUNE_BASE template static void reduce(nvbench::state& state, nvbench::type_list) { using keys_input_it_t = const KeyT*; using unique_output_it_t = KeyT*; using vals_input_it_t = const ValueT*; using aggregate_output_it_t = ValueT*; using num_runs_output_iterator_t = OffsetT*; using equality_op_t = cub::Equality; using reduction_op_t = cub::Sum; using accum_t = ValueT; using offset_t = OffsetT; #if !TUNE_BASE using dispatch_t = cub::DispatchReduceByKey< keys_input_it_t, unique_output_it_t, vals_input_it_t, aggregate_output_it_t, num_runs_output_iterator_t, equality_op_t, reduction_op_t, offset_t, accum_t, device_reduce_by_key_policy_hub>; #else using dispatch_t = cub::DispatchReduceByKey< keys_input_it_t, unique_output_it_t, vals_input_it_t, aggregate_output_it_t, num_runs_output_iterator_t, equality_op_t, reduction_op_t, offset_t, accum_t>; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); thrust::device_vector num_runs_out(1); thrust::device_vector in_vals(elements); thrust::device_vector out_vals(elements); thrust::device_vector out_keys(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); KeyT* d_out_keys = thrust::raw_pointer_cast(out_keys.data()); ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data()); ValueT* d_out_vals = thrust::raw_pointer_cast(out_vals.data()); OffsetT* d_num_runs_out = 
thrust::raw_pointer_cast(num_runs_out.data()); std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_keys, d_in_vals, d_out_vals, d_num_runs_out, equality_op_t{}, reduction_op_t{}, elements, 0); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_keys, d_in_vals, d_out_vals, d_num_runs_out, equality_op_t{}, reduction_op_t{}, elements, 0); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(1); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_keys, d_in_vals, d_out_vals, d_num_runs_out, equality_op_t{}, reduction_op_t{}, elements, launch.get_stream()); }); } using some_offset_types = nvbench::type_list; #ifdef TUNE_KeyT using key_types = nvbench::type_list; #else // !defined(TUNE_KeyT) using key_types = nvbench::type_list; #endif // TUNE_KeyT #ifdef TUNE_ValueT using value_types = nvbench::type_list; #else // !defined(TUNE_ValueT) using value_types = all_types; #endif // TUNE_ValueT NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(key_types, value_types, some_offset_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); cccl-2.5.0/cub/benchmarks/bench/reduce/max.cu000066400000000000000000000036541463375617100210010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 using op_t = max_t; #include "base.cuh" cccl-2.5.0/cub/benchmarks/bench/reduce/sum.cu000066400000000000000000000036571463375617100210230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 using op_t = cub::Sum; #include "base.cuh" cccl-2.5.0/cub/benchmarks/bench/run_length_encode/000077500000000000000000000000001463375617100220665ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/run_length_encode/encode.cu000066400000000000000000000151041463375617100236550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD struct device_reduce_by_key_policy_hub { struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> { using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy; }; using MaxPolicy = Policy350; }; #endif // !TUNE_BASE template static void rle(nvbench::state& state, nvbench::type_list) { using offset_t = OffsetT; using keys_input_it_t = const T*; using unique_output_it_t = T*; using vals_input_it_t = cub::ConstantInputIterator; using aggregate_output_it_t = offset_t*; using num_runs_output_iterator_t = offset_t*; using equality_op_t = cub::Equality; using reduction_op_t = cub::Sum; using accum_t = offset_t; #if !TUNE_BASE using dispatch_t = cub::DispatchReduceByKey< keys_input_it_t, unique_output_it_t, vals_input_it_t, aggregate_output_it_t, num_runs_output_iterator_t, equality_op_t, reduction_op_t, offset_t, accum_t, device_reduce_by_key_policy_hub>; #else using policy_t = cub::detail::device_run_length_encode_policy_hub; using dispatch_t = cub::DispatchReduceByKey< keys_input_it_t, unique_output_it_t, vals_input_it_t, aggregate_output_it_t, num_runs_output_iterator_t, equality_op_t, reduction_op_t, offset_t, accum_t, policy_t>; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); thrust::device_vector num_runs_out(1); thrust::device_vector out_vals(elements); thrust::device_vector out_keys(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); T* d_out_keys = thrust::raw_pointer_cast(out_keys.data()); offset_t* d_out_vals = thrust::raw_pointer_cast(out_vals.data()); offset_t* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); vals_input_it_t d_in_vals(offset_t{1}); std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_keys, d_in_vals, d_out_vals, 
d_num_runs_out, equality_op_t{}, reduction_op_t{}, elements, 0); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_keys, d_in_vals, d_out_vals, d_num_runs_out, equality_op_t{}, reduction_op_t{}, elements, 0); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(1); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_keys, d_in_vals, d_out_vals, d_num_runs_out, equality_op_t{}, reduction_op_t{}, elements, launch.get_stream()); }); } using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(rle, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); cccl-2.5.0/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu000066400000000000000000000146061463375617100260210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_TIME_SLICING ts 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD struct device_rle_policy_hub { struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> { using RleSweepPolicyT = cub::AgentRlePolicy; }; using MaxPolicy = Policy350; }; #endif // !TUNE_BASE template static void rle(nvbench::state& state, nvbench::type_list) { using offset_t = OffsetT; using keys_input_it_t = const T*; using offset_output_it_t = offset_t*; using length_output_it_t = offset_t*; using num_runs_output_iterator_t = offset_t*; using equality_op_t = cub::Equality; using accum_t = offset_t; #if !TUNE_BASE using dispatch_t = cub::DeviceRleDispatch; #else using dispatch_t = cub::DeviceRleDispatch; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); thrust::device_vector num_runs_out(1); thrust::device_vector out_offsets(elements); thrust::device_vector out_lengths(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); T* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); offset_t* d_out_offsets = thrust::raw_pointer_cast(out_offsets.data()); offset_t* d_out_lengths = thrust::raw_pointer_cast(out_lengths.data()); offset_t* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_offsets, d_out_lengths, d_num_runs_out, equality_op_t{}, elements, 0); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_offsets, d_out_lengths, d_num_runs_out, equality_op_t{}, elements, 0); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(1); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_out_offsets, d_out_lengths, d_num_runs_out, equality_op_t{}, elements, launch.get_stream()); }); } using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(rle, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); 
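// Note: the benchmark above drives cub::DeviceRleDispatch directly so that the %RANGE% tuning
// macros can inject a custom policy. Outside the tuning harness, the same measurement should
// correspond roughly to the public API sketched below (illustrative only; it reuses the device
// pointers defined in rle() and assumes `elements` fits in the int num_items parameter):
//
//   std::size_t bytes{};
//   cub::DeviceRunLengthEncode::NonTrivialRuns(
//     nullptr, bytes, d_in_keys, d_out_offsets, d_out_lengths, d_num_runs_out,
//     static_cast<int>(elements));
//   thrust::device_vector<std::uint8_t> storage(bytes);
//   cub::DeviceRunLengthEncode::NonTrivialRuns(
//     thrust::raw_pointer_cast(storage.data()), bytes, d_in_keys, d_out_offsets, d_out_lengths,
//     d_num_runs_out, static_cast<int>(elements));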
cccl-2.5.0/cub/benchmarks/bench/scan/000077500000000000000000000000001463375617100173305ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/scan/exclusive/000077500000000000000000000000001463375617100213375ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/scan/exclusive/base.cuh000066400000000000000000000123401463375617100227520ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD template struct policy_hub_t { template using agent_policy_t = cub::AgentScanPolicy< NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT, LOAD_ALGORITHM, LOAD_MODIFIER, STORE_ALGORITHM, SCAN_ALGORITHM, cub::MemBoundScaling, delay_constructor_t>; struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { using ScanPolicyT = agent_policy_t; }; using MaxPolicy = policy_t; }; #endif // TUNE_BASE template static void basic(nvbench::state& state, nvbench::type_list) { using init_t = cub::detail::InputValue; using accum_t = cub::detail::accumulator_t; using input_it_t = const T*; using output_it_t = T*; using offset_t = OffsetT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchScan; #else using dispatch_t = cub::DispatchScan; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector input = generate(elements); thrust::device_vector output(elements); T* d_input = thrust::raw_pointer_cast(input.data()); T* d_output = thrust::raw_pointer_cast(output.data()); state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(elements); size_t tmp_size; dispatch_t::Dispatch( nullptr, tmp_size, d_input, d_output, op_t{}, init_t{T{}}, static_cast(input.size()), 0 /* stream */); thrust::device_vector tmp(tmp_size); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( thrust::raw_pointer_cast(tmp.data()), tmp_size, d_input, d_output, op_t{}, init_t{T{}}, static_cast(input.size()), launch.get_stream()); }); } using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); cccl-2.5.0/cub/benchmarks/bench/scan/exclusive/by_key.cu000066400000000000000000000137401463375617100231570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:2:1 #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # define TUNE_STORE_ALGORITHM cub::BLOCK_STORE_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { using ScanByKeyPolicyT = cub::AgentScanByKeyPolicy< TUNE_THREADS, TUNE_ITEMS, // TODO Tune TUNE_LOAD_ALGORITHM, TUNE_LOAD_MODIFIER, cub::BLOCK_SCAN_WARP_SCANS, TUNE_STORE_ALGORITHM, delay_constructor_t>; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template static void scan(nvbench::state& state, nvbench::type_list) { using init_value_t = ValueT; using op_t = cub::Sum; using accum_t = cub::detail::accumulator_t; using key_input_it_t = const KeyT*; using val_input_it_t = const ValueT*; using val_output_it_t = ValueT*; using equality_op_t = cub::Equality; using offset_t = OffsetT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchScanByKey< key_input_it_t, val_input_it_t, val_output_it_t, equality_op_t, op_t, init_value_t, offset_t, accum_t, policy_t>; #else // TUNE_BASE using dispatch_t = cub:: DispatchScanByKey; #endif // TUNE_BASE const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in_vals(elements); thrust::device_vector out_vals(elements); thrust::device_vector keys = generate.uniform.key_segments(elements, 0, 5200); KeyT* d_keys = thrust::raw_pointer_cast(keys.data()); ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data()); ValueT* d_out_vals = thrust::raw_pointer_cast(out_vals.data()); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); size_t tmp_size; dispatch_t::Dispatch( nullptr, tmp_size, d_keys, d_in_vals, d_out_vals, equality_op_t{}, op_t{}, init_value_t{}, static_cast(elements), 0 /* stream */); thrust::device_vector tmp(tmp_size); nvbench::uint8_t* d_tmp = thrust::raw_pointer_cast(tmp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( d_tmp, tmp_size, d_keys, 
d_in_vals, d_out_vals, equality_op_t{}, op_t{}, init_value_t{}, static_cast(elements), launch.get_stream()); }); } using some_offset_types = nvbench::type_list; #ifdef TUNE_KeyT using key_types = nvbench::type_list; #else // !defined(TUNE_KeyT) using key_types = all_types; #endif // TUNE_KeyT #ifdef TUNE_ValueT using value_types = nvbench::type_list; #else // !defined(TUNE_ValueT) using value_types = nvbench::type_list; #endif // TUNE_ValueT NVBENCH_BENCH_TYPES(scan, NVBENCH_TYPE_AXES(key_types, value_types, some_offset_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); cccl-2.5.0/cub/benchmarks/bench/scan/exclusive/max.cu000066400000000000000000000040551463375617100224610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:2:1 #include using op_t = max_t; #include "base.cuh" cccl-2.5.0/cub/benchmarks/bench/scan/exclusive/sum.cu000066400000000000000000000040601463375617100224740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:2:1 using op_t = cub::Sum; #include "base.cuh" cccl-2.5.0/cub/benchmarks/bench/segmented_sort/000077500000000000000000000000001463375617100214265ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/segmented_sort/keys.cu000066400000000000000000000245011463375617100227340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include // %RANGE% TUNE_L_ITEMS ipt 7:24:1 // %RANGE% TUNE_M_ITEMS ipmw 1:17:1 // %RANGE% TUNE_S_ITEMS ipsw 1:17:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_SW_THREADS_POW2 tpsw 1:4:1 // %RANGE% TUNE_MW_THREADS_POW2 tpmw 1:5:1 // %RANGE% TUNE_RADIX_BITS bits 4:8:1 // %RANGE% TUNE_PARTITIONING_THRESHOLD pt 100:800:50 // %RANGE% TUNE_RANK_ALGORITHM ra 0:4:1 // %RANGE% TUNE_LOAD ld 0:2:1 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_S_LOAD sld 0:2:1 // %RANGE% TUNE_S_TRANSPOSE strp 0:1:1 // %RANGE% TUNE_M_LOAD mld 0:2:1 // %RANGE% TUNE_M_TRANSPOSE mtrp 0:1:1 #if !TUNE_BASE # define TUNE_SW_THREADS (1 << TUNE_SW_THREADS_POW2) # define TUNE_MW_THREADS (1 << TUNE_MW_THREADS_POW2) # define SMALL_SEGMENT_SIZE TUNE_S_ITEMS* TUNE_SW_THREADS # define MEDIUM_SEGMENT_SIZE TUNE_M_ITEMS* TUNE_MW_THREADS # define LARGE_SEGMENT_SIZE TUNE_L_ITEMS* TUNE_THREADS # if (LARGE_SEGMENT_SIZE <= SMALL_SEGMENT_SIZE) || (LARGE_SEGMENT_SIZE <= MEDIUM_SEGMENT_SIZE) # error Large segment size must be larger than small and medium segment sizes # endif # if (MEDIUM_SEGMENT_SIZE <= SMALL_SEGMENT_SIZE) # error Medium segment size must be larger than small one # endif # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # elif TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_LDG # else // TUNE_LOAD == 2 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD # if TUNE_S_LOAD == 0 # define TUNE_S_LOAD_MODIFIER cub::LOAD_DEFAULT # elif TUNE_S_LOAD == 1 # define TUNE_S_LOAD_MODIFIER cub::LOAD_LDG # else // TUNE_S_LOAD == 2 # define TUNE_S_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_S_LOAD # if TUNE_M_LOAD == 0 # define TUNE_M_LOAD_MODIFIER cub::LOAD_DEFAULT # elif TUNE_M_LOAD == 1 # define TUNE_M_LOAD_MODIFIER cub::LOAD_LDG # else // TUNE_M_LOAD == 2 # define TUNE_M_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_M_LOAD # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_S_TRANSPOSE == 0 # define TUNE_S_LOAD_ALGORITHM cub::WarpLoadAlgorithm::WARP_LOAD_DIRECT # else // TUNE_S_TRANSPOSE == 1 # define TUNE_S_LOAD_ALGORITHM cub::WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE # endif // TUNE_S_TRANSPOSE # if TUNE_M_TRANSPOSE == 0 # define TUNE_M_LOAD_ALGORITHM cub::WarpLoadAlgorithm::WARP_LOAD_DIRECT # else // TUNE_M_TRANSPOSE == 1 # define TUNE_M_LOAD_ALGORITHM cub::WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE # endif // TUNE_M_TRANSPOSE template struct device_seg_sort_policy_hub { using DominantT = KeyT; struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> { static constexpr int BLOCK_THREADS = TUNE_THREADS; static constexpr int RADIX_BITS = TUNE_RADIX_BITS; static constexpr int PARTITIONING_THRESHOLD = TUNE_PARTITIONING_THRESHOLD; using LargeSegmentPolicy = cub::AgentRadixSortDownsweepPolicy< BLOCK_THREADS, TUNE_L_ITEMS, DominantT, TUNE_LOAD_ALGORITHM, TUNE_LOAD_MODIFIER, static_cast(TUNE_RANK_ALGORITHM), cub::BLOCK_SCAN_WARP_SCANS, RADIX_BITS>; static constexpr int ITEMS_PER_SMALL_THREAD = TUNE_S_ITEMS; static constexpr int ITEMS_PER_MEDIUM_THREAD = TUNE_M_ITEMS; using SmallAndMediumSegmentedSortPolicyT = cub::AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub:: AgentSubWarpMergeSortPolicy, // Medium policy cub::AgentSubWarpMergeSortPolicy>; }; using MaxPolicy = Policy350; }; #endif // !TUNE_BASE template void 
seg_sort(nvbench::state& state, nvbench::type_list ts, const thrust::device_vector& offsets, bit_entropy entropy) { constexpr bool is_descending = false; constexpr bool is_overwrite_ok = false; using offset_t = OffsetT; using begin_offset_it_t = const offset_t*; using end_offset_it_t = const offset_t*; using key_t = T; using value_t = cub::NullType; #if !TUNE_BASE using policy_t = device_seg_sort_policy_hub; using dispatch_t = // cub::DispatchSegmentedSort; #else using dispatch_t = // cub::DispatchSegmentedSort; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); const auto segments = offsets.size() - 1; thrust::device_vector buffer_1 = generate(elements, entropy); thrust::device_vector buffer_2(elements); key_t* d_buffer_1 = thrust::raw_pointer_cast(buffer_1.data()); key_t* d_buffer_2 = thrust::raw_pointer_cast(buffer_2.data()); cub::DoubleBuffer d_keys(d_buffer_1, d_buffer_2); cub::DoubleBuffer d_values; begin_offset_it_t d_begin_offsets = thrust::raw_pointer_cast(offsets.data()); end_offset_it_t d_end_offsets = d_begin_offsets + 1; state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); state.add_global_memory_reads(segments + 1); std::size_t temp_storage_bytes{}; std::uint8_t* d_temp_storage{}; dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, elements, segments, d_begin_offsets, d_end_offsets, is_overwrite_ok, 0); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, keys, values, elements, segments, d_begin_offsets, d_end_offsets, is_overwrite_ok, launch.get_stream()); }); } using some_offset_types = nvbench::type_list; template void power_law(nvbench::state& state, nvbench::type_list ts) { const auto elements = static_cast(state.get_int64("Elements{io}")); const auto segments = static_cast(state.get_int64("Segments{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); thrust::device_vector offsets = generate.power_law.segment_offsets(elements, segments); seg_sort(state, ts, offsets, entropy); } NVBENCH_BENCH_TYPES(power_law, NVBENCH_TYPE_AXES(fundamental_types, some_offset_types)) .set_name("power") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4)) .add_int64_power_of_two_axis("Segments{io}", nvbench::range(12, 20, 4)) .add_string_axis("Entropy", {"1.000", "0.201"}); template void uniform(nvbench::state& state, nvbench::type_list ts) { const auto elements = static_cast(state.get_int64("Elements{io}")); const auto max_segment_size = static_cast(state.get_int64("MaxSegmentSize")); const auto max_segment_size_log = static_cast(std::log2(max_segment_size)); const auto min_segment_size = 1 << (max_segment_size_log - 1); thrust::device_vector offsets = generate.uniform.segment_offsets(elements, min_segment_size, max_segment_size); seg_sort(state, ts, offsets, bit_entropy::_1_000); } NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(fundamental_types, some_offset_types)) .set_name("small") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4)) .add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(1, 8, 1)); 
NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(fundamental_types, some_offset_types)) .set_name("large") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4)) .add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(10, 18, 2)); cccl-2.5.0/cub/benchmarks/bench/select/000077500000000000000000000000001463375617100176635ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/select/flagged.cu000066400000000000000000000141451463375617100216120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #include #include // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = false; constexpr bool may_alias = false; #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = cub::AgentSelectIfPolicy; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template void select(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using flag_it_t = const bool*; using output_it_t = T*; using num_selected_it_t = OffsetT*; using select_op_t = cub::NullType; using equality_op_t = cub::NullType; using offset_t = OffsetT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias, policy_t>; #else // TUNE_BASE using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias>; #endif // !TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); auto generator = generate(elements, entropy); thrust::device_vector in = generator; thrust::device_vector flags = generator; thrust::device_vector num_selected(1); // TODO Extract into helper TU const auto selected_elements = thrust::count(flags.cbegin(), flags.cend(), true); thrust::device_vector out(selected_elements); input_it_t d_in = thrust::raw_pointer_cast(in.data()); flag_it_t d_flags = thrust::raw_pointer_cast(flags.data()); output_it_t d_out = thrust::raw_pointer_cast(out.data()); num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(selected_elements); state.add_global_memory_writes(1); std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op_t{}, equality_op_t{}, elements, 0); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_flags, d_out, d_num_selected, select_op_t{}, equality_op_t{}, elements, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) 
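// Sweep 2^16..2^28 elements; the Entropy axis controls the selection flags
// (1.000 = every flag set, 0.544 = each flag set with probability ~0.54, 0.000 = no flags set).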
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); cccl-2.5.0/cub/benchmarks/bench/select/if.cu000066400000000000000000000151051463375617100206140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #include #include #include // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = false; constexpr bool may_alias = false; #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD; static constexpr int ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = cub::AgentSelectIfPolicy; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template struct less_then_t { T m_val; __device__ bool operator()(const T& val) const { return val < m_val; } }; template T value_from_entropy(double percentage) { if (percentage == 1) { return std::numeric_limits::max(); } const auto max_val = static_cast(std::numeric_limits::max()); const auto min_val = static_cast(std::numeric_limits::lowest()); const auto result = min_val + percentage * max_val - percentage * min_val; return static_cast(result); } template void select(nvbench::state& state, nvbench::type_list) { using input_it_t = const T*; using flag_it_t = cub::NullType*; using output_it_t = T*; using num_selected_it_t = OffsetT*; using select_op_t = less_then_t; using equality_op_t = cub::NullType; using offset_t = OffsetT; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias, policy_t>; #else // TUNE_BASE using dispatch_t = cub::DispatchSelectIf< input_it_t, flag_it_t, output_it_t, num_selected_it_t, select_op_t, equality_op_t, offset_t, keep_rejects, may_alias>; #endif // TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); T val = value_from_entropy(entropy_to_probability(entropy)); select_op_t select_op{val}; thrust::device_vector in = generate(elements); thrust::device_vector num_selected(1); // TODO Extract into helper TU const auto selected_elements = thrust::count_if(in.cbegin(), in.cend(), select_op); thrust::device_vector out(selected_elements); input_it_t d_in = thrust::raw_pointer_cast(in.data()); flag_it_t d_flags = nullptr; output_it_t d_out = thrust::raw_pointer_cast(out.data()); num_selected_it_t d_num_selected = thrust::raw_pointer_cast(num_selected.data()); state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(selected_elements); state.add_global_memory_writes(1); std::size_t temp_size{}; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_flags, d_out, d_num_selected, select_op, equality_op_t{}, elements, 0); thrust::device_vector temp(temp_size); auto* temp_storage = 
thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_flags, d_out, d_num_selected, select_op, equality_op_t{}, elements, launch.get_stream()); }); } NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(fundamental_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"}); cccl-2.5.0/cub/benchmarks/bench/select/unique_by_key.cu000066400000000000000000000160071463375617100230700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE # if TUNE_TRANSPOSE == 0 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT # else // TUNE_TRANSPOSE == 1 # define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE # endif // TUNE_TRANSPOSE # if TUNE_LOAD == 0 # define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT # else // TUNE_LOAD == 1 # define TUNE_LOAD_MODIFIER cub::LOAD_CA # endif // TUNE_LOAD struct policy_hub { struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> { using UniqueByKeyPolicyT = cub::AgentUniqueByKeyPolicy; }; using MaxPolicy = Policy350; }; #endif // !TUNE_BASE template static void select(nvbench::state& state, nvbench::type_list) { using keys_input_it_t = const KeyT*; using keys_output_it_t = KeyT*; using vals_input_it_t = const ValueT*; using vals_output_it_t = ValueT*; using num_runs_output_iterator_t = OffsetT*; using equality_op_t = cub::Equality; using offset_t = OffsetT; #if !TUNE_BASE using dispatch_t = cub::DispatchUniqueByKey< keys_input_it_t, vals_input_it_t, keys_output_it_t, vals_output_it_t, num_runs_output_iterator_t, equality_op_t, offset_t, policy_hub>; #else using dispatch_t = cub::DispatchUniqueByKey; #endif const auto elements = static_cast(state.get_int64("Elements{io}")); constexpr std::size_t min_segment_size = 1; const std::size_t max_segment_size = static_cast(state.get_int64("MaxSegSize")); thrust::device_vector num_runs_out(1); thrust::device_vector in_vals(elements); thrust::device_vector out_vals(elements); thrust::device_vector out_keys(elements); thrust::device_vector in_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); KeyT* d_in_keys = thrust::raw_pointer_cast(in_keys.data()); KeyT* d_out_keys = thrust::raw_pointer_cast(out_keys.data()); ValueT* d_in_vals = thrust::raw_pointer_cast(in_vals.data()); ValueT* d_out_vals = thrust::raw_pointer_cast(out_vals.data()); OffsetT* d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data()); std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_in_vals, d_out_keys, d_out_vals, d_num_runs_out, equality_op_t{}, elements, 0); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_in_vals, d_out_keys, d_out_vals, d_num_runs_out, equality_op_t{}, elements, 0); cudaDeviceSynchronize(); const OffsetT num_runs = num_runs_out[0]; state.add_element_count(elements); state.add_global_memory_reads(elements); state.add_global_memory_reads(elements); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(num_runs); state.add_global_memory_writes(1); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( d_temp_storage, temp_storage_bytes, d_in_keys, d_in_vals, d_out_keys, d_out_vals, d_num_runs_out, equality_op_t{}, elements, launch.get_stream()); }); } using some_offset_types = nvbench::type_list; #ifdef TUNE_KeyT using key_types = nvbench::type_list; #else // !defined(TUNE_KeyT) using key_types = nvbench::type_list; #endif // TUNE_KeyT #ifdef TUNE_ValueT 
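// When the tuning harness pins TUNE_ValueT, benchmark only that value type;
// otherwise sweep every type in nvbench_helper's all_types list.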
using value_types = nvbench::type_list; #else // !defined(TUNE_ValueT) using value_types = all_types; #endif // TUNE_ValueT NVBENCH_BENCH_TYPES(select, NVBENCH_TYPE_AXES(key_types, value_types, some_offset_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8}); cccl-2.5.0/cub/benchmarks/bench/transform_reduce/000077500000000000000000000000001463375617100217465ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/bench/transform_reduce/sum.cu000066400000000000000000000162441463375617100231120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include "thrust/iterator/transform_iterator.h" #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 #ifndef TUNE_BASE # define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2) #endif #if !TUNE_BASE template struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> { static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK; static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD; static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD; using ReducePolicy = cub::AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; using MaxPolicy = policy_t; }; #endif // !TUNE_BASE template struct square_t { __host__ __device__ T operator()(const T& x) const { return x * x; } }; #define USE_TRANSPOSE_ITERATOR 0 #if USE_TRANSPOSE_ITERATOR template void reduce(nvbench::state& state, nvbench::type_list) { using accum_t = T; using input_it_t = thrust::transform_iterator, typename thrust::device_vector::iterator>; using output_it_t = T*; using offset_t = cub::detail::choose_offset_t; using output_t = T; using init_t = T; using reduction_op_t = cub::Sum; using transform_op_t = square_t; # if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchReduce; # else // TUNE_BASE using dispatch_t = cub::DispatchReduce; # endif // TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in = generate(elements); thrust::device_vector out(1); input_it_t d_in = thrust::make_transform_iterator(in.begin(), square_t{}); output_it_t d_out = thrust::raw_pointer_cast(out.data()); // Enable throughput calculations and add "Size" column to results. state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(1); // Allocate temporary storage: std::size_t temp_size; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_out, static_cast(elements), reduction_op_t{}, init_t{}, 0 /* stream */); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_out, static_cast(elements), reduction_op_t{}, init_t{}, launch.get_stream()); }); } #else template void reduce(nvbench::state& state, nvbench::type_list) { using accum_t = T; using input_it_t = const T*; using output_it_t = T*; using offset_t = cub::detail::choose_offset_t; using output_t = T; using init_t = T; using reduction_op_t = cub::Sum; using transform_op_t = square_t; # if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub:: DispatchTransformReduce; # else // TUNE_BASE using dispatch_t = cub::DispatchTransformReduce; # endif // TUNE_BASE // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in = generate(elements); thrust::device_vector out(1); input_it_t d_in = thrust::raw_pointer_cast(in.data()); output_it_t d_out = thrust::raw_pointer_cast(out.data()); // Enable throughput calculations and add "Size" column to results. 
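// Every input element is read exactly once and a single accumulator is written back,
// so nvbench reports bandwidth from elements * sizeof(T) bytes read plus one T written.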
state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(1); // Allocate temporary storage: std::size_t temp_size; dispatch_t::Dispatch( nullptr, temp_size, d_in, d_out, static_cast(elements), reduction_op_t{}, init_t{}, 0 /* stream */, transform_op_t{}); thrust::device_vector temp(temp_size); auto* temp_storage = thrust::raw_pointer_cast(temp.data()); state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { dispatch_t::Dispatch( temp_storage, temp_size, d_in, d_out, static_cast(elements), reduction_op_t{}, init_t{}, launch.get_stream(), transform_op_t{}); }); } #endif NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); cccl-2.5.0/cub/benchmarks/docker/000077500000000000000000000000001463375617100165745ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/docker/.gitignore000066400000000000000000000000131463375617100205560ustar00rootroot00000000000000Dockerfile cccl-2.5.0/cub/benchmarks/docker/recipe.py000066400000000000000000000013111463375617100204110ustar00rootroot00000000000000#!/usr/bin/env python import hpccm hpccm.config.set_container_format('docker') Stage0 += hpccm.primitives.baseimage(image='nvidia/cuda:12.2.0-devel-ubuntu22.04') Stage0 += hpccm.building_blocks.apt_get(ospackages=['git', 'tmux', 'gcc', 'g++', 'vim', 'python3', 'python-is-python3', 'ninja-build']) # Stage0 += hpccm.building_blocks.llvm(version='15', extra_tools=True, toolset=True) Stage0 += hpccm.building_blocks.cmake(eula=True, version='3.26.3') # Stage0 += hpccm.building_blocks.nsight_compute(eula=True, version='2023.1.1') Stage0 += hpccm.building_blocks.pip(packages=['fpzip', 'numpy', 'pandas', 'pynvml'], pip='pip3') Stage0 += hpccm.primitives.environment(variables={'CUDA_MODULE_LOADING': 'EAGER'}) cccl-2.5.0/cub/benchmarks/nvbench_helper/000077500000000000000000000000001463375617100203075ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/nvbench_helper/CMakeLists.txt000066400000000000000000000043261463375617100230540ustar00rootroot00000000000000# Fetch nvbench CPMAddPackage("gh:NVIDIA/nvbench#main") add_library(nvbench_helper OBJECT nvbench_helper/nvbench_helper.cuh nvbench_helper/nvbench_helper.cu) target_link_libraries(nvbench_helper PUBLIC CUB::CUB Thrust::Thrust CUB::libcudacxx nvbench::nvbench PRIVATE CUDA::curand) target_include_directories(nvbench_helper PUBLIC "${CMAKE_CURRENT_LIST_DIR}/nvbench_helper") set_target_properties(nvbench_helper PROPERTIES CUDA_STANDARD 17 CXX_STANDARD 17) CPMAddPackage("gh:catchorg/Catch2@2.13.9") option(CUB_ENABLE_NVBENCH_HELPER_TESTS "Enable tests for nvbench_helper" OFF) mark_as_advanced(CUB_ENABLE_NVBENCH_HELPER_TESTS) if (CUB_ENABLE_NVBENCH_HELPER_TESTS) CPMAddPackage(NAME Boost VERSION 1.83.0 GITHUB_REPOSITORY "boostorg/boost" GIT_TAG "boost-1.83.0") function(add_nvbench_helper_test device_system) set(nvbench_helper_test_target nvbench_helper.test.${device_system}) add_executable(${nvbench_helper_test_target} test/gen_seed.cu test/gen_range.cu test/gen_entropy.cu test/gen_uniform_distribution.cu test/gen_power_law_distribution.cu test/main.cpp) target_link_libraries(${nvbench_helper_test_target} PRIVATE nvbench_helper Catch2::Catch2 Boost::math) if ("${device_system}" STREQUAL "cpp") target_compile_definitions(${nvbench_helper_test_target} PRIVATE THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) endif() 
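# Place the test binaries in the shared CUB output directories and build them
# with the C++17 / CUDA C++17 standards used by the rest of the project.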
set_target_properties(${nvbench_helper_test_target} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" LIBRARY_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${CUB_EXECUTABLE_OUTPUT_DIR}" CUDA_STANDARD 17 CXX_STANDARD 17) endfunction() add_nvbench_helper_test(cpp) add_nvbench_helper_test(cuda) endif() cccl-2.5.0/cub/benchmarks/nvbench_helper/nvbench_helper/000077500000000000000000000000001463375617100232715ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/nvbench_helper/nvbench_helper/look_back_helper.cuh000066400000000000000000000056421463375617100272640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #if !TUNE_BASE # include # include # if !defined(TUNE_MAGIC_NS) || !defined(TUNE_L2_WRITE_LATENCY_NS) || !defined(TUNE_DELAY_CONSTRUCTOR_ID) # error "TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS, and TUNE_DELAY_CONSTRUCTOR_ID must be defined" # endif using delay_constructors = nvbench::type_list< cub::detail::no_delay_constructor_t, cub::detail::fixed_delay_constructor_t, cub::detail::exponential_backoff_constructor_t, cub::detail::exponential_backoff_jitter_constructor_t, cub::detail::exponential_backoff_jitter_window_constructor_t, cub::detail::exponential_backon_jitter_window_constructor_t, cub::detail::exponential_backon_jitter_constructor_t, cub::detail::exponential_backon_constructor_t>; using delay_constructor_t = nvbench::tl::get; #endif // !TUNE_BASE cccl-2.5.0/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cu000066400000000000000000000570421463375617100266140ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "thrust/device_vector.h" #include #include namespace { constexpr double lognormal_mean = 3.0; constexpr double lognormal_sigma = 1.2; enum class executor { host, device }; class host_generator_t { public: template void generate(seed_t seed, cuda::std::span device_span, bit_entropy entropy, T min, T max); const double* new_uniform_distribution(seed_t seed, std::size_t num_items); const double* new_lognormal_distribution(seed_t seed, std::size_t num_items); const double* new_constant(std::size_t num_items, double val); private: thrust::host_vector m_distribution; }; const double* host_generator_t::new_uniform_distribution(seed_t seed, std::size_t num_items) { m_distribution.resize(num_items); double* h_distribution = thrust::raw_pointer_cast(m_distribution.data()); std::default_random_engine re(seed.get()); std::uniform_real_distribution dist(0.0, 1.0); for (std::size_t i = 0; i < num_items; i++) { h_distribution[i] = dist(re); } return h_distribution; } const double* host_generator_t::new_lognormal_distribution(seed_t seed, std::size_t num_items) { m_distribution.resize(num_items); double* h_distribution = thrust::raw_pointer_cast(m_distribution.data()); std::default_random_engine re(seed.get()); std::lognormal_distribution dist(lognormal_mean, lognormal_sigma); for (std::size_t i = 0; i < num_items; i++) { h_distribution[i] = dist(re); } return h_distribution; } const double* host_generator_t::new_constant(std::size_t num_items, double val) { m_distribution.resize(num_items); double* h_distribution = thrust::raw_pointer_cast(m_distribution.data()); thrust::fill_n(thrust::host, h_distribution, num_items, val); return h_distribution; } class device_generator_t { public: device_generator_t() { curandCreateGenerator(&m_gen, CURAND_RNG_PSEUDO_DEFAULT); } ~device_generator_t() { curandDestroyGenerator(m_gen); } template void generate(seed_t seed, cuda::std::span device_span, bit_entropy entropy, T min, T max); const double* new_uniform_distribution(seed_t seed, std::size_t num_items); const double* new_lognormal_distribution(seed_t seed, std::size_t num_items); const double* new_constant(std::size_t num_items, double val); private: curandGenerator_t m_gen; thrust::device_vector m_distribution; }; template struct random_to_item_t { double m_min; double m_max; __host__ __device__ random_to_item_t(T min, T max) : m_min(static_cast(min)) 
, m_max(static_cast(max)) {} __host__ __device__ T operator()(double random_value) const { if constexpr (std::is_floating_point_v) { return static_cast((m_max - m_min) * random_value + m_min); } else { return static_cast(floor((m_max - m_min + 1) * random_value + m_min)); } } }; const double* device_generator_t::new_uniform_distribution(seed_t seed, std::size_t num_items) { m_distribution.resize(num_items); double* d_distribution = thrust::raw_pointer_cast(m_distribution.data()); curandSetPseudoRandomGeneratorSeed(m_gen, seed.get()); curandGenerateUniformDouble(m_gen, d_distribution, num_items); return d_distribution; } const double* device_generator_t::new_lognormal_distribution(seed_t seed, std::size_t num_items) { m_distribution.resize(num_items); double* d_distribution = thrust::raw_pointer_cast(m_distribution.data()); curandSetPseudoRandomGeneratorSeed(m_gen, seed.get()); curandGenerateLogNormalDouble(m_gen, d_distribution, num_items, lognormal_mean, lognormal_sigma); return d_distribution; } const double* device_generator_t::new_constant(std::size_t num_items, double val) { m_distribution.resize(num_items); double* d_distribution = thrust::raw_pointer_cast(m_distribution.data()); thrust::fill_n(thrust::device, d_distribution, num_items, val); return d_distribution; } struct and_t { template __host__ __device__ T operator()(T a, T b) const { return a & b; } __host__ __device__ float operator()(float a, float b) const { const std::uint32_t result = reinterpret_cast(a) & reinterpret_cast(b); return reinterpret_cast(result); } __host__ __device__ double operator()(double a, double b) const { const std::uint64_t result = reinterpret_cast(a) & reinterpret_cast(b); return reinterpret_cast(result); } __host__ __device__ complex operator()(complex a, complex b) const { double a_real = a.real(); double a_imag = a.imag(); double b_real = b.real(); double b_imag = b.imag(); const std::uint64_t result_real = reinterpret_cast(a_real) & reinterpret_cast(b_real); const std::uint64_t result_imag = reinterpret_cast(a_imag) & reinterpret_cast(b_imag); return {static_cast(reinterpret_cast(result_real)), static_cast(reinterpret_cast(result_imag))}; } }; struct set_real_t { complex m_min{}; complex m_max{}; complex* m_d_in{}; const double* m_d_tmp{}; __host__ __device__ void operator()(std::size_t i) const { m_d_in[i].real(random_to_item_t{m_min.real(), m_max.real()}(m_d_tmp[i])); } }; struct set_imag_t { complex m_min{}; complex m_max{}; complex* m_d_in{}; const double* m_d_tmp{}; __host__ __device__ void operator()(std::size_t i) const { m_d_in[i].imag(random_to_item_t{m_min.imag(), m_max.imag()}(m_d_tmp[i])); } }; template struct lognormal_transformer_t { std::size_t total_elements; double sum; __host__ __device__ T operator()(double val) const { return floor(val * total_elements / sum); } }; class generator_t { public: template void generate(executor exec, seed_t seed, cuda::std::span span, bit_entropy entropy, T min, T max) { construct_guard(exec); if (exec == executor::device) { this->generate(thrust::device, *m_device_generator, seed, span, entropy, min, max); } else { this->generate(thrust::host, *m_host_generator, seed, span, entropy, min, max); } } template void power_law_segment_offsets(executor exec, seed_t seed, cuda::std::span span, std::size_t total_elements) { construct_guard(exec); if (exec == executor::device) { this->power_law_segment_offsets(thrust::device, *m_device_generator, seed, span, total_elements); } else { this->power_law_segment_offsets(thrust::host, *m_host_generator, seed, 
span, total_elements); } } private: void construct_guard(executor exec) { if (exec == executor::device) { if (!m_device_generator) { m_device_generator.emplace(); } } else { if (!m_host_generator) { m_host_generator.emplace(); } } } template void generate(const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span span, bit_entropy entropy, T min, T max); template void generate(const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span span, bit_entropy entropy, complex min, complex max); template void generate( const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span span, bit_entropy entropy, bool min, bool max); template void power_law_segment_offsets( const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span span, std::size_t total_elements); std::optional m_host_generator; std::optional m_device_generator; }; template void generator_t::generate( const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span span, bit_entropy entropy, T min, T max) { switch (entropy) { case bit_entropy::_1_000: { const double* uniform_distribution = dist.new_uniform_distribution(seed, span.size()); thrust::transform( exec, uniform_distribution, uniform_distribution + span.size(), span.data(), random_to_item_t(min, max)); return; } case bit_entropy::_0_000: { std::mt19937 rng; rng.seed(static_cast(seed.get())); std::uniform_real_distribution dist(0.0f, 1.0f); T random_value = random_to_item_t(min, max)(dist(rng)); thrust::fill(exec, span.data(), span.data() + span.size(), random_value); return; } default: { const double* uniform_distribution = dist.new_uniform_distribution(seed, span.size()); ++seed; thrust::transform( exec, uniform_distribution, uniform_distribution + span.size(), span.data(), random_to_item_t(min, max)); const int number_of_steps = static_cast(entropy); constexpr bool is_device = std::is_same_v; using vec_t = std::conditional_t, thrust::host_vector>; vec_t tmp_vec(span.size()); cuda::std::span tmp(thrust::raw_pointer_cast(tmp_vec.data()), tmp_vec.size()); for (int i = 0; i < number_of_steps; i++, ++seed) { this->generate(is_device ? 
executor::device : executor::host, seed, tmp, bit_entropy::_1_000, min, max); thrust::transform(exec, span.data(), span.data() + span.size(), tmp.data(), span.data(), and_t{}); } return; } }; } template void generator_t::generate( const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span span, bit_entropy entropy, complex min, complex max) { switch (entropy) { case bit_entropy::_1_000: { const double* uniform_distribution = dist.new_uniform_distribution(seed, span.size()); thrust::for_each_n( exec, thrust::make_counting_iterator(0), span.size(), set_real_t{min, max, span.data(), uniform_distribution}); ++seed; uniform_distribution = dist.new_uniform_distribution(seed, span.size()); thrust::for_each_n( exec, thrust::make_counting_iterator(0), span.size(), set_imag_t{min, max, span.data(), uniform_distribution}); ++seed; return; } case bit_entropy::_0_000: { std::mt19937 rng; rng.seed(static_cast(seed.get())); std::uniform_real_distribution dist(0.0f, 1.0f); const float random_imag = random_to_item_t(min.imag(), max.imag())(dist(rng)); const float random_real = random_to_item_t(min.imag(), max.imag())(dist(rng)); thrust::fill(exec, span.data(), span.data() + span.size(), complex{random_real, random_imag}); return; } default: { const double* uniform_distribution = dist.new_uniform_distribution(seed, span.size()); thrust::for_each_n( exec, thrust::make_counting_iterator(0), span.size(), set_real_t{min, max, span.data(), uniform_distribution}); ++seed; uniform_distribution = dist.new_uniform_distribution(seed, span.size()); thrust::for_each_n( exec, thrust::make_counting_iterator(0), span.size(), set_imag_t{min, max, span.data(), uniform_distribution}); ++seed; const int number_of_steps = static_cast(entropy); constexpr bool is_device = std::is_same_v; using vec_t = std::conditional_t, thrust::host_vector>; vec_t tmp_vec(span.size()); cuda::std::span tmp(thrust::raw_pointer_cast(tmp_vec.data()), tmp_vec.size()); for (int i = 0; i < number_of_steps; i++, ++seed) { this->generate(is_device ? 
executor::device : executor::host, seed, tmp, bit_entropy::_1_000, min, max); thrust::transform(exec, span.data(), span.data() + span.size(), tmp.data(), span.data(), and_t{}); // TODO issue } return; } }; } struct random_to_probability_t { double m_probability; __host__ __device__ bool operator()(double random_value) const { return random_value < m_probability; } }; template void generator_t::generate( const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span span, bit_entropy entropy, bool /* min */, bool /* max */) { if (entropy == bit_entropy::_0_000) { thrust::fill(exec, span.data(), span.data() + span.size(), false); } else if (entropy == bit_entropy::_1_000) { thrust::fill(exec, span.data(), span.data() + span.size(), true); } else { const double* uniform_distribution = dist.new_uniform_distribution(seed, span.size()); thrust::transform( exec, uniform_distribution, uniform_distribution + span.size(), span.data(), random_to_probability_t{entropy_to_probability(entropy)}); } } template struct lognormal_adjust_t { T* segment_sizes{}; __host__ __device__ T operator()(std::size_t sid) const { return segment_sizes[sid] + 1; } }; template void generator_t::power_law_segment_offsets( const ExecT& exec, DistT& dist, seed_t seed, cuda::std::span device_segment_offsets, std::size_t total_elements) { const std::size_t total_segments = device_segment_offsets.size() - 1; const double* uniform_distribution = dist.new_lognormal_distribution(seed, total_segments); if (thrust::count(exec, uniform_distribution, uniform_distribution + total_segments, 0.0) == total_segments) { uniform_distribution = dist.new_constant(total_segments, 1.0); } const double sum = thrust::reduce(exec, uniform_distribution, uniform_distribution + total_segments); thrust::transform( exec, uniform_distribution, uniform_distribution + total_segments, device_segment_offsets.data(), lognormal_transformer_t{total_elements, sum}); const int diff = total_elements - thrust::reduce(exec, device_segment_offsets.data(), device_segment_offsets.data() + device_segment_offsets.size()); if (diff > 0) { thrust::tabulate(exec, device_segment_offsets.data(), device_segment_offsets.data() + diff, lognormal_adjust_t{device_segment_offsets.data()}); } thrust::exclusive_scan( exec, device_segment_offsets.data(), device_segment_offsets.data() + device_segment_offsets.size(), device_segment_offsets.data()); } template void gen(executor exec, seed_t seed, cuda::std::span span, bit_entropy entropy, T min, T max) { generator_t{}.generate(exec, seed, span, entropy, min, max); } } // namespace namespace detail { template void gen_host(seed_t seed, cuda::std::span span, bit_entropy entropy, T min, T max) { gen(executor::host, seed, span, entropy, min, max); } template void gen_device(seed_t seed, cuda::std::span device_span, bit_entropy entropy, T min, T max) { gen(executor::device, seed, device_span, entropy, min, max); } template struct offset_to_iterator_t { T* base_it; __host__ __device__ __forceinline__ T* operator()(std::size_t offset) const { return base_it + offset; } }; template struct repeat_index_t { __host__ __device__ __forceinline__ thrust::constant_iterator operator()(std::size_t i) { return thrust::constant_iterator(static_cast(i)); } }; struct offset_to_size_t { std::size_t* offsets = nullptr; __host__ __device__ __forceinline__ std::size_t operator()(std::size_t i) { return offsets[i + 1] - offsets[i]; } }; template void gen_key_segments(executor exec, seed_t seed, cuda::std::span keys, cuda::std::span segment_offsets) { 
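// Fill each segment [offsets[i], offsets[i + 1]) of `keys` with the constant value i:
// build per-segment source (constant iterator), destination, and size ranges, then
// batch-copy them with cub::DeviceCopy::Batched on the device, or fall back to a
// per-segment thrust::copy loop on the host.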
thrust::counting_iterator iota(0); offset_to_iterator_t dst_transform_op{keys.data()}; const std::size_t total_segments = segment_offsets.size() - 1; auto d_range_srcs = thrust::make_transform_iterator(iota, repeat_index_t{}); auto d_range_dsts = thrust::make_transform_iterator(segment_offsets.data(), dst_transform_op); auto d_range_sizes = thrust::make_transform_iterator(iota, offset_to_size_t{segment_offsets.data()}); if (exec == executor::device) { std::uint8_t* d_temp_storage = nullptr; std::size_t temp_storage_bytes = 0; cub::DeviceCopy::Batched( d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, total_segments); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceCopy::Batched( d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, total_segments); cudaDeviceSynchronize(); } else { for (std::size_t sid = 0; sid < total_segments; sid++) { thrust::copy(d_range_srcs[sid], d_range_srcs[sid] + d_range_sizes[sid], d_range_dsts[sid]); } } } template struct ge_t { T val; __host__ __device__ bool operator()(T x) { return x >= val; } }; template std::size_t gen_uniform_offsets( executor exec, seed_t seed, cuda::std::span segment_offsets, std::size_t min_segment_size, std::size_t max_segment_size) { const T total_elements = segment_offsets.size() - 2; gen(exec, seed, segment_offsets, bit_entropy::_1_000, static_cast(min_segment_size), static_cast(max_segment_size)); auto tail = [&](const auto& policy) { thrust::fill_n(policy, segment_offsets.data() + total_elements, 1, total_elements + 1); thrust::exclusive_scan( policy, segment_offsets.data(), segment_offsets.data() + segment_offsets.size(), segment_offsets.data()); auto iter = thrust::find_if( policy, segment_offsets.data(), segment_offsets.data() + segment_offsets.size(), ge_t{total_elements}); auto dist = thrust::distance(segment_offsets.data(), iter); thrust::fill_n(policy, segment_offsets.data() + dist, 1, total_elements); return dist + 1; }; if (exec == executor::device) { return tail(thrust::device); } return tail(thrust::host); } } // namespace detail namespace detail { /** * @brief Generates a vector of random key segments. * * Not all parameter combinations can be satisfied. For instance, if the total * elements is less than the minimal segment size, the function will return a * vector with a single element that is outside of the requested range. * At most one segment can be out of the requested range. 
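 * For example, requesting 10 total elements with a minimum segment size of 100
 * yields a single segment of 10 keys, which is below the requested minimum.
 * Benchmarks normally reach this helper through the
 * `generate.uniform.key_segments(elements, min, max)` facade.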
*/ template void gen_uniform_key_segments_host( seed_t seed, cuda::std::span keys, std::size_t min_segment_size, std::size_t max_segment_size) { thrust::host_vector segment_offsets(keys.size() + 2); { cuda::std::span segment_offsets_span( thrust::raw_pointer_cast(segment_offsets.data()), segment_offsets.size()); const std::size_t offsets_size = gen_uniform_offsets(executor::host, seed, segment_offsets_span, min_segment_size, max_segment_size); segment_offsets.resize(offsets_size); } cuda::std::span segment_offsets_span( thrust::raw_pointer_cast(segment_offsets.data()), segment_offsets.size()); gen_key_segments(executor::host, seed, keys, segment_offsets_span); } template void gen_uniform_key_segments_device( seed_t seed, cuda::std::span keys, std::size_t min_segment_size, std::size_t max_segment_size) { thrust::device_vector segment_offsets(keys.size() + 2); { cuda::std::span segment_offsets_span( thrust::raw_pointer_cast(segment_offsets.data()), segment_offsets.size()); const std::size_t offsets_size = gen_uniform_offsets(executor::device, seed, segment_offsets_span, min_segment_size, max_segment_size); segment_offsets.resize(offsets_size); } cuda::std::span segment_offsets_span( thrust::raw_pointer_cast(segment_offsets.data()), segment_offsets.size()); gen_key_segments(executor::device, seed, keys, segment_offsets_span); } template std::size_t gen_uniform_segment_offsets_host( seed_t seed, cuda::std::span segment_offsets, std::size_t min_segment_size, std::size_t max_segment_size) { return gen_uniform_offsets(executor::host, seed, segment_offsets, min_segment_size, max_segment_size); } template std::size_t gen_uniform_segment_offsets_device( seed_t seed, cuda::std::span segment_offsets, std::size_t min_segment_size, std::size_t max_segment_size) { return gen_uniform_offsets(executor::device, seed, segment_offsets, min_segment_size, max_segment_size); } template void gen_power_law_segment_offsets_host(seed_t seed, cuda::std::span segment_offsets, std::size_t elements) { generator_t{}.power_law_segment_offsets(executor::host, seed, segment_offsets, elements); } template void gen_power_law_segment_offsets_device(seed_t seed, cuda::std::span segment_offsets, std::size_t elements) { generator_t{}.power_law_segment_offsets(executor::device, seed, segment_offsets, elements); } void do_not_optimize(const void* ptr) { (void) ptr; } } // namespace detail #define INSTANTIATE(TYPE) \ template void detail::gen_power_law_segment_offsets_host(seed_t, cuda::std::span, std::size_t); \ template void detail::gen_power_law_segment_offsets_device(seed_t, cuda::std::span, std::size_t); \ template std::size_t detail::gen_uniform_segment_offsets_host( \ seed_t, cuda::std::span, std::size_t, std::size_t); \ template std::size_t detail::gen_uniform_segment_offsets_device( \ seed_t, cuda::std::span, std::size_t, std::size_t) INSTANTIATE(uint32_t); INSTANTIATE(uint64_t); #undef INSTANTIATE #define INSTANTIATE(TYPE) \ template void detail::gen_uniform_key_segments_host(seed_t, cuda::std::span, std::size_t, std::size_t); \ template void detail::gen_uniform_key_segments_device(seed_t, cuda::std::span, std::size_t, std::size_t); \ template void detail::gen_device(seed_t, cuda::std::span, bit_entropy, TYPE min, TYPE max); \ template void detail::gen_host(seed_t, cuda::std::span, bit_entropy, TYPE min, TYPE max) INSTANTIATE(bool); INSTANTIATE(uint8_t); INSTANTIATE(uint16_t); INSTANTIATE(uint32_t); INSTANTIATE(uint64_t); INSTANTIATE(int8_t); INSTANTIATE(int16_t); INSTANTIATE(int32_t); INSTANTIATE(int64_t); #if 
NVBENCH_HELPER_HAS_I128 INSTANTIATE(int128_t); INSTANTIATE(uint128_t); #endif INSTANTIATE(float); INSTANTIATE(double); INSTANTIATE(complex); #undef INSTANTIATE cccl-2.5.0/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh000066400000000000000000000343521463375617100267630ustar00rootroot00000000000000#pragma once #include #include #include #include #include #include #include #include #if defined(_MSC_VER) # define NVBENCH_HELPER_HAS_I128 0 #else # define NVBENCH_HELPER_HAS_I128 1 #endif #if NVBENCH_HELPER_HAS_I128 using int128_t = __int128_t; using uint128_t = __uint128_t; NVBENCH_DECLARE_TYPE_STRINGS(int128_t, "I128", "int128_t"); NVBENCH_DECLARE_TYPE_STRINGS(uint128_t, "U128", "uint128_t"); #endif using complex = cuda::std::complex; NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex"); namespace detail { template struct push_back {}; template struct push_back> { using type = nvbench::type_list; }; } // namespace detail template using push_back_t = typename detail::push_back::type; #ifdef TUNE_OffsetT using offset_types = nvbench::type_list; #else using offset_types = nvbench::type_list; #endif #ifdef TUNE_T using integral_types = nvbench::type_list; using fundamental_types = nvbench::type_list; using all_types = nvbench::type_list; #else using integral_types = nvbench::type_list; using fundamental_types = nvbench::type_list; using all_types = nvbench::type_list; #endif template class value_wrapper_t { T m_val{}; public: explicit value_wrapper_t(T val) : m_val(val) {} T get() const { return m_val; } value_wrapper_t& operator++() { m_val++; return *this; } }; class seed_t : public value_wrapper_t { public: using value_wrapper_t::value_wrapper_t; using value_wrapper_t::operator++; seed_t() : value_wrapper_t(42) {} }; enum class bit_entropy { _1_000 = 0, _0_811 = 1, _0_544 = 2, _0_337 = 3, _0_201 = 4, _0_000 = 4200 }; NVBENCH_DECLARE_TYPE_STRINGS(bit_entropy, "BE", "bit entropy"); [[nodiscard]] inline double entropy_to_probability(bit_entropy entropy) { switch (entropy) { case bit_entropy::_1_000: return 1.0; case bit_entropy::_0_811: return 0.811; case bit_entropy::_0_544: return 0.544; case bit_entropy::_0_337: return 0.337; case bit_entropy::_0_201: return 0.201; case bit_entropy::_0_000: return 0.0; default: return 0.0; } } [[nodiscard]] inline bit_entropy str_to_entropy(std::string str) { if (str == "1.000") { return bit_entropy::_1_000; } else if (str == "0.811") { return bit_entropy::_0_811; } else if (str == "0.544") { return bit_entropy::_0_544; } else if (str == "0.337") { return bit_entropy::_0_337; } else if (str == "0.201") { return bit_entropy::_0_201; } else if (str == "0.000") { return bit_entropy::_0_000; } throw std::runtime_error("Can't convert string to bit entropy"); } namespace detail { void do_not_optimize(const void* ptr); template void gen_host(seed_t seed, cuda::std::span data, bit_entropy entropy, T min, T max); template void gen_device(seed_t seed, cuda::std::span data, bit_entropy entropy, T min, T max); template void gen_uniform_key_segments_host( seed_t seed, cuda::std::span data, std::size_t min_segment_size, std::size_t max_segment_size); template void gen_uniform_key_segments_device( seed_t seed, cuda::std::span data, std::size_t min_segment_size, std::size_t max_segment_size); template std::size_t gen_uniform_segment_offsets_host( seed_t seed, cuda::std::span segment_offsets, std::size_t min_segment_size, std::size_t max_segment_size); template std::size_t gen_uniform_segment_offsets_device( seed_t seed, cuda::std::span segment_offsets, 
std::size_t min_segment_size, std::size_t max_segment_size); template void gen_power_law_segment_offsets_host(seed_t seed, cuda::std::span segment_offsets, std::size_t elements); template void gen_power_law_segment_offsets_device(seed_t seed, cuda::std::span segment_offsets, std::size_t elements); namespace { struct generator_base_t { seed_t m_seed{}; const std::size_t m_elements{0}; const bit_entropy m_entropy{bit_entropy::_1_000}; template thrust::device_vector generate(T min, T max) { thrust::device_vector vec(m_elements); cuda::std::span span(thrust::raw_pointer_cast(vec.data()), m_elements); #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA gen_device(m_seed, span, m_entropy, min, max); #else gen_host(m_seed, span, m_entropy, min, max); #endif ++m_seed; return vec; } }; template struct vector_generator_t : generator_base_t { const T m_min{std::numeric_limits::min()}; const T m_max{std::numeric_limits::max()}; operator thrust::device_vector() { return generator_base_t::generate(m_min, m_max); } }; template <> struct vector_generator_t : generator_base_t { template operator thrust::device_vector() { return generator_base_t::generate(std::numeric_limits::min(), std::numeric_limits::max()); } // This overload is needed because numeric limits is not specialized for complex, making // the min and max values for complex equal zero. operator thrust::device_vector() { const complex min = complex{std::numeric_limits::min(), std::numeric_limits::min()}; const complex max = complex{std::numeric_limits::max(), std::numeric_limits::max()}; return generator_base_t::generate(min, max); } }; struct uniform_key_segments_generator_t { seed_t m_seed{}; const std::size_t m_total_elements{0}; const std::size_t m_min_segment_size{0}; const std::size_t m_max_segment_size{0}; template operator thrust::device_vector() { thrust::device_vector keys_vec(m_total_elements); cuda::std::span keys(thrust::raw_pointer_cast(keys_vec.data()), keys_vec.size()); #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA gen_uniform_key_segments_device(m_seed, keys, m_min_segment_size, m_max_segment_size); #else gen_uniform_key_segments_host(m_seed, keys, m_min_segment_size, m_max_segment_size); #endif ++m_seed; return keys_vec; } }; struct uniform_segment_offsets_generator_t { seed_t m_seed{}; const std::size_t m_total_elements{0}; const std::size_t m_min_segment_size{0}; const std::size_t m_max_segment_size{0}; template operator thrust::device_vector() { thrust::device_vector offsets_vec(m_total_elements + 2); cuda::std::span offsets(thrust::raw_pointer_cast(offsets_vec.data()), offsets_vec.size()); const std::size_t offsets_size = #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA gen_uniform_segment_offsets_device(m_seed, offsets, m_min_segment_size, m_max_segment_size); #else gen_uniform_segment_offsets_host(m_seed, offsets, m_min_segment_size, m_max_segment_size); #endif offsets_vec.resize(offsets_size); offsets_vec.shrink_to_fit(); ++m_seed; return offsets_vec; } }; struct power_law_segment_offsets_generator_t { seed_t m_seed{}; const std::size_t m_elements{0}; const std::size_t m_segments{0}; template operator thrust::device_vector() { thrust::device_vector offsets_vec(m_segments + 1); cuda::std::span offsets(thrust::raw_pointer_cast(offsets_vec.data()), offsets_vec.size()); #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA gen_power_law_segment_offsets_device(m_seed, offsets, m_elements); #else gen_power_law_segment_offsets_host(m_seed, offsets, m_elements); #endif ++m_seed; return offsets_vec; } }; struct 
gen_uniform_key_segments_t { uniform_key_segments_generator_t operator()(std::size_t total_elements, std::size_t min_segment_size, std::size_t max_segment_size) const { return {seed_t{}, total_elements, min_segment_size, max_segment_size}; } }; struct gen_uniform_segment_offsets_t { uniform_segment_offsets_generator_t operator()(std::size_t total_elements, std::size_t min_segment_size, std::size_t max_segment_size) const { return {seed_t{}, total_elements, min_segment_size, max_segment_size}; } }; struct gen_uniform_t { gen_uniform_key_segments_t key_segments{}; gen_uniform_segment_offsets_t segment_offsets{}; }; struct gen_power_law_segment_offsets_t { power_law_segment_offsets_generator_t operator()(std::size_t elements, std::size_t segments) const { return {seed_t{}, elements, segments}; } }; struct gen_power_law_t { gen_power_law_segment_offsets_t segment_offsets{}; }; struct gen_t { vector_generator_t operator()(std::size_t elements, bit_entropy entropy = bit_entropy::_1_000) const { return {seed_t{}, elements, entropy}; } template vector_generator_t operator()( std::size_t elements, bit_entropy entropy = bit_entropy::_1_000, T min = std::numeric_limits::min, T max = std::numeric_limits::max()) const { return {seed_t{}, elements, entropy, min, max}; } gen_uniform_t uniform{}; gen_power_law_t power_law{}; }; } // namespace } // namespace detail inline detail::gen_t generate; template void do_not_optimize(const T& val) { detail::do_not_optimize(&val); } struct less_t { template __host__ __device__ bool operator()(const DataType& lhs, const DataType& rhs) const { return lhs < rhs; } }; template <> __host__ __device__ inline bool less_t::operator()(const complex& lhs, const complex& rhs) const { double magnitude_0 = cuda::std::abs(lhs); double magnitude_1 = cuda::std::abs(rhs); if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1)) { // NaN's are always equal. return false; } else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1)) { // If the real or imaginary part of the complex number has a very large value // (close to the maximum representable value for a double), it is possible that // the magnitude computation can result in positive infinity: // ```cpp // const double large_number = std::numeric_limits::max() / 2; // std::complex z(large_number, large_number); // std::abs(z) == inf; // ``` // Dividing both components by a constant before computing the magnitude prevents overflow. const complex::value_type scaler = 0.5; magnitude_0 = cuda::std::abs(lhs * scaler); magnitude_1 = cuda::std::abs(rhs * scaler); } const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1); const complex::value_type threshold = cuda::std::numeric_limits::epsilon() * 2; if (difference < threshold) { // Triangles with the same magnitude are sorted by their phase angle. const complex::value_type phase_angle_0 = cuda::std::arg(lhs); const complex::value_type phase_angle_1 = cuda::std::arg(rhs); return phase_angle_0 < phase_angle_1; } else { return magnitude_0 < magnitude_1; } } struct max_t { template __host__ __device__ DataType operator()(const DataType& lhs, const DataType& rhs) { less_t less{}; return less(lhs, rhs) ? 
rhs : lhs; } }; namespace { struct caching_allocator_t { using value_type = char; caching_allocator_t() = default; ~caching_allocator_t() { free_all(); } char* allocate(std::ptrdiff_t num_bytes) { value_type* result{}; auto free_block = free_blocks.find(num_bytes); if (free_block != free_blocks.end()) { result = free_block->second; free_blocks.erase(free_block); } else { result = do_allocate(num_bytes); } allocated_blocks.insert(std::make_pair(result, num_bytes)); return result; } void deallocate(char* ptr, size_t) { auto iter = allocated_blocks.find(ptr); if (iter == allocated_blocks.end()) { throw std::runtime_error("Memory was not allocated by this allocator"); } std::ptrdiff_t num_bytes = iter->second; allocated_blocks.erase(iter); free_blocks.insert(std::make_pair(num_bytes, ptr)); } private: using free_blocks_type = std::multimap; using allocated_blocks_type = std::map; free_blocks_type free_blocks; allocated_blocks_type allocated_blocks; void free_all() { for (auto i : free_blocks) { do_deallocate(i.second); } for (auto i : allocated_blocks) { do_deallocate(i.first); } } value_type* do_allocate(std::size_t num_bytes) { value_type* result{}; #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA const cudaError_t status = cudaMalloc(&result, num_bytes); if (cudaSuccess != status) { throw std::runtime_error(std::string("Failed to allocate device memory: ") + cudaGetErrorString(status)); } #else result = new value_type[num_bytes]; #endif return result; } void do_deallocate(value_type* ptr) { #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA cudaFree(ptr); #else delete[] ptr; #endif } }; #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA auto policy(caching_allocator_t& alloc) { return thrust::cuda::par(alloc); } #else auto policy(caching_allocator_t&) { return thrust::device; } #endif #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA auto policy(caching_allocator_t& alloc, nvbench::launch& launch) { return thrust::cuda::par(alloc).on(launch.get_stream()); } #else auto policy(caching_allocator_t&, nvbench::launch&) { return thrust::device; } #endif } // namespace cccl-2.5.0/cub/benchmarks/nvbench_helper/test/000077500000000000000000000000001463375617100212665ustar00rootroot00000000000000cccl-2.5.0/cub/benchmarks/nvbench_helper/test/gen_entropy.cu000066400000000000000000000137311463375617100241550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include template double get_expected_entropy(bit_entropy in_entropy) { if (in_entropy == bit_entropy::_0_000) { return 0.0; } if (in_entropy == bit_entropy::_1_000) { return sizeof(T) * 8; } const int samples = static_cast(in_entropy) + 1; const double p1 = std::pow(0.5, samples); const double p2 = 1 - p1; const double entropy = (-p1 * std::log2(p1)) + (-p2 * std::log2(p2)); return sizeof(T) * 8 * entropy; } template double compute_actual_entropy(thrust::device_vector in) { const int n = static_cast(in.size()); #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA thrust::device_vector unique(n); thrust::device_vector counts(n); thrust::device_vector num_runs(1); thrust::sort(in.begin(), in.end(), less_t{}); // RLE void* d_temp_storage = nullptr; std::size_t temp_storage_bytes = 0; T* d_in = thrust::raw_pointer_cast(in.data()); T* d_unique_out = thrust::raw_pointer_cast(unique.data()); int* d_counts_out = thrust::raw_pointer_cast(counts.data()); int* d_num_runs_out = thrust::raw_pointer_cast(num_runs.data()); cub::DeviceRunLengthEncode::Encode( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, n); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceRunLengthEncode::Encode( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, n); thrust::host_vector h_counts = counts; thrust::host_vector h_num_runs = num_runs; #else std::vector h_in(in.begin(), in.end()); std::sort(h_in.begin(), h_in.end(), less_t{}); thrust::host_vector h_counts; T prev = h_in[0]; int length = 1; for (std::size_t i = 1; i < h_in.size(); i++) { const T next = h_in[i]; if (next == prev) { length++; } else { h_counts.push_back(length); prev = next; length = 1; } } h_counts.push_back(length); thrust::host_vector h_num_runs(1, h_counts.size()); #endif // normalize counts thrust::host_vector ps(h_num_runs[0]); for (std::size_t i = 0; i < ps.size(); i++) { ps[i] = static_cast(h_counts[i]) / n; } double entropy = 0.0; if (ps.size()) { for (double p : ps) { entropy -= p * std::log2(p); } } return entropy; } TEMPLATE_LIST_TEST_CASE("Generators produce data with given entropy", "[gen]", fundamental_types) { constexpr int num_entropy_levels = 6; std::array entropy_levels{ bit_entropy::_0_000, bit_entropy::_0_201, bit_entropy::_0_337, bit_entropy::_0_544, bit_entropy::_0_811, bit_entropy::_1_000}; std::vector entropy(num_entropy_levels); std::transform(entropy_levels.cbegin(), entropy_levels.cend(), entropy.begin(), [](bit_entropy entropy) { const thrust::device_vector data = generate(1 << 24, entropy); return compute_actual_entropy(data); }); REQUIRE(std::is_sorted(entropy.begin(), entropy.end(), less_t{})); REQUIRE(std::unique(entropy.begin(), entropy.end()) == entropy.end()); } TEST_CASE("Generators support bool", "[gen]") { 
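  // (Comment added for clarity; not part of the upstream file.) For bool data a lower
  // bit entropy makes `true` less likely, so the count of set elements should increase
  // strictly from bit_entropy::_0_000 up to bit_entropy::_1_000. That is exactly what
  // the is_sorted / unique checks at the end of this test assert.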
constexpr int num_entropy_levels = 6; std::array entropy_levels{ bit_entropy::_0_000, bit_entropy::_0_201, bit_entropy::_0_337, bit_entropy::_0_544, bit_entropy::_0_811, bit_entropy::_1_000}; std::vector number_of_set(num_entropy_levels); std::transform(entropy_levels.cbegin(), entropy_levels.cend(), number_of_set.begin(), [](bit_entropy entropy) { const thrust::device_vector data = generate(1 << 24, entropy); return thrust::count(data.begin(), data.end(), true); }); REQUIRE(std::is_sorted(number_of_set.begin(), number_of_set.end())); REQUIRE(std::unique(number_of_set.begin(), number_of_set.end()) == number_of_set.end()); } cccl-2.5.0/cub/benchmarks/nvbench_helper/test/gen_power_law_distribution.cu000066400000000000000000000062471463375617100272570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #include #include #include #include #include #include bool is_normal(thrust::host_vector data) { std::sort(data.begin(), data.end()); const double A2 = boost::math::statistics::anderson_darling_normality_statistic(data); return A2 / data.size() < 0.05; } using types = nvbench::type_list; TEMPLATE_LIST_TEST_CASE("Generators produce power law distributed data", "[gen][power-law]", types) { const std::size_t elements = 1 << 28; const std::size_t segments = 4 * 1024; const thrust::device_vector d_segment_offsets = generate.power_law.segment_offsets(elements, segments); REQUIRE(d_segment_offsets.size() == segments + 1); std::size_t actual_elements = 0; thrust::host_vector log_sizes(segments); const thrust::host_vector h_segment_offsets = d_segment_offsets; for (int i = 0; i < segments; ++i) { const TestType begin = h_segment_offsets[i]; const TestType end = h_segment_offsets[i + 1]; REQUIRE(begin <= end); const std::size_t size = end - begin; actual_elements += size; log_sizes[i] = std::log(size); } REQUIRE(actual_elements == elements); REQUIRE(is_normal(std::move(log_sizes))); } cccl-2.5.0/cub/benchmarks/nvbench_helper/test/gen_range.cu000066400000000000000000000052461463375617100235530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #include #include #include #include #include using types = nvbench::type_list; TEMPLATE_LIST_TEST_CASE("Generators produce data within specified range", "[gen]", types) { const auto min = static_cast(GENERATE_COPY(take(3, random(-124, 0)))); const auto max = static_cast(GENERATE_COPY(take(3, random(0, 124)))); const thrust::device_vector data = generate(1 << 16, bit_entropy::_1_000, min, max); const TestType min_element = *thrust::min_element(data.begin(), data.end()); const TestType max_element = *thrust::max_element(data.begin(), data.end()); REQUIRE(min_element >= min); REQUIRE(max_element <= max); } cccl-2.5.0/cub/benchmarks/nvbench_helper/test/gen_seed.cu000066400000000000000000000047661463375617100234050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include #include using types = nvbench::type_list; TEMPLATE_LIST_TEST_CASE("Generator seeds the data", "[gen]", types) { auto generator = generate(1 << 24, bit_entropy::_0_811); const thrust::device_vector vec_1 = generator; const thrust::device_vector vec_2 = generator; REQUIRE(vec_1.size() == vec_2.size()); REQUIRE_FALSE(thrust::equal(vec_1.begin(), vec_1.end(), vec_2.begin())); } cccl-2.5.0/cub/benchmarks/nvbench_helper/test/gen_uniform_distribution.cu000066400000000000000000000164571463375617100267430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include #include #include #include #include #include #include template bool is_uniform(thrust::host_vector data, T min, T max) { const double value_range = static_cast(max) - min; const bool exact_binning = value_range < (1 << 20); const int number_of_bins = exact_binning ? static_cast(max - min + 1) : static_cast(std::sqrt(data.size())); thrust::host_vector bins(number_of_bins, 0); const double interval = value_range / static_cast(number_of_bins); const double expected_count = static_cast(data.size()) / number_of_bins; for (T val : data) { int bin_index = exact_binning ? 
val - min : (val - static_cast(min)) / interval; if (bin_index >= 0 && bin_index < number_of_bins) { bins[bin_index]++; } } double chi_square = 0.0; for (const auto& count : bins) { chi_square += std::pow(count - expected_count, 2) / expected_count; } boost::math::chi_squared_distribution chi_squared_dist(number_of_bins - 1); const double confidence = 0.95; const double critical_value = boost::math::quantile(chi_squared_dist, confidence); return chi_square <= critical_value; } using types = nvbench::type_list; TEMPLATE_LIST_TEST_CASE("Generators produce uniformly distributed data", "[gen][uniform]", types) { const std::size_t elements = 1 << GENERATE_COPY(16, 20, 24, 28); const TestType min = std::numeric_limits::min(); const TestType max = std::numeric_limits::max(); const thrust::device_vector data = generate(elements, bit_entropy::_1_000, min, max); REQUIRE(is_uniform(data, min, max)); } struct complex_to_real_t { __host__ __device__ float operator()(const complex& c) const { return c.real(); } }; struct complex_to_imag_t { __host__ __device__ float operator()(const complex& c) const { return c.imag(); } }; TEST_CASE("Generators produce uniformly distributed complex", "[gen]") { const float min = std::numeric_limits::min(); const float max = std::numeric_limits::max(); const thrust::device_vector data = generate(1 << 16); thrust::device_vector component(data.size()); thrust::transform(data.begin(), data.end(), component.begin(), complex_to_real_t()); REQUIRE(is_uniform(component, min, max)); thrust::transform(data.begin(), data.end(), component.begin(), complex_to_imag_t()); REQUIRE(is_uniform(component, min, max)); } TEST_CASE("Generators produce uniformly distributed bools", "[gen]") { const thrust::device_vector data = generate(1 << 24, bit_entropy::_0_544); const std::size_t falses = thrust::count(data.begin(), data.end(), false); const std::size_t trues = thrust::count(data.begin(), data.end(), true); REQUIRE(falses > 0); REQUIRE(trues > 0); REQUIRE(falses + trues == data.size()); const double ratio = static_cast(falses) / trues; REQUIRE(ratio > 0.7); } using offsets = nvbench::type_list; TEMPLATE_LIST_TEST_CASE("Generators produce uniformly distributed offsets", "[gen]", offsets) { const std::size_t min_segment_size = 1; const std::size_t max_segment_size = 256; const std::size_t elements = 1 << GENERATE_COPY(16, 20, 24, 28); const thrust::device_vector d_segments = generate.uniform.segment_offsets(elements, min_segment_size, max_segment_size); const thrust::host_vector h_segments = d_segments; const std::size_t num_segments = h_segments.size() - 1; std::size_t actual_elements = 0; thrust::host_vector segment_sizes(num_segments); for (std::size_t sid = 0; sid < num_segments; sid++) { const TestType begin = h_segments[sid]; const TestType end = h_segments[sid + 1]; REQUIRE(begin <= end); const TestType size = end - begin; REQUIRE(size >= min_segment_size); REQUIRE(size <= max_segment_size); segment_sizes[sid] = size; actual_elements += size; } REQUIRE(actual_elements == elements); REQUIRE(is_uniform(std::move(segment_sizes), min_segment_size, max_segment_size)); } TEMPLATE_LIST_TEST_CASE("Generators produce uniformly distributed key segments", "[gen]", types) { const std::size_t min_segment_size = 1; const std::size_t max_segment_size = 128; const std::size_t elements = 1 << GENERATE_COPY(16, 20, 24, 28); const thrust::device_vector d_keys = generate.uniform.key_segments(elements, min_segment_size, max_segment_size); REQUIRE(d_keys.size() == elements); const thrust::host_vector 
h_keys = d_keys; thrust::host_vector segment_sizes; TestType prev = h_keys[0]; int length = 1; for (std::size_t kid = 1; kid < elements; kid++) { TestType next = h_keys[kid]; if (next == prev) { length++; } else { REQUIRE(length >= min_segment_size); REQUIRE(length <= max_segment_size); segment_sizes.push_back(length); prev = next; length = 1; } } REQUIRE(length >= min_segment_size); REQUIRE(length <= max_segment_size); segment_sizes.push_back(length); REQUIRE(is_uniform(std::move(segment_sizes), min_segment_size, max_segment_size)); } cccl-2.5.0/cub/benchmarks/nvbench_helper/test/main.cpp000066400000000000000000000034121463375617100227160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #define CATCH_CONFIG_MAIN #include cccl-2.5.0/cub/cmake/000077500000000000000000000000001463375617100142705ustar00rootroot00000000000000cccl-2.5.0/cub/cmake/AppendOptionIfAvailable.cmake000066400000000000000000000004121463375617100217470ustar00rootroot00000000000000include_guard(GLOBAL) include(CheckCXXCompilerFlag) macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST) string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR) check_cxx_compiler_flag(${_FLAG} ${_VAR}) if (${${_VAR}}) list(APPEND ${_LIST} ${_FLAG}) endif () endmacro () cccl-2.5.0/cub/cmake/CPM.cmake000066400000000000000000001066101463375617100157150ustar00rootroot00000000000000# CPM.cmake - CMake's missing package manager # =========================================== # See https://github.com/cpm-cmake/CPM.cmake for usage and update instructions. 
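# ---------------------------------------------------------------------------
# Usage sketch (comment added for illustration only; the repositories, tags and
# options below are placeholders, not dependencies of this project):
#
#   include(cmake/CPM.cmake)
#   CPMAddPackage("gh:fmtlib/fmt#10.2.1")          # single-argument shorthand syntax
#   CPMAddPackage(NAME nlohmann_json
#                 GITHUB_REPOSITORY nlohmann/json
#                 GIT_TAG v3.11.3
#                 OPTIONS "JSON_BuildTests OFF")
# ---------------------------------------------------------------------------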
# # MIT License # ----------- #[[ Copyright (c) 2019-2023 Lars Melchior and contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ]] cmake_minimum_required(VERSION 3.14 FATAL_ERROR) # Initialize logging prefix if(NOT CPM_INDENT) set(CPM_INDENT "CPM:" CACHE INTERNAL "" ) endif() if(NOT COMMAND cpm_message) function(cpm_message) message(${ARGV}) endfunction() endif() set(CURRENT_CPM_VERSION 0.38.5) get_filename_component(CPM_CURRENT_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}" REALPATH) if(CPM_DIRECTORY) if(NOT CPM_DIRECTORY STREQUAL CPM_CURRENT_DIRECTORY) if(CPM_VERSION VERSION_LESS CURRENT_CPM_VERSION) message( AUTHOR_WARNING "${CPM_INDENT} \ A dependency is using a more recent CPM version (${CURRENT_CPM_VERSION}) than the current project (${CPM_VERSION}). \ It is recommended to upgrade CPM to the most recent version. \ See https://github.com/cpm-cmake/CPM.cmake for more information." ) endif() if(${CMAKE_VERSION} VERSION_LESS "3.17.0") include(FetchContent) endif() return() endif() get_property( CPM_INITIALIZED GLOBAL "" PROPERTY CPM_INITIALIZED SET ) if(CPM_INITIALIZED) return() endif() endif() if(CURRENT_CPM_VERSION MATCHES "development-version") message( WARNING "${CPM_INDENT} Your project is using an unstable development version of CPM.cmake. \ Please update to a recent release if possible. \ See https://github.com/cpm-cmake/CPM.cmake for details." ) endif() set_property(GLOBAL PROPERTY CPM_INITIALIZED true) macro(cpm_set_policies) # the policy allows us to change options without caching cmake_policy(SET CMP0077 NEW) set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) # the policy allows us to change set(CACHE) without caching if(POLICY CMP0126) cmake_policy(SET CMP0126 NEW) set(CMAKE_POLICY_DEFAULT_CMP0126 NEW) endif() # The policy uses the download time for timestamp, instead of the timestamp in the archive. 
This # allows for proper rebuilds when a projects url changes if(POLICY CMP0135) cmake_policy(SET CMP0135 NEW) set(CMAKE_POLICY_DEFAULT_CMP0135 NEW) endif() endmacro() cpm_set_policies() option(CPM_USE_LOCAL_PACKAGES "Always try to use `find_package` to get dependencies" $ENV{CPM_USE_LOCAL_PACKAGES} ) option(CPM_LOCAL_PACKAGES_ONLY "Only use `find_package` to get dependencies" $ENV{CPM_LOCAL_PACKAGES_ONLY} ) option(CPM_DOWNLOAD_ALL "Always download dependencies from source" $ENV{CPM_DOWNLOAD_ALL}) option(CPM_DONT_UPDATE_MODULE_PATH "Don't update the module path to allow using find_package" $ENV{CPM_DONT_UPDATE_MODULE_PATH} ) option(CPM_DONT_CREATE_PACKAGE_LOCK "Don't create a package lock file in the binary path" $ENV{CPM_DONT_CREATE_PACKAGE_LOCK} ) option(CPM_INCLUDE_ALL_IN_PACKAGE_LOCK "Add all packages added through CPM.cmake to the package lock" $ENV{CPM_INCLUDE_ALL_IN_PACKAGE_LOCK} ) option(CPM_USE_NAMED_CACHE_DIRECTORIES "Use additional directory of package name in cache on the most nested level." $ENV{CPM_USE_NAMED_CACHE_DIRECTORIES} ) set(CPM_VERSION ${CURRENT_CPM_VERSION} CACHE INTERNAL "" ) set(CPM_DIRECTORY ${CPM_CURRENT_DIRECTORY} CACHE INTERNAL "" ) set(CPM_FILE ${CMAKE_CURRENT_LIST_FILE} CACHE INTERNAL "" ) set(CPM_PACKAGES "" CACHE INTERNAL "" ) set(CPM_DRY_RUN OFF CACHE INTERNAL "Don't download or configure dependencies (for testing)" ) if(DEFINED ENV{CPM_SOURCE_CACHE}) set(CPM_SOURCE_CACHE_DEFAULT $ENV{CPM_SOURCE_CACHE}) else() set(CPM_SOURCE_CACHE_DEFAULT OFF) endif() set(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE_DEFAULT} CACHE PATH "Directory to download CPM dependencies" ) if(NOT CPM_DONT_UPDATE_MODULE_PATH) set(CPM_MODULE_PATH "${CMAKE_BINARY_DIR}/CPM_modules" CACHE INTERNAL "" ) # remove old modules file(REMOVE_RECURSE ${CPM_MODULE_PATH}) file(MAKE_DIRECTORY ${CPM_MODULE_PATH}) # locally added CPM modules should override global packages set(CMAKE_MODULE_PATH "${CPM_MODULE_PATH};${CMAKE_MODULE_PATH}") endif() if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) set(CPM_PACKAGE_LOCK_FILE "${CMAKE_BINARY_DIR}/cpm-package-lock.cmake" CACHE INTERNAL "" ) file(WRITE ${CPM_PACKAGE_LOCK_FILE} "# CPM Package Lock\n# This file should be committed to version control\n\n" ) endif() include(FetchContent) # Try to infer package name from git repository uri (path or url) function(cpm_package_name_from_git_uri URI RESULT) if("${URI}" MATCHES "([^/:]+)/?.git/?$") set(${RESULT} ${CMAKE_MATCH_1} PARENT_SCOPE ) else() unset(${RESULT} PARENT_SCOPE) endif() endfunction() # Try to infer package name and version from a url function(cpm_package_name_and_ver_from_url url outName outVer) if(url MATCHES "[/\\?]([a-zA-Z0-9_\\.-]+)\\.(tar|tar\\.gz|tar\\.bz2|zip|ZIP)(\\?|/|$)") # We matched an archive set(filename "${CMAKE_MATCH_1}") if(filename MATCHES "([a-zA-Z0-9_\\.-]+)[_-]v?(([0-9]+\\.)*[0-9]+[a-zA-Z0-9]*)") # We matched - (ie foo-1.2.3) set(${outName} "${CMAKE_MATCH_1}" PARENT_SCOPE ) set(${outVer} "${CMAKE_MATCH_2}" PARENT_SCOPE ) elseif(filename MATCHES "(([0-9]+\\.)+[0-9]+[a-zA-Z0-9]*)") # We couldn't find a name, but we found a version # # In many cases (which we don't handle here) the url would look something like # `irrelevant/ACTUAL_PACKAGE_NAME/irrelevant/1.2.3.zip`. In such a case we can't possibly # distinguish the package name from the irrelevant bits. Moreover if we try to match the # package name from the filename, we'd get bogus at best. unset(${outName} PARENT_SCOPE) set(${outVer} "${CMAKE_MATCH_1}" PARENT_SCOPE ) else() # Boldly assume that the file name is the package name. 
# # Yes, something like `irrelevant/ACTUAL_NAME/irrelevant/download.zip` will ruin our day, but # such cases should be quite rare. No popular service does this... we think. set(${outName} "${filename}" PARENT_SCOPE ) unset(${outVer} PARENT_SCOPE) endif() else() # No ideas yet what to do with non-archives unset(${outName} PARENT_SCOPE) unset(${outVer} PARENT_SCOPE) endif() endfunction() function(cpm_find_package NAME VERSION) string(REPLACE " " ";" EXTRA_ARGS "${ARGN}") find_package(${NAME} ${VERSION} ${EXTRA_ARGS} QUIET) if(${CPM_ARGS_NAME}_FOUND) if(DEFINED ${CPM_ARGS_NAME}_VERSION) set(VERSION ${${CPM_ARGS_NAME}_VERSION}) endif() cpm_message(STATUS "${CPM_INDENT} Using local package ${CPM_ARGS_NAME}@${VERSION}") CPMRegisterPackage(${CPM_ARGS_NAME} "${VERSION}") set(CPM_PACKAGE_FOUND YES PARENT_SCOPE ) else() set(CPM_PACKAGE_FOUND NO PARENT_SCOPE ) endif() endfunction() # Create a custom FindXXX.cmake module for a CPM package This prevents `find_package(NAME)` from # finding the system library function(cpm_create_module_file Name) if(NOT CPM_DONT_UPDATE_MODULE_PATH) # erase any previous modules file(WRITE ${CPM_MODULE_PATH}/Find${Name}.cmake "include(\"${CPM_FILE}\")\n${ARGN}\nset(${Name}_FOUND TRUE)" ) endif() endfunction() # Find a package locally or fallback to CPMAddPackage function(CPMFindPackage) set(oneValueArgs NAME VERSION GIT_TAG FIND_PACKAGE_ARGUMENTS) cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "" ${ARGN}) if(NOT DEFINED CPM_ARGS_VERSION) if(DEFINED CPM_ARGS_GIT_TAG) cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) endif() endif() set(downloadPackage ${CPM_DOWNLOAD_ALL}) if(DEFINED CPM_DOWNLOAD_${CPM_ARGS_NAME}) set(downloadPackage ${CPM_DOWNLOAD_${CPM_ARGS_NAME}}) elseif(DEFINED ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) set(downloadPackage $ENV{CPM_DOWNLOAD_${CPM_ARGS_NAME}}) endif() if(downloadPackage) CPMAddPackage(${ARGN}) cpm_export_variables(${CPM_ARGS_NAME}) return() endif() cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") if(CPM_PACKAGE_ALREADY_ADDED) cpm_export_variables(${CPM_ARGS_NAME}) return() endif() cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) if(NOT CPM_PACKAGE_FOUND) CPMAddPackage(${ARGN}) cpm_export_variables(${CPM_ARGS_NAME}) endif() endfunction() # checks if a package has been added before function(cpm_check_if_package_already_added CPM_ARGS_NAME CPM_ARGS_VERSION) if("${CPM_ARGS_NAME}" IN_LIST CPM_PACKAGES) CPMGetPackageVersion(${CPM_ARGS_NAME} CPM_PACKAGE_VERSION) if("${CPM_PACKAGE_VERSION}" VERSION_LESS "${CPM_ARGS_VERSION}") message( WARNING "${CPM_INDENT} Requires a newer version of ${CPM_ARGS_NAME} (${CPM_ARGS_VERSION}) than currently included (${CPM_PACKAGE_VERSION})." ) endif() cpm_get_fetch_properties(${CPM_ARGS_NAME}) set(${CPM_ARGS_NAME}_ADDED NO) set(CPM_PACKAGE_ALREADY_ADDED YES PARENT_SCOPE ) cpm_export_variables(${CPM_ARGS_NAME}) else() set(CPM_PACKAGE_ALREADY_ADDED NO PARENT_SCOPE ) endif() endfunction() # Parse the argument of CPMAddPackage in case a single one was provided and convert it to a list of # arguments which can then be parsed idiomatically. 
For example gh:foo/bar@1.2.3 will be converted # to: GITHUB_REPOSITORY;foo/bar;VERSION;1.2.3 function(cpm_parse_add_package_single_arg arg outArgs) # Look for a scheme if("${arg}" MATCHES "^([a-zA-Z]+):(.+)$") string(TOLOWER "${CMAKE_MATCH_1}" scheme) set(uri "${CMAKE_MATCH_2}") # Check for CPM-specific schemes if(scheme STREQUAL "gh") set(out "GITHUB_REPOSITORY;${uri}") set(packageType "git") elseif(scheme STREQUAL "gl") set(out "GITLAB_REPOSITORY;${uri}") set(packageType "git") elseif(scheme STREQUAL "bb") set(out "BITBUCKET_REPOSITORY;${uri}") set(packageType "git") # A CPM-specific scheme was not found. Looks like this is a generic URL so try to determine # type elseif(arg MATCHES ".git/?(@|#|$)") set(out "GIT_REPOSITORY;${arg}") set(packageType "git") else() # Fall back to a URL set(out "URL;${arg}") set(packageType "archive") # We could also check for SVN since FetchContent supports it, but SVN is so rare these days. # We just won't bother with the additional complexity it will induce in this function. SVN is # done by multi-arg endif() else() if(arg MATCHES ".git/?(@|#|$)") set(out "GIT_REPOSITORY;${arg}") set(packageType "git") else() # Give up message(FATAL_ERROR "${CPM_INDENT} Can't determine package type of '${arg}'") endif() endif() # For all packages we interpret @... as version. Only replace the last occurrence. Thus URIs # containing '@' can be used string(REGEX REPLACE "@([^@]+)$" ";VERSION;\\1" out "${out}") # Parse the rest according to package type if(packageType STREQUAL "git") # For git repos we interpret #... as a tag or branch or commit hash string(REGEX REPLACE "#([^#]+)$" ";GIT_TAG;\\1" out "${out}") elseif(packageType STREQUAL "archive") # For archives we interpret #... as a URL hash. string(REGEX REPLACE "#([^#]+)$" ";URL_HASH;\\1" out "${out}") # We don't try to parse the version if it's not provided explicitly. cpm_get_version_from_url # should do this at a later point else() # We should never get here. This is an assertion and hitting it means there's a bug in the code # above. A packageType was set, but not handled by this if-else. message(FATAL_ERROR "${CPM_INDENT} Unsupported package type '${packageType}' of '${arg}'") endif() set(${outArgs} ${out} PARENT_SCOPE ) endfunction() # Check that the working directory for a git repo is clean function(cpm_check_git_working_dir_is_clean repoPath gitTag isClean) find_package(Git REQUIRED) if(NOT GIT_EXECUTABLE) # No git executable, assume directory is clean set(${isClean} TRUE PARENT_SCOPE ) return() endif() # check for uncommitted changes execute_process( COMMAND ${GIT_EXECUTABLE} status --porcelain RESULT_VARIABLE resultGitStatus OUTPUT_VARIABLE repoStatus OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET WORKING_DIRECTORY ${repoPath} ) if(resultGitStatus) # not supposed to happen, assume clean anyway message(WARNING "${CPM_INDENT} Calling git status on folder ${repoPath} failed") set(${isClean} TRUE PARENT_SCOPE ) return() endif() if(NOT "${repoStatus}" STREQUAL "") set(${isClean} FALSE PARENT_SCOPE ) return() endif() # check for committed changes execute_process( COMMAND ${GIT_EXECUTABLE} diff -s --exit-code ${gitTag} RESULT_VARIABLE resultGitDiff OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_QUIET WORKING_DIRECTORY ${repoPath} ) if(${resultGitDiff} EQUAL 0) set(${isClean} TRUE PARENT_SCOPE ) else() set(${isClean} FALSE PARENT_SCOPE ) endif() endfunction() # method to overwrite internal FetchContent properties, to allow using CPM.cmake to overload # FetchContent calls. 
As these are internal cmake properties, this method should be used carefully # and may need modification in future CMake versions. Source: # https://github.com/Kitware/CMake/blob/dc3d0b5a0a7d26d43d6cfeb511e224533b5d188f/Modules/FetchContent.cmake#L1152 function(cpm_override_fetchcontent contentName) cmake_parse_arguments(PARSE_ARGV 1 arg "" "SOURCE_DIR;BINARY_DIR" "") if(NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "") message(FATAL_ERROR "${CPM_INDENT} Unsupported arguments: ${arg_UNPARSED_ARGUMENTS}") endif() string(TOLOWER ${contentName} contentNameLower) set(prefix "_FetchContent_${contentNameLower}") set(propertyName "${prefix}_sourceDir") define_property( GLOBAL PROPERTY ${propertyName} BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" ) set_property(GLOBAL PROPERTY ${propertyName} "${arg_SOURCE_DIR}") set(propertyName "${prefix}_binaryDir") define_property( GLOBAL PROPERTY ${propertyName} BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" ) set_property(GLOBAL PROPERTY ${propertyName} "${arg_BINARY_DIR}") set(propertyName "${prefix}_populated") define_property( GLOBAL PROPERTY ${propertyName} BRIEF_DOCS "Internal implementation detail of FetchContent_Populate()" FULL_DOCS "Details used by FetchContent_Populate() for ${contentName}" ) set_property(GLOBAL PROPERTY ${propertyName} TRUE) endfunction() # Download and add a package from source function(CPMAddPackage) cpm_set_policies() list(LENGTH ARGN argnLength) if(argnLength EQUAL 1) cpm_parse_add_package_single_arg("${ARGN}" ARGN) # The shorthand syntax implies EXCLUDE_FROM_ALL and SYSTEM set(ARGN "${ARGN};EXCLUDE_FROM_ALL;YES;SYSTEM;YES;") endif() set(oneValueArgs NAME FORCE VERSION GIT_TAG DOWNLOAD_ONLY GITHUB_REPOSITORY GITLAB_REPOSITORY BITBUCKET_REPOSITORY GIT_REPOSITORY SOURCE_DIR FIND_PACKAGE_ARGUMENTS NO_CACHE SYSTEM GIT_SHALLOW EXCLUDE_FROM_ALL SOURCE_SUBDIR ) set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND) cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}") # Set default values for arguments if(NOT DEFINED CPM_ARGS_VERSION) if(DEFINED CPM_ARGS_GIT_TAG) cpm_get_version_from_git_tag("${CPM_ARGS_GIT_TAG}" CPM_ARGS_VERSION) endif() endif() if(CPM_ARGS_DOWNLOAD_ONLY) set(DOWNLOAD_ONLY ${CPM_ARGS_DOWNLOAD_ONLY}) else() set(DOWNLOAD_ONLY NO) endif() if(DEFINED CPM_ARGS_GITHUB_REPOSITORY) set(CPM_ARGS_GIT_REPOSITORY "https://github.com/${CPM_ARGS_GITHUB_REPOSITORY}.git") elseif(DEFINED CPM_ARGS_GITLAB_REPOSITORY) set(CPM_ARGS_GIT_REPOSITORY "https://gitlab.com/${CPM_ARGS_GITLAB_REPOSITORY}.git") elseif(DEFINED CPM_ARGS_BITBUCKET_REPOSITORY) set(CPM_ARGS_GIT_REPOSITORY "https://bitbucket.org/${CPM_ARGS_BITBUCKET_REPOSITORY}.git") endif() if(DEFINED CPM_ARGS_GIT_REPOSITORY) list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_REPOSITORY ${CPM_ARGS_GIT_REPOSITORY}) if(NOT DEFINED CPM_ARGS_GIT_TAG) set(CPM_ARGS_GIT_TAG v${CPM_ARGS_VERSION}) endif() # If a name wasn't provided, try to infer it from the git repo if(NOT DEFINED CPM_ARGS_NAME) cpm_package_name_from_git_uri(${CPM_ARGS_GIT_REPOSITORY} CPM_ARGS_NAME) endif() endif() set(CPM_SKIP_FETCH FALSE) if(DEFINED CPM_ARGS_GIT_TAG) list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_TAG ${CPM_ARGS_GIT_TAG}) # If GIT_SHALLOW is explicitly specified, honor the value. 
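    # Illustrative call (comment added; the name, URL and hash below are placeholders):
    #   CPMAddPackage(NAME mylib
    #                 GIT_REPOSITORY https://example.com/mylib.git
    #                 GIT_TAG        a1b2c3d4e5f6a7b8a1b2c3d4e5f6a7b8a1b2c3d4
    #                 GIT_SHALLOW    FALSE)
    # An explicit GIT_SHALLOW value bypasses the hash-based heuristic applied further below.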
if(DEFINED CPM_ARGS_GIT_SHALLOW) list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW ${CPM_ARGS_GIT_SHALLOW}) endif() endif() if(DEFINED CPM_ARGS_URL) # If a name or version aren't provided, try to infer them from the URL list(GET CPM_ARGS_URL 0 firstUrl) cpm_package_name_and_ver_from_url(${firstUrl} nameFromUrl verFromUrl) # If we fail to obtain name and version from the first URL, we could try other URLs if any. # However multiple URLs are expected to be quite rare, so for now we won't bother. # If the caller provided their own name and version, they trump the inferred ones. if(NOT DEFINED CPM_ARGS_NAME) set(CPM_ARGS_NAME ${nameFromUrl}) endif() if(NOT DEFINED CPM_ARGS_VERSION) set(CPM_ARGS_VERSION ${verFromUrl}) endif() list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS URL "${CPM_ARGS_URL}") endif() # Check for required arguments if(NOT DEFINED CPM_ARGS_NAME) message( FATAL_ERROR "${CPM_INDENT} 'NAME' was not provided and couldn't be automatically inferred for package added with arguments: '${ARGN}'" ) endif() # Check if package has been added before cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") if(CPM_PACKAGE_ALREADY_ADDED) cpm_export_variables(${CPM_ARGS_NAME}) return() endif() # Check for manual overrides if(NOT CPM_ARGS_FORCE AND NOT "${CPM_${CPM_ARGS_NAME}_SOURCE}" STREQUAL "") set(PACKAGE_SOURCE ${CPM_${CPM_ARGS_NAME}_SOURCE}) set(CPM_${CPM_ARGS_NAME}_SOURCE "") CPMAddPackage( NAME "${CPM_ARGS_NAME}" SOURCE_DIR "${PACKAGE_SOURCE}" EXCLUDE_FROM_ALL "${CPM_ARGS_EXCLUDE_FROM_ALL}" SYSTEM "${CPM_ARGS_SYSTEM}" OPTIONS "${CPM_ARGS_OPTIONS}" SOURCE_SUBDIR "${CPM_ARGS_SOURCE_SUBDIR}" DOWNLOAD_ONLY "${DOWNLOAD_ONLY}" FORCE True ) cpm_export_variables(${CPM_ARGS_NAME}) return() endif() # Check for available declaration if(NOT CPM_ARGS_FORCE AND NOT "${CPM_DECLARATION_${CPM_ARGS_NAME}}" STREQUAL "") set(declaration ${CPM_DECLARATION_${CPM_ARGS_NAME}}) set(CPM_DECLARATION_${CPM_ARGS_NAME} "") CPMAddPackage(${declaration}) cpm_export_variables(${CPM_ARGS_NAME}) # checking again to ensure version and option compatibility cpm_check_if_package_already_added(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}") return() endif() if(NOT CPM_ARGS_FORCE) if(CPM_USE_LOCAL_PACKAGES OR CPM_LOCAL_PACKAGES_ONLY) cpm_find_package(${CPM_ARGS_NAME} "${CPM_ARGS_VERSION}" ${CPM_ARGS_FIND_PACKAGE_ARGUMENTS}) if(CPM_PACKAGE_FOUND) cpm_export_variables(${CPM_ARGS_NAME}) return() endif() if(CPM_LOCAL_PACKAGES_ONLY) message( SEND_ERROR "${CPM_INDENT} ${CPM_ARGS_NAME} not found via find_package(${CPM_ARGS_NAME} ${CPM_ARGS_VERSION})" ) endif() endif() endif() CPMRegisterPackage("${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}") if(DEFINED CPM_ARGS_GIT_TAG) set(PACKAGE_INFO "${CPM_ARGS_GIT_TAG}") elseif(DEFINED CPM_ARGS_SOURCE_DIR) set(PACKAGE_INFO "${CPM_ARGS_SOURCE_DIR}") else() set(PACKAGE_INFO "${CPM_ARGS_VERSION}") endif() if(DEFINED FETCHCONTENT_BASE_DIR) # respect user's FETCHCONTENT_BASE_DIR if set set(CPM_FETCHCONTENT_BASE_DIR ${FETCHCONTENT_BASE_DIR}) else() set(CPM_FETCHCONTENT_BASE_DIR ${CMAKE_BINARY_DIR}/_deps) endif() if(DEFINED CPM_ARGS_DOWNLOAD_COMMAND) list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS DOWNLOAD_COMMAND ${CPM_ARGS_DOWNLOAD_COMMAND}) elseif(DEFINED CPM_ARGS_SOURCE_DIR) list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${CPM_ARGS_SOURCE_DIR}) if(NOT IS_ABSOLUTE ${CPM_ARGS_SOURCE_DIR}) # Expand `CPM_ARGS_SOURCE_DIR` relative path. This is important because EXISTS doesn't work # for relative paths. 
get_filename_component( source_directory ${CPM_ARGS_SOURCE_DIR} REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR} ) else() set(source_directory ${CPM_ARGS_SOURCE_DIR}) endif() if(NOT EXISTS ${source_directory}) string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) # remove timestamps so CMake will re-download the dependency file(REMOVE_RECURSE "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild") endif() elseif(CPM_SOURCE_CACHE AND NOT CPM_ARGS_NO_CACHE) string(TOLOWER ${CPM_ARGS_NAME} lower_case_name) set(origin_parameters ${CPM_ARGS_UNPARSED_ARGUMENTS}) list(SORT origin_parameters) if(CPM_USE_NAMED_CACHE_DIRECTORIES) string(SHA1 origin_hash "${origin_parameters};NEW_CACHE_STRUCTURE_TAG") set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}/${CPM_ARGS_NAME}) else() string(SHA1 origin_hash "${origin_parameters}") set(download_directory ${CPM_SOURCE_CACHE}/${lower_case_name}/${origin_hash}) endif() # Expand `download_directory` relative path. This is important because EXISTS doesn't work for # relative paths. get_filename_component(download_directory ${download_directory} ABSOLUTE) list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS SOURCE_DIR ${download_directory}) if(CPM_SOURCE_CACHE) file(LOCK ${download_directory}/../cmake.lock) endif() if(EXISTS ${download_directory}) if(CPM_SOURCE_CACHE) file(LOCK ${download_directory}/../cmake.lock RELEASE) endif() cpm_store_fetch_properties( ${CPM_ARGS_NAME} "${download_directory}" "${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-build" ) cpm_get_fetch_properties("${CPM_ARGS_NAME}") if(DEFINED CPM_ARGS_GIT_TAG AND NOT (PATCH_COMMAND IN_LIST CPM_ARGS_UNPARSED_ARGUMENTS)) # warn if cache has been changed since checkout cpm_check_git_working_dir_is_clean(${download_directory} ${CPM_ARGS_GIT_TAG} IS_CLEAN) if(NOT ${IS_CLEAN}) message( WARNING "${CPM_INDENT} Cache for ${CPM_ARGS_NAME} (${download_directory}) is dirty" ) endif() endif() cpm_add_subdirectory( "${CPM_ARGS_NAME}" "${DOWNLOAD_ONLY}" "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" "${${CPM_ARGS_NAME}_BINARY_DIR}" "${CPM_ARGS_EXCLUDE_FROM_ALL}" "${CPM_ARGS_SYSTEM}" "${CPM_ARGS_OPTIONS}" ) set(PACKAGE_INFO "${PACKAGE_INFO} at ${download_directory}") # As the source dir is already cached/populated, we override the call to FetchContent. set(CPM_SKIP_FETCH TRUE) cpm_override_fetchcontent( "${lower_case_name}" SOURCE_DIR "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" BINARY_DIR "${${CPM_ARGS_NAME}_BINARY_DIR}" ) else() # Enable shallow clone when GIT_TAG is not a commit hash. Our guess may not be accurate, but # it should guarantee no commit hash get mis-detected. 
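      # (Added note, hedged): a shallow (depth 1) clone only fetches the tip of a ref
      # such as a branch or tag, so an arbitrary commit hash may not be obtainable from
      # it. Shallow cloning is therefore enabled only when GIT_TAG does not look like a
      # commit hash.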
if(NOT DEFINED CPM_ARGS_GIT_SHALLOW) cpm_is_git_tag_commit_hash("${CPM_ARGS_GIT_TAG}" IS_HASH) if(NOT ${IS_HASH}) list(APPEND CPM_ARGS_UNPARSED_ARGUMENTS GIT_SHALLOW TRUE) endif() endif() # remove timestamps so CMake will re-download the dependency file(REMOVE_RECURSE ${CPM_FETCHCONTENT_BASE_DIR}/${lower_case_name}-subbuild) set(PACKAGE_INFO "${PACKAGE_INFO} to ${download_directory}") endif() endif() cpm_create_module_file(${CPM_ARGS_NAME} "CPMAddPackage(\"${ARGN}\")") if(CPM_PACKAGE_LOCK_ENABLED) if((CPM_ARGS_VERSION AND NOT CPM_ARGS_SOURCE_DIR) OR CPM_INCLUDE_ALL_IN_PACKAGE_LOCK) cpm_add_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") elseif(CPM_ARGS_SOURCE_DIR) cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "local directory") else() cpm_add_comment_to_package_lock(${CPM_ARGS_NAME} "${ARGN}") endif() endif() cpm_message( STATUS "${CPM_INDENT} Adding package ${CPM_ARGS_NAME}@${CPM_ARGS_VERSION} (${PACKAGE_INFO})" ) if(NOT CPM_SKIP_FETCH) cpm_declare_fetch( "${CPM_ARGS_NAME}" "${CPM_ARGS_VERSION}" "${PACKAGE_INFO}" "${CPM_ARGS_UNPARSED_ARGUMENTS}" ) cpm_fetch_package("${CPM_ARGS_NAME}" populated) if(CPM_SOURCE_CACHE AND download_directory) file(LOCK ${download_directory}/../cmake.lock RELEASE) endif() if(${populated}) cpm_add_subdirectory( "${CPM_ARGS_NAME}" "${DOWNLOAD_ONLY}" "${${CPM_ARGS_NAME}_SOURCE_DIR}/${CPM_ARGS_SOURCE_SUBDIR}" "${${CPM_ARGS_NAME}_BINARY_DIR}" "${CPM_ARGS_EXCLUDE_FROM_ALL}" "${CPM_ARGS_SYSTEM}" "${CPM_ARGS_OPTIONS}" ) endif() cpm_get_fetch_properties("${CPM_ARGS_NAME}") endif() set(${CPM_ARGS_NAME}_ADDED YES) cpm_export_variables("${CPM_ARGS_NAME}") endfunction() # Fetch a previously declared package macro(CPMGetPackage Name) if(DEFINED "CPM_DECLARATION_${Name}") CPMAddPackage(NAME ${Name}) else() message(SEND_ERROR "${CPM_INDENT} Cannot retrieve package ${Name}: no declaration available") endif() endmacro() # export variables available to the caller to the parent scope expects ${CPM_ARGS_NAME} to be set macro(cpm_export_variables name) set(${name}_SOURCE_DIR "${${name}_SOURCE_DIR}" PARENT_SCOPE ) set(${name}_BINARY_DIR "${${name}_BINARY_DIR}" PARENT_SCOPE ) set(${name}_ADDED "${${name}_ADDED}" PARENT_SCOPE ) set(CPM_LAST_PACKAGE_NAME "${name}" PARENT_SCOPE ) endmacro() # declares a package, so that any call to CPMAddPackage for the package name will use these # arguments instead. Previous declarations will not be overridden. 
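# Illustrative use of the declare/get pair (comment added; `fmt` and the tag are
# placeholder values):
#   CPMDeclarePackage(fmt
#     GITHUB_REPOSITORY fmtlib/fmt
#     GIT_TAG 10.2.1
#   )
#   ...
#   CPMGetPackage(fmt)   # adds the package using the declaration above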
macro(CPMDeclarePackage Name) if(NOT DEFINED "CPM_DECLARATION_${Name}") set("CPM_DECLARATION_${Name}" "${ARGN}") endif() endmacro() function(cpm_add_to_package_lock Name) if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) cpm_prettify_package_arguments(PRETTY_ARGN false ${ARGN}) file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name}\nCPMDeclarePackage(${Name}\n${PRETTY_ARGN})\n") endif() endfunction() function(cpm_add_comment_to_package_lock Name) if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) cpm_prettify_package_arguments(PRETTY_ARGN true ${ARGN}) file(APPEND ${CPM_PACKAGE_LOCK_FILE} "# ${Name} (unversioned)\n# CPMDeclarePackage(${Name}\n${PRETTY_ARGN}#)\n" ) endif() endfunction() # includes the package lock file if it exists and creates a target `cpm-update-package-lock` to # update it macro(CPMUsePackageLock file) if(NOT CPM_DONT_CREATE_PACKAGE_LOCK) get_filename_component(CPM_ABSOLUTE_PACKAGE_LOCK_PATH ${file} ABSOLUTE) if(EXISTS ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) include(${CPM_ABSOLUTE_PACKAGE_LOCK_PATH}) endif() if(NOT TARGET cpm-update-package-lock) add_custom_target( cpm-update-package-lock COMMAND ${CMAKE_COMMAND} -E copy ${CPM_PACKAGE_LOCK_FILE} ${CPM_ABSOLUTE_PACKAGE_LOCK_PATH} ) endif() set(CPM_PACKAGE_LOCK_ENABLED true) endif() endmacro() # registers a package that has been added to CPM function(CPMRegisterPackage PACKAGE VERSION) list(APPEND CPM_PACKAGES ${PACKAGE}) set(CPM_PACKAGES ${CPM_PACKAGES} CACHE INTERNAL "" ) set("CPM_PACKAGE_${PACKAGE}_VERSION" ${VERSION} CACHE INTERNAL "" ) endfunction() # retrieve the current version of the package to ${OUTPUT} function(CPMGetPackageVersion PACKAGE OUTPUT) set(${OUTPUT} "${CPM_PACKAGE_${PACKAGE}_VERSION}" PARENT_SCOPE ) endfunction() # declares a package in FetchContent_Declare function(cpm_declare_fetch PACKAGE VERSION INFO) if(${CPM_DRY_RUN}) cpm_message(STATUS "${CPM_INDENT} Package not declared (dry run)") return() endif() FetchContent_Declare(${PACKAGE} ${ARGN}) endfunction() # returns properties for a package previously defined by cpm_declare_fetch function(cpm_get_fetch_properties PACKAGE) if(${CPM_DRY_RUN}) return() endif() set(${PACKAGE}_SOURCE_DIR "${CPM_PACKAGE_${PACKAGE}_SOURCE_DIR}" PARENT_SCOPE ) set(${PACKAGE}_BINARY_DIR "${CPM_PACKAGE_${PACKAGE}_BINARY_DIR}" PARENT_SCOPE ) endfunction() function(cpm_store_fetch_properties PACKAGE source_dir binary_dir) if(${CPM_DRY_RUN}) return() endif() set(CPM_PACKAGE_${PACKAGE}_SOURCE_DIR "${source_dir}" CACHE INTERNAL "" ) set(CPM_PACKAGE_${PACKAGE}_BINARY_DIR "${binary_dir}" CACHE INTERNAL "" ) endfunction() # adds a package as a subdirectory if viable, according to provided options function( cpm_add_subdirectory PACKAGE DOWNLOAD_ONLY SOURCE_DIR BINARY_DIR EXCLUDE SYSTEM OPTIONS ) if(NOT DOWNLOAD_ONLY AND EXISTS ${SOURCE_DIR}/CMakeLists.txt) set(addSubdirectoryExtraArgs "") if(EXCLUDE) list(APPEND addSubdirectoryExtraArgs EXCLUDE_FROM_ALL) endif() if("${SYSTEM}" AND "${CMAKE_VERSION}" VERSION_GREATER_EQUAL "3.25") # https://cmake.org/cmake/help/latest/prop_dir/SYSTEM.html#prop_dir:SYSTEM list(APPEND addSubdirectoryExtraArgs SYSTEM) endif() if(OPTIONS) foreach(OPTION ${OPTIONS}) cpm_parse_option("${OPTION}") set(${OPTION_KEY} "${OPTION_VALUE}") endforeach() endif() set(CPM_OLD_INDENT "${CPM_INDENT}") set(CPM_INDENT "${CPM_INDENT} ${PACKAGE}:") add_subdirectory(${SOURCE_DIR} ${BINARY_DIR} ${addSubdirectoryExtraArgs}) set(CPM_INDENT "${CPM_OLD_INDENT}") endif() endfunction() # downloads a previously declared package via FetchContent and exports the variables # `${PACKAGE}_SOURCE_DIR` and 
`${PACKAGE}_BINARY_DIR` to the parent scope function(cpm_fetch_package PACKAGE populated) set(${populated} FALSE PARENT_SCOPE ) if(${CPM_DRY_RUN}) cpm_message(STATUS "${CPM_INDENT} Package ${PACKAGE} not fetched (dry run)") return() endif() FetchContent_GetProperties(${PACKAGE}) string(TOLOWER "${PACKAGE}" lower_case_name) if(NOT ${lower_case_name}_POPULATED) FetchContent_Populate(${PACKAGE}) set(${populated} TRUE PARENT_SCOPE ) endif() cpm_store_fetch_properties( ${CPM_ARGS_NAME} ${${lower_case_name}_SOURCE_DIR} ${${lower_case_name}_BINARY_DIR} ) set(${PACKAGE}_SOURCE_DIR ${${lower_case_name}_SOURCE_DIR} PARENT_SCOPE ) set(${PACKAGE}_BINARY_DIR ${${lower_case_name}_BINARY_DIR} PARENT_SCOPE ) endfunction() # splits a package option function(cpm_parse_option OPTION) string(REGEX MATCH "^[^ ]+" OPTION_KEY "${OPTION}") string(LENGTH "${OPTION}" OPTION_LENGTH) string(LENGTH "${OPTION_KEY}" OPTION_KEY_LENGTH) if(OPTION_KEY_LENGTH STREQUAL OPTION_LENGTH) # no value for key provided, assume user wants to set option to "ON" set(OPTION_VALUE "ON") else() math(EXPR OPTION_KEY_LENGTH "${OPTION_KEY_LENGTH}+1") string(SUBSTRING "${OPTION}" "${OPTION_KEY_LENGTH}" "-1" OPTION_VALUE) endif() set(OPTION_KEY "${OPTION_KEY}" PARENT_SCOPE ) set(OPTION_VALUE "${OPTION_VALUE}" PARENT_SCOPE ) endfunction() # guesses the package version from a git tag function(cpm_get_version_from_git_tag GIT_TAG RESULT) string(LENGTH ${GIT_TAG} length) if(length EQUAL 40) # GIT_TAG is probably a git hash set(${RESULT} 0 PARENT_SCOPE ) else() string(REGEX MATCH "v?([0123456789.]*).*" _ ${GIT_TAG}) set(${RESULT} ${CMAKE_MATCH_1} PARENT_SCOPE ) endif() endfunction() # guesses if the git tag is a commit hash or an actual tag or a branch name. function(cpm_is_git_tag_commit_hash GIT_TAG RESULT) string(LENGTH "${GIT_TAG}" length) # full hash has 40 characters, and short hash has at least 7 characters. 
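  # Illustrative classifications (comment added for clarity):
  #   "main"          -> not a hash (fails the length check: fewer than 7 characters)
  #   "release/2.5.0" -> not a hash (length is fine, but it contains non-hex characters)
  #   "deadbeef"      -> treated as a short hash (8 hexadecimal characters)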
if(length LESS 7 OR length GREATER 40) set(${RESULT} 0 PARENT_SCOPE ) else() if(${GIT_TAG} MATCHES "^[a-fA-F0-9]+$") set(${RESULT} 1 PARENT_SCOPE ) else() set(${RESULT} 0 PARENT_SCOPE ) endif() endif() endfunction() function(cpm_prettify_package_arguments OUT_VAR IS_IN_COMMENT) set(oneValueArgs NAME FORCE VERSION GIT_TAG DOWNLOAD_ONLY GITHUB_REPOSITORY GITLAB_REPOSITORY GIT_REPOSITORY SOURCE_DIR FIND_PACKAGE_ARGUMENTS NO_CACHE SYSTEM GIT_SHALLOW ) set(multiValueArgs URL OPTIONS DOWNLOAD_COMMAND) cmake_parse_arguments(CPM_ARGS "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) foreach(oneArgName ${oneValueArgs}) if(DEFINED CPM_ARGS_${oneArgName}) if(${IS_IN_COMMENT}) string(APPEND PRETTY_OUT_VAR "#") endif() if(${oneArgName} STREQUAL "SOURCE_DIR") string(REPLACE ${CMAKE_SOURCE_DIR} "\${CMAKE_SOURCE_DIR}" CPM_ARGS_${oneArgName} ${CPM_ARGS_${oneArgName}} ) endif() string(APPEND PRETTY_OUT_VAR " ${oneArgName} ${CPM_ARGS_${oneArgName}}\n") endif() endforeach() foreach(multiArgName ${multiValueArgs}) if(DEFINED CPM_ARGS_${multiArgName}) if(${IS_IN_COMMENT}) string(APPEND PRETTY_OUT_VAR "#") endif() string(APPEND PRETTY_OUT_VAR " ${multiArgName}\n") foreach(singleOption ${CPM_ARGS_${multiArgName}}) if(${IS_IN_COMMENT}) string(APPEND PRETTY_OUT_VAR "#") endif() string(APPEND PRETTY_OUT_VAR " \"${singleOption}\"\n") endforeach() endif() endforeach() if(NOT "${CPM_ARGS_UNPARSED_ARGUMENTS}" STREQUAL "") if(${IS_IN_COMMENT}) string(APPEND PRETTY_OUT_VAR "#") endif() string(APPEND PRETTY_OUT_VAR " ") foreach(CPM_ARGS_UNPARSED_ARGUMENT ${CPM_ARGS_UNPARSED_ARGUMENTS}) string(APPEND PRETTY_OUT_VAR " ${CPM_ARGS_UNPARSED_ARGUMENT}") endforeach() string(APPEND PRETTY_OUT_VAR "\n") endif() set(${OUT_VAR} ${PRETTY_OUT_VAR} PARENT_SCOPE ) endfunction() cccl-2.5.0/cub/cmake/CubAddSubdir.cmake000066400000000000000000000002051463375617100175620ustar00rootroot00000000000000find_package(CUB REQUIRED CONFIG NO_DEFAULT_PATH # Only check the explicit path in HINTS: HINTS "${CMAKE_CURRENT_LIST_DIR}/.." ) cccl-2.5.0/cub/cmake/CubBuildCompilerTargets.cmake000066400000000000000000000142351463375617100220150ustar00rootroot00000000000000# # This file defines the `cub_build_compiler_targets()` function, which # creates the following interface targets: # # cub.compiler_interface # - Interface target providing compiler-specific options needed to build # CUB's tests, examples, etc. function(cub_build_compiler_targets) set(cxx_compile_definitions) set(cxx_compile_options) set(cuda_compile_options) # Ensure that we build our tests without treating ourself as system header list(APPEND cxx_compile_definitions "_CCCL_NO_SYSTEM_HEADER") if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") list(APPEND cxx_compile_definitions _ENABLE_EXTENDED_ALIGNED_STORAGE) list(APPEND cuda_compile_options "--use-local-env") # sccache cannot handle the -Fd option generationg pdb files set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded) append_option_if_available("/W4" cxx_compile_options) append_option_if_available("/WX" cxx_compile_options) # Suppress overly-pedantic/unavoidable warnings brought in with /W4: # C4324: structure was padded due to alignment specifier append_option_if_available("/wd4324" cxx_compile_options) # C4505: unreferenced local function has been removed # The CUDA `host_runtime.h` header emits this for # `__cudaUnregisterBinaryUtil`. append_option_if_available("/wd4505" cxx_compile_options) # C4706: assignment within conditional expression # MSVC doesn't provide an opt-out for this warning when the assignment is # intentional. 
Clang will warn for these, but suppresses the warning when # double-parentheses are used around the assignment. We'll let Clang catch # unintentional assignments and suppress all such warnings on MSVC. append_option_if_available("/wd4706" cxx_compile_options) # Some tests require /bigobj to fit everything into their object files: append_option_if_available("/bigobj" cxx_compile_options) else() append_option_if_available("-Wreorder" cuda_compile_options) append_option_if_available("-Werror" cxx_compile_options) append_option_if_available("-Wall" cxx_compile_options) append_option_if_available("-Wextra" cxx_compile_options) append_option_if_available("-Winit-self" cxx_compile_options) append_option_if_available("-Woverloaded-virtual" cxx_compile_options) append_option_if_available("-Wcast-qual" cxx_compile_options) append_option_if_available("-Wpointer-arith" cxx_compile_options) append_option_if_available("-Wunused-local-typedef" cxx_compile_options) append_option_if_available("-Wvla" cxx_compile_options) # Disable GNU extensions (flag is clang only) append_option_if_available("-Wgnu" cxx_compile_options) append_option_if_available("-Wno-gnu-line-marker" cxx_compile_options) # WAR 3916341 # Calling a variadic macro with zero args is a GNU extension until C++20, # but the THRUST_PP_ARITY macro is used with zero args. Need to see if this # is a real problem worth fixing. append_option_if_available("-Wno-gnu-zero-variadic-macro-arguments" cxx_compile_options) # This complains about functions in CUDA system headers when used with nvcc. append_option_if_available("-Wno-unused-function" cxx_compile_options) endif() if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3) # GCC 7.3 complains about name mangling changes due to `noexcept` # becoming part of the type system; we don't care. append_option_if_available("-Wno-noexcept-type" cxx_compile_options) endif() endif() if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}") # Do not flush denormal floats to zero append_option_if_available("-no-ftz" cxx_compile_options) # Disable warning that inlining is inhibited by compiler thresholds. append_option_if_available("-diag-disable=11074" cxx_compile_options) append_option_if_available("-diag-disable=11076" cxx_compile_options) # Disable warning about deprecated classic compiler append_option_if_available("-diag-disable=10441" cxx_compile_options) endif() if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") option(CUB_ENABLE_CT_PROFILING "Enable compilation time profiling" OFF) if (CUB_ENABLE_CT_PROFILING) append_option_if_available("-ftime-trace" cxx_compile_options) endif() endif() if ("NVHPC" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") # Do not flush denormal floats to zero list(APPEND cxx_compile_options -Mnodaz) # TODO: Managed memory is currently not supported on windows with WSL list(APPEND cxx_compile_options -gpu=nomanaged) endif() add_library(cub.compiler_interface INTERFACE) foreach (cxx_option IN LISTS cxx_compile_options) target_compile_options(cub.compiler_interface INTERFACE $<$:${cxx_option}> $<$:${cxx_option}> # Only use -Xcompiler with NVCC, not NVC++. 
# # CMake can't split genexs, so this can't be formatted better :( # This is: # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt: $<$:-Xcompiler=${cxx_option}> ) endforeach() foreach (cuda_option IN LISTS cuda_compile_options) target_compile_options(cub.compiler_interface INTERFACE $<$:${cuda_option}> ) endforeach() # Add these for both CUDA and CXX targets: target_compile_definitions(cub.compiler_interface INTERFACE ${cxx_compile_definitions} ) # Promote warnings and display diagnostic numbers for nvcc: target_compile_options(cub.compiler_interface INTERFACE # If using CUDA w/ NVCC... # Display diagnostic numbers. $<$:-Xcudafe=--display_error_number> # Promote warnings. $<$:-Xcudafe=--promote_warnings> # Don't complain about deprecated GPU targets. $<$:-Wno-deprecated-gpu-targets> ) if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") # Use the local env instead of rebuilding it all the time target_compile_options(cub.compiler_interface INTERFACE # If using CUDA w/ NVCC... $<$:--use-local-env> ) endif() endfunction() cccl-2.5.0/cub/cmake/CubBuildTargetList.cmake000066400000000000000000000223421463375617100207710ustar00rootroot00000000000000# This file provides utilities for building and working with CUB # configuration targets. # # CUB_TARGETS # - Built by the calling the `cub_build_target_list()` function. # - Each item is the name of a CUB interface target that is configured for a # certain build configuration. Currently only C++ standard dialect is # considered. # # cub_build_target_list() # - Creates the CUB_TARGETS list. # # The following functions can be used to test/set metadata on a CUB target: # # cub_get_target_property( ) # - Checks the ${prop} target property on CUB target ${target_name} # and sets the ${prop_var} variable in the caller's scope. # - is any valid cmake identifier. # - is the name of a CUB target. # - is one of the following: # - DIALECT: The C++ dialect. Valid values: 11, 14, 17, 20. # - PREFIX: A unique prefix that should be used to name all # targets/tests/examples that use this configuration. # # cub_get_target_properties() # - Defines ${target_name}_${prop} in the caller's scope, for `prop` in: # {DIALECT, PREFIX}. See above for details. # # cub_clone_target_properties( ) # - Set the {DIALECT, PREFIX} metadata on ${dst_target} to match # ${src_target}. See above for details. # - This *MUST* be called on any targets that link to another CUB target # to ensure that dialect information is updated correctly, e.g. # `cub_clone_target_properties(${my_cub_test} ${some_cub_target})` # Dialects: set(CUB_CPP_DIALECT_OPTIONS 11 14 17 20 CACHE INTERNAL "C++ dialects supported by CUB." FORCE ) define_property(TARGET PROPERTY _CUB_DIALECT BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17." FULL_DOCS "A target's C++ dialect: 11, 14, or 17." ) define_property(TARGET PROPERTY _CUB_PREFIX BRIEF_DOCS "A prefix describing the config, eg. 'cub.cpp14'." FULL_DOCS "A prefix describing the config, eg. 'cub.cpp14'." 
) function(cub_set_target_properties target_name dialect prefix) set_target_properties(${target_name} PROPERTIES _CUB_DIALECT ${dialect} _CUB_PREFIX ${prefix} ) get_target_property(type ${target_name} TYPE) if (NOT ${type} STREQUAL "INTERFACE_LIBRARY") set_target_properties(${target_name} PROPERTIES CXX_STANDARD ${dialect} CUDA_STANDARD ${dialect} ARCHIVE_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" LIBRARY_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${CUB_EXECUTABLE_OUTPUT_DIR}" ) endif() endfunction() # Get a cub property from a target and store it in var_name # cub_get_target_property( [DIALECT|PREFIX] macro(cub_get_target_property prop_var target_name prop) get_property(${prop_var} TARGET ${target_name} PROPERTY _CUB_${prop}) endmacro() # Defines the following string variables in the caller's scope: # - ${target_name}_DIALECT # - ${target_name}_PREFIX macro(cub_get_target_properties target_name) cub_get_target_property(${target_name}_DIALECT ${target_name} DIALECT) cub_get_target_property(${target_name}_PREFIX ${target_name} PREFIX) endmacro() # Set one target's _CUB_* properties to match another target function(cub_clone_target_properties dst_target src_target) cub_get_target_properties(${src_target}) cub_set_target_properties(${dst_target} ${${src_target}_DIALECT} ${${src_target}_PREFIX} ) endfunction() # Set ${var_name} to TRUE or FALSE in the caller's scope function(_cub_is_config_valid var_name dialect) if (CUB_ENABLE_DIALECT_CPP${dialect}) set(${var_name} TRUE PARENT_SCOPE) else() set(${var_name} FALSE PARENT_SCOPE) endif() endfunction() function(_cub_init_target_list) set(CUB_TARGETS "" CACHE INTERNAL "" FORCE) endfunction() function(_cub_add_target_to_target_list target_name dialect prefix) cub_set_target_properties(${target_name} ${dialect} ${prefix}) target_link_libraries(${target_name} INTERFACE CUB::CUB cub.compiler_interface ) if (TARGET cub.thrust) target_link_libraries(${target_name} INTERFACE cub.thrust) endif() set(CUB_TARGETS ${CUB_TARGETS} ${target_name} CACHE INTERNAL "" FORCE) set(label "cpp${dialect}") string(TOLOWER "${label}" label) message(STATUS "Enabling CUB configuration: ${label}") endfunction() # Build a ${CUB_TARGETS} list containing target names for all # requested configurations function(cub_build_target_list) # Clear the list of targets: _cub_init_target_list() # Handle dialect options: set(num_dialects_enabled 0) foreach (dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) if (CUB_IN_THRUST) # Just use Thrust's settings: if (THRUST_ENABLE_MULTICONFIG) set(CUB_ENABLE_DIALECT_CPP${dialect} ${THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}} ) else() set(val OFF) if (dialect EQUAL ${THRUST_CPP_DIALECT}) set(val ON) endif() set(CUB_ENABLE_DIALECT_CPP${dialect} ${val}) endif() else() # Create CMake options: set(default_value OFF) if (dialect EQUAL 14) # Default to just 14 on: set(default_value ON) endif() option(CUB_ENABLE_DIALECT_CPP${dialect} "Generate C++${dialect} build configurations." ${default_value} ) endif() if (CUB_ENABLE_DIALECT_CPP${dialect}) math(EXPR num_dialects_enabled "${num_dialects_enabled} + 1") endif() endforeach() # Ensure that only one C++ dialect is enabled when dialect info is hidden: if ((NOT CUB_ENABLE_CPP_DIALECT_IN_NAMES) AND (NOT num_dialects_enabled EQUAL 1)) message(FATAL_ERROR "Only one CUB_ENABLE_DIALECT_CPP## option allowed when " "CUB_ENABLE_CPP_DIALECT_IN_NAMES is OFF." 
) endif() # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3: if (CUB_ENABLE_DIALECT_CPP17) cmake_minimum_required(VERSION 3.18.3) endif() # Supported versions of MSVC do not distinguish between C++11 and C++14. # Warn the user that they may be generating a ton of redundant targets. if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CUB_ENABLE_DIALECT_CPP11) message(WARNING "Supported versions of MSVC (2017+) do not distinguish between C++11 " "and C++14. The requested C++11 targets will be built with C++14." ) endif() # Generic config flags: macro(add_flag_option flag docstring default) set(cub_opt "CUB_${flag}") if (CUB_IN_THRUST) set(thrust_opt "THRUST_${flag}") # Use thrust's settings: set(${cub_opt} ${${thrust_opt}}) else() option(${cub_opt} "${docstring}" "${default}") mark_as_advanced(${cub_opt}) endif() endmacro() add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF) add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF) add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF) # Build cub.compiler_interface with warning flags, etc # This must be called before _cub_add_target_to_target_list. cub_build_compiler_targets() # Set up the CUB target while testing out our find_package scripts. find_package(CUB REQUIRED CONFIG NO_DEFAULT_PATH # Only check the explicit path in HINTS: HINTS "${CUB_SOURCE_DIR}" ) # TODO # Some of the iterators and unittests depend on thrust. We should break the # cyclical dependency by migrating CUB's Thrust bits into Thrust. find_package(Thrust ${CUB_VERSION} EXACT CONFIG HINTS "../thrust" # Monorepo path ) if (Thrust_FOUND) thrust_set_CUB_target(CUB::CUB) thrust_create_target(cub.thrust HOST CPP DEVICE CUDA) else() message(STATUS "Thrust was not found. Set CMake variable 'Thrust_DIR' to the " "thrust-config.cmake file of a Thrust ${CUB_VERSION} installation to " "enable additional testing." ) endif() # Build CUB_TARGETS foreach(dialect IN LISTS CUB_CPP_DIALECT_OPTIONS) _cub_is_config_valid(config_valid ${dialect}) if (config_valid) if (NOT CUB_ENABLE_CPP_DIALECT_IN_NAMES) set(prefix "cub") else() set(prefix "cub.cpp${dialect}") endif() set(target_name "${prefix}") add_library(${target_name} INTERFACE) # Set configuration metadata for this cub interface target: _cub_add_target_to_target_list(${target_name} ${dialect} ${prefix}) endif() endforeach() # dialects list(LENGTH CUB_TARGETS count) message(STATUS "${count} unique cub.dialect configurations generated") # Top level meta-target. Makes it easier to just build CUB targets when # building both CUB and Thrust. Add all project files here so IDEs will be # aware of them. This will not generate build rules. file(GLOB_RECURSE all_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CUB_SOURCE_DIR}/cub/*.cuh" ) # Add a cub.all target that builds all configs. 
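# As a concrete illustration: with CUB_ENABLE_CPP_DIALECT_IN_NAMES=ON and, say, the
# C++14 and C++17 dialects enabled, the block below is expected to create `cub.all`
# plus the per-config meta targets `cub.cpp14.all` and `cub.cpp17.all`; with the
# option OFF, only the single `cub.all` target is created.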
if (NOT CUB_ENABLE_CPP_DIALECT_IN_NAMES) add_custom_target(cub.all) else() add_custom_target(cub.all SOURCES ${all_sources}) # Create meta targets for each config: foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) add_custom_target(${config_prefix}.all) add_dependencies(cub.all ${config_prefix}.all) endforeach() endif() endfunction() cccl-2.5.0/cub/cmake/CubCudaConfig.cmake000066400000000000000000000036621463375617100177350ustar00rootroot00000000000000enable_language(CUDA) # # Architecture options: # # Since we have to filter the arch list based on target features, we don't # currently support the convenience arch flags: if ("all" IN_LIST CMAKE_CUDA_ARCHITECTURES OR "all-major" IN_LIST CMAKE_CUDA_ARCHITECTURES OR "native" IN_LIST CMAKE_CUDA_ARCHITECTURES) message(FATAL_ERROR "The CUB dev build requires an explicit list of architectures in CMAKE_CUDA_ARCHITECTURES. " "The convenience flags of 'all', 'all-major', and 'native' are not supported.\n" "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") endif() # Create a new arch list that only contains arches that support CDP: set(CUB_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) set(CUB_CUDA_ARCHITECTURES_RDC ${CUB_CUDA_ARCHITECTURES}) list(FILTER CUB_CUDA_ARCHITECTURES_RDC EXCLUDE REGEX "53|62|72") message(STATUS "CUB_CUDA_ARCHITECTURES: ${CUB_CUDA_ARCHITECTURES}") message(STATUS "CUB_CUDA_ARCHITECTURES_RDC: ${CUB_CUDA_ARCHITECTURES_RDC}") if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") # Currently, there are linkage issues caused by bugs in interaction between MSBuild and CMake object libraries # that take place with -rdc builds. Changing the default for now. option(CUB_ENABLE_RDC_TESTS "Enable tests that require separable compilation." OFF) else() option(CUB_ENABLE_RDC_TESTS "Enable tests that require separable compilation." ON) endif() option(CUB_FORCE_RDC "Enable separable compilation on all targets that support it." OFF) list(LENGTH CUB_CUDA_ARCHITECTURES_RDC rdc_arch_count) if (rdc_arch_count EQUAL 0) message(NOTICE "Disabling CUB CDPv1 targets as no enabled architectures support it.") set(CUB_ENABLE_RDC_TESTS OFF CACHE BOOL "" FORCE) set(CUB_FORCE_RDC OFF CACHE BOOL "" FORCE) endif() # # Clang CUDA options # if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions") endif () cccl-2.5.0/cub/cmake/CubHeaderTesting.cmake000066400000000000000000000042151463375617100204540ustar00rootroot00000000000000# For every public header, build a translation unit containing `#include
` # to let the compiler try to figure out warnings in that header if it is not otherwise # included in tests, and also to verify if the headers are modular enough. # .inl files are not globbed for, because they are not supposed to be used as public # entrypoints. # Meta target for all configs' header builds: add_custom_target(cub.all.headers) file(GLOB_RECURSE headers RELATIVE "${CUB_SOURCE_DIR}/cub" CONFIGURE_DEPENDS cub/*.cuh ) set(headertest_srcs) foreach (header IN LISTS headers) set(headertest_src "headers/${header}.cu") configure_file("${CUB_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}") list(APPEND headertest_srcs "${headertest_src}") endforeach() function(cub_add_header_test label definitions) foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) set(headertest_target ${config_prefix}.headers.${label}) add_library(${headertest_target} OBJECT ${headertest_srcs}) target_link_libraries(${headertest_target} PUBLIC ${cub_target}) target_compile_definitions(${headertest_target} PRIVATE ${definitions}) cub_clone_target_properties(${headertest_target} ${cub_target}) cub_configure_cuda_target(${headertest_target} RDC ${CUB_FORCE_RDC}) if (CUB_IN_THRUST) thrust_fix_clang_nvcc_build_for(${headertest_target}) endif() add_dependencies(cub.all.headers ${headertest_target}) add_dependencies(${config_prefix}.all ${headertest_target}) endforeach() endfunction() # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: set(header_definitions "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub") cub_add_header_test(base "${header_definitions}") set(header_definitions "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub" "CCCL_DISABLE_BF16_SUPPORT") cub_add_header_test(bf16 "${header_definitions}") set(header_definitions "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" "CUB_WRAPPED_NAMESPACE=wrapped_cub" "CCCL_DISABLE_FP16_SUPPORT") cub_add_header_test(half "${header_definitions}") cccl-2.5.0/cub/cmake/CubInstallRules.cmake000066400000000000000000000033731463375617100203530ustar00rootroot00000000000000# Bring in CMAKE_INSTALL_LIBDIR include(GNUInstallDirs) # CUB is a header library; no need to build anything before installing: set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) install(DIRECTORY "${CUB_SOURCE_DIR}/cub" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING PATTERN "*.cuh" PATTERN "*.hpp" ) install(DIRECTORY "${CUB_SOURCE_DIR}/cub/cmake/" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub" REGEX .*header-search.cmake.* EXCLUDE ) # Need to configure a file to store the infix specified in # CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user set(_CCCL_RELATIVE_LIBDIR "${CMAKE_INSTALL_LIBDIR}") if(_CCCL_RELATIVE_LIBDIR MATCHES "^${CMAKE_INSTALL_PREFIX}") # libdir is an abs string that starts with prefix string(LENGTH "${CMAKE_INSTALL_PREFIX}" to_remove) string(SUBSTRING "${_CCCL_RELATIVE_LIBDIR}" ${to_remove} -1 relative) # remove any leading "/"" string(REGEX REPLACE "^/(.)" "\\1" _CCCL_RELATIVE_LIBDIR "${relative}") elseif(_CCCL_RELATIVE_LIBDIR MATCHES "^/") message(FATAL_ERROR "CMAKE_INSTALL_LIBDIR ('${CMAKE_INSTALL_LIBDIR}') must be a relative path or an absolute path under CMAKE_INSTALL_PREFIX ('${CMAKE_INSTALL_PREFIX}')") endif() set(install_location "${_CCCL_RELATIVE_LIBDIR}/cmake/cub") # Transform to a list of directories, replace each directory with "../" # and convert back to a string string(REGEX REPLACE "/" ";" from_install_prefix "${install_location}") 
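# Worked example (assuming the default CMAKE_INSTALL_LIBDIR of "lib"):
# install_location is "lib/cmake/cub", the REPLACE above turns it into the list
# "lib;cmake;cub", the TRANSFORM below maps every element to "../", and the JOIN
# yields "../../../", i.e. the relative path from the installed cmake directory
# back up to the install prefix.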
list(TRANSFORM from_install_prefix REPLACE ".+" "../") list(JOIN from_install_prefix "" from_install_prefix) configure_file("${CUB_SOURCE_DIR}/cub/cmake/cub-header-search.cmake.in" "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" @ONLY) install(FILES "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" DESTINATION "${install_location}") cccl-2.5.0/cub/cmake/CubUtilities.cmake000066400000000000000000000020101463375617100176700ustar00rootroot00000000000000# cub_configure_cuda_target( RDC ) # # Configures `target_name` with the appropriate CUDA architectures and RDC state. function(cub_configure_cuda_target target_name) set(options) set(one_value_args RDC) set(multi_value_args) cmake_parse_arguments(cub_cuda "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if (cub_cuda_UNPARSED_ARGUMENTS) message(AUTHOR_WARNING "Unrecognized arguments passed to cub_configure_cuda_target: " ${cub_cuda_UNPARSED_ARGUMENTS}) endif() if (NOT DEFINED cub_cuda_RDC) message(AUTHOR_WARNING "RDC option required for cub_configure_cuda_target.") endif() if (cub_cuda_RDC) set_target_properties(${target_name} PROPERTIES CUDA_ARCHITECTURES "${CUB_CUDA_ARCHITECTURES_RDC}" POSITION_INDEPENDENT_CODE ON CUDA_SEPARABLE_COMPILATION ON) else() set_target_properties(${target_name} PROPERTIES CUDA_ARCHITECTURES "${CUB_CUDA_ARCHITECTURES}" CUDA_SEPARABLE_COMPILATION OFF) endif() endfunction() cccl-2.5.0/cub/cmake/header_test.in000066400000000000000000000055741463375617100171220ustar00rootroot00000000000000// This source file checks that: // 1) Header compiles without error. // 2) Common macro collisions with platform/system headers are avoided. // Define CUB_MACRO_CHECK(macro, header), which emits a diagnostic indicating // a potential macro collision and halts. // // Use raw platform checks instead of the CUB_HOST_COMPILER macros since we // don't want to #include any headers other than the one being tested. // // This is only implemented for MSVC/GCC/Clang. #if defined(_MSC_VER) // MSVC // Fake up an error for MSVC #define CUB_MACRO_CHECK_IMPL(msg) \ /* Print message that looks like an error: */ \ __pragma(message(__FILE__ ":" CUB_MACRO_CHECK_IMPL0(__LINE__) \ ": error: " #msg)) \ /* abort compilation due to static_assert or syntax error: */ \ static_assert(false, #msg); #define CUB_MACRO_CHECK_IMPL0(x) CUB_MACRO_CHECK_IMPL1(x) #define CUB_MACRO_CHECK_IMPL1(x) #x #elif defined(__clang__) || defined(__GNUC__) // GCC/clang are easy: #define CUB_MACRO_CHECK_IMPL(msg) CUB_MACRO_CHECK_IMPL0(GCC error #msg) #define CUB_MACRO_CHECK_IMPL0(expr) _Pragma(#expr) #endif // Hacky way to build a string, but it works on all tested platforms. #define CUB_MACRO_CHECK(MACRO, HEADER) \ CUB_MACRO_CHECK_IMPL(Identifier MACRO should not be used from CUB \ headers due to conflicts with HEADER macros.) // complex.h conflicts #define I CUB_MACRO_CHECK('I', complex.h) // windows.h conflicts #define small CUB_MACRO_CHECK('small', windows.h) // We can't enable these checks without breaking some builds -- some standard // library implementations unconditionally `#undef` these macros, which then // causes random failures later. // Leaving these commented out as a warning: Here be dragons. //#define min(...) CUB_MACRO_CHECK('min', windows.h) //#define max(...) CUB_MACRO_CHECK('max', windows.h) #ifdef _WIN32 // On Windows, make sure any include of Windows.h (e.g. 
via NVTX) does not define the checked macros # define WIN32_LEAN_AND_MEAN #endif // _WIN32 // termios.h conflicts (NVIDIA/thrust#1547) #define B0 CUB_MACRO_CHECK("B0", termios.h) #include #if defined(CCCL_DISABLE_BF16_SUPPORT) #if defined(__CUDA_BF16_TYPES_EXIST__) #error CUB should not include cuda_bf16.h when BF16 support is disabled #endif // __CUDA_BF16_TYPES_EXIST__ #endif // CCCL_DISABLE_BF16_SUPPORT #if defined(CCCL_DISABLE_FP16_SUPPORT) #if defined(__CUDA_FP16_TYPES_EXIST__) #error CUB should not include cuda_fp16.h when half support is disabled #endif // __CUDA_FP16_TYPES_EXIST__ #if defined(__CUDA_BF16_TYPES_EXIST__) #error CUB should not include cuda_bf16.h when half support is disabled #endif // __CUDA_BF16_TYPES_EXIST__ #endif // CCCL_DISABLE_FP16_SUPPORT cccl-2.5.0/cub/cub/000077500000000000000000000000001463375617100137615ustar00rootroot00000000000000cccl-2.5.0/cub/cub/agent/000077500000000000000000000000001463375617100150575ustar00rootroot00000000000000cccl-2.5.0/cub/cub/agent/agent_adjacent_difference.cuh000066400000000000000000000207371463375617100226720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template struct AgentAdjacentDifferencePolicy { static constexpr int BLOCK_THREADS = _BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; template struct AgentDifference { using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using BlockLoad = typename cub::BlockLoadType::type; using BlockStore = typename cub::BlockStoreType::type; using BlockAdjacentDifferenceT = cub::BlockAdjacentDifference; union _TempStorage { typename BlockLoad::TempStorage load; typename BlockStore::TempStorage store; typename BlockAdjacentDifferenceT::TempStorage adjacent_difference; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE; static constexpr int SHARED_MEMORY_SIZE = static_cast(sizeof(TempStorage)); _TempStorage& temp_storage; InputIteratorT input_it; LoadIt load_it; InputT* first_tile_previous; OutputIteratorT result; DifferenceOpT difference_op; OffsetT num_items; _CCCL_DEVICE _CCCL_FORCEINLINE AgentDifference( TempStorage& temp_storage, InputIteratorT input_it, InputT* first_tile_previous, OutputIteratorT result, DifferenceOpT difference_op, OffsetT num_items) : temp_storage(temp_storage.Alias()) , input_it(input_it) , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it)) , first_tile_previous(first_tile_previous) , result(result) , difference_op(difference_op) , num_items(num_items) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile_impl(int num_remaining, int tile_idx, OffsetT tile_base) { InputT input[ITEMS_PER_THREAD]; OutputT output[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoad(temp_storage.load).Load(load_it + tile_base, input, num_remaining, *(load_it + tile_base)); } else { BlockLoad(temp_storage.load).Load(load_it + tile_base, input); } CTA_SYNC(); if (ReadLeft) { if (IS_FIRST_TILE) { if (IS_LAST_TILE) { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractLeftPartialTile(input, output, difference_op, num_remaining); } else { BlockAdjacentDifferenceT(temp_storage.adjacent_difference).SubtractLeft(input, output, difference_op); } } else { InputT tile_prev_input = MayAlias ? 
first_tile_previous[tile_idx] : *(input_it + tile_base - 1); if (IS_LAST_TILE) { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractLeftPartialTile(input, output, difference_op, num_remaining, tile_prev_input); } else { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractLeft(input, output, difference_op, tile_prev_input); } } } else { if (IS_LAST_TILE) { BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractRightPartialTile(input, output, difference_op, num_remaining); } else { InputT tile_next_input = MayAlias ? first_tile_previous[tile_idx] : *(input_it + tile_base + ITEMS_PER_TILE); BlockAdjacentDifferenceT(temp_storage.adjacent_difference) .SubtractRight(input, output, difference_op, tile_next_input); } } CTA_SYNC(); if (IS_LAST_TILE) { BlockStore(temp_storage.store).Store(result + tile_base, output, num_remaining); } else { BlockStore(temp_storage.store).Store(result + tile_base, output); } } template _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(int num_remaining, int tile_idx, OffsetT tile_base) { if (tile_idx == 0) { consume_tile_impl(num_remaining, tile_idx, tile_base); } else { consume_tile_impl(num_remaining, tile_idx, tile_base); } } _CCCL_DEVICE _CCCL_FORCEINLINE void Process(int tile_idx, OffsetT tile_base) { OffsetT num_remaining = num_items - tile_base; if (num_remaining > ITEMS_PER_TILE) // not a last tile { consume_tile(num_remaining, tile_idx, tile_base); } else { consume_tile(num_remaining, tile_idx, tile_base); } } }; template struct AgentDifferenceInit { static constexpr int BLOCK_THREADS = 128; static _CCCL_DEVICE _CCCL_FORCEINLINE void Process(int tile_idx, InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile) { OffsetT tile_base = static_cast(tile_idx) * items_per_tile; if (tile_base > 0 && tile_idx < num_tiles) { if (ReadLeft) { result[tile_idx] = first[tile_base - 1]; } else { result[tile_idx - 1] = first[tile_base]; } } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_batch_memcpy.cuh000066400000000000000000001413431463375617100213770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentBatchMemcpy implements device-wide copying of a batch of device-accessible * source-buffers to device-accessible destination-buffers. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint4& data_out) { data_out = {aligned_ptr[0], aligned_ptr[1], aligned_ptr[2], aligned_ptr[3]}; if (!PTR_IS_FOUR_BYTE_ALIGNED) { uint32_t tail = aligned_ptr[4]; data_out.x = __funnelshift_r(data_out.x, data_out.y, bit_shift); data_out.y = __funnelshift_r(data_out.y, data_out.z, bit_shift); data_out.z = __funnelshift_r(data_out.z, data_out.w, bit_shift); data_out.w = __funnelshift_r(data_out.w, tail, bit_shift); } } template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint2& data_out) { data_out = {aligned_ptr[0], aligned_ptr[1]}; if (!PTR_IS_FOUR_BYTE_ALIGNED) { uint32_t tail = aligned_ptr[2]; data_out.x = __funnelshift_r(data_out.x, data_out.y, bit_shift); data_out.y = __funnelshift_r(data_out.y, tail, bit_shift); } } template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint32_t& data_out) { data_out = aligned_ptr[0]; if (!PTR_IS_FOUR_BYTE_ALIGNED) { uint32_t tail = aligned_ptr[1]; data_out = __funnelshift_r(data_out, tail, bit_shift); } } /** * @brief Loads data from \p ptr into \p data_out without requiring \p ptr to be aligned. * @note If \p ptr isn't aligned to four bytes, the bytes from the last four-byte aligned address up * to \p ptr are loaded too (but dropped) and, hence, need to be device-accessible. Similarly, if * \p ptr isn't aligned to four bytes, the bytes from `(ptr + sizeof(VectorT))` up to the following * four-byte aligned address are loaded too (but dropped), and, hence, need to be device-accessible. 
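 *
 * Illustrative example: with VectorT = uint32_t and \p ptr equal to a four-byte aligned
 * address plus one, the two aligned words covering the requested bytes are loaded and
 * funnel-shifted right by 8 bits, so \p data_out receives exactly the four bytes
 * starting at \p ptr.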
* * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) * @param ptr The pointer from which the data is supposed to be loaded * @param data_out The vector type that stores the data loaded from \p ptr */ template _CCCL_FORCEINLINE _CCCL_DEVICE void LoadVector(const char* ptr, VectorT& data_out) { const uint32_t offset = reinterpret_cast(ptr) % 4U; const uint32_t* aligned_ptr = reinterpret_cast(ptr - offset); constexpr uint32_t bits_per_byte = 8U; const uint32_t bit_shift = offset * bits_per_byte; // If `ptr` is aligned to four bytes, we can perform a simple uint32_t-aliased load if (offset == 0) { LoadVectorAndFunnelShiftR(aligned_ptr, bit_shift, data_out); } // Otherwise, we need to load extra bytes and perform funnel-shifting else { LoadVectorAndFunnelShiftR(aligned_ptr, bit_shift, data_out); } } /** * @brief Helper data structure to hold information on the byte range for which we can safely * perform vectorized copies. * * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) */ template struct PointerRange { VectorT* out_begin; VectorT* out_end; const char* in_begin; const char* in_end; }; /** * @brief Both `out_start_aligned` and `out_end_aligned` are indices into `out_ptr`. * `out_start_aligned` is the first VectorT-aligned memory location after `out_ptr + 3`. * `out_end_aligned` is the last VectorT-aligned memory location before `out_end - 4`, where out_end * corresponds to one past the last byte to be copied. Bytes between `[out_start_aligned, * out_end_aligned)` will be copied using VectorT. `out_ptr + 3` and `out_end - 4` are used instead * of `out_ptr` and `out_end` to avoid `LoadVector` reading beyond data boundaries. * * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) * @tparam ByteOffsetT Type used to index the bytes within the buffers * @param in_begin Pointer to the beginning of the byte range that shall be copied * @param out_begin Pointer to the beginning of the byte range that shall be copied * @param num_bytes Number of bytes that shall be copied * @return The byte range that can safely be copied using vectorized stores of type VectorT */ template _CCCL_DEVICE _CCCL_FORCEINLINE PointerRange GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes) { // Data type size used for vectorized stores constexpr size_t out_datatype_size = sizeof(VectorT); // Data type size used for type-aliased loads constexpr size_t in_datatype_size = sizeof(uint32_t); // char-aliased ptrs to simplify pointer arithmetic char* out_ptr = reinterpret_cast(out_begin); const char* in_ptr = reinterpret_cast(in_begin); // Number of bytes between the first VectorT-aligned address at or before out_begin and out_begin const uint32_t alignment_offset = reinterpret_cast(out_ptr) % out_datatype_size; // The first VectorT-aligned address before (or at) out_begin char* out_chars_aligned = reinterpret_cast(out_ptr - alignment_offset); // The number of extra bytes preceding `in_ptr` that are loaded but dropped uint32_t in_extra_bytes = reinterpret_cast(in_ptr) % in_datatype_size; // The offset required by `LoadVector`: // If the input pointer is not aligned, we load data from the last aligned address preceding the // pointer. 
That is, loading up to (in_datatype_size-1) bytes before `in_ptr` uint32_t in_offset_req = in_extra_bytes; // Bytes after `out_chars_aligned` to the first VectorT-aligned address at or after `out_begin` uint32_t out_start_aligned = CUB_QUOTIENT_CEILING(in_offset_req + alignment_offset, out_datatype_size) * out_datatype_size; // Compute the beginning of the aligned ranges (output and input pointers) VectorT* out_aligned_begin = reinterpret_cast(out_chars_aligned + out_start_aligned); const char* in_aligned_begin = in_ptr + (reinterpret_cast(out_aligned_begin) - out_ptr); // If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1) // bytes after the last byte that is copied. That is, we always load four bytes up to the next // aligned input address at a time. E.g., if the last byte loaded is one byte past the last // aligned address we'll also load the three bytes after that byte. uint32_t in_extra_bytes_from_aligned = (reinterpret_cast(in_aligned_begin) % in_datatype_size); uint32_t in_end_padding_req = (in_datatype_size - in_extra_bytes_from_aligned) % in_datatype_size; // Bytes after `out_chars_aligned` to the last VectorT-aligned // address at (or before) `out_begin` + `num_bytes` uint32_t out_end_aligned{}; if (in_end_padding_req + alignment_offset > num_bytes) { out_end_aligned = out_start_aligned; } else { out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size * out_datatype_size; } VectorT* out_aligned_end = reinterpret_cast(out_chars_aligned + out_end_aligned); const char* in_aligned_end = in_ptr + (reinterpret_cast(out_aligned_end) - out_ptr); return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end}; } /** * @brief Cooperatively copies \p num_bytes from \p src to \p dest using vectorized stores of type * \p VectorT for addresses within [dest, dest + num_bytes) that are aligned to \p VectorT. A * byte-wise copy is used for byte-ranges that are not aligned to \p VectorT. 
* * @tparam LOGICAL_WARP_SIZE The number of threads cooperaing to copy the data; all threads within * [0, `LOGICAL_WARP_SIZE`) must invoke this method with the same arguments * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) * @tparam ByteOffsetT Type used to index the bytes within the buffers * @param thread_rank The thread rank within the group that cooperates to copy the data must be * within [0, `LOGICAL_WARP_SIZE`) * @param dest Pointer to the memory location to copy to * @param num_bytes Number of bytes to copy * @param src Pointer to the memory location to copy from */ template _CCCL_DEVICE _CCCL_FORCEINLINE void VectorizedCopy(int32_t thread_rank, void* dest, ByteOffsetT num_bytes, const void* src) { char* out_ptr = reinterpret_cast(dest); const char* in_ptr = reinterpret_cast(src); // Gets the byte range that can safely be copied using vectorized stores of type VectorT auto aligned_range = GetAlignedPtrs(src, dest, num_bytes); // If byte range for which we can use vectorized copies is empty -> use byte-wise copies if (aligned_range.out_end <= aligned_range.out_begin) { for (ByteOffsetT ichar = thread_rank; ichar < num_bytes; ichar += LOGICAL_WARP_SIZE) { out_ptr[ichar] = in_ptr[ichar]; } } else { // Copy bytes in range `[dest, aligned_range.out_begin)` out_ptr += thread_rank; in_ptr += thread_rank; while (out_ptr < reinterpret_cast(aligned_range.out_begin)) { *out_ptr = *in_ptr; out_ptr += LOGICAL_WARP_SIZE; in_ptr += LOGICAL_WARP_SIZE; } // Copy bytes in range `[aligned_range.out_begin, aligned_range.out_end)` VectorT* aligned_range_begin = aligned_range.out_begin + thread_rank; const char* in_aligned_begin = aligned_range.in_begin + thread_rank * sizeof(VectorT); while (aligned_range_begin < aligned_range.out_end) { VectorT data_in; LoadVector(in_aligned_begin, data_in); *aligned_range_begin = data_in; in_aligned_begin += sizeof(VectorT) * LOGICAL_WARP_SIZE; aligned_range_begin += LOGICAL_WARP_SIZE; } // Copy bytes in range `[aligned_range.out_end, dest + num_bytes)`. 
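    // This tail lies outside the VectorT-aligned range, so, like the prologue above, it is
    // copied one byte per thread with a LOGICAL_WARP_SIZE stride.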
out_ptr = reinterpret_cast(aligned_range.out_end) + thread_rank; in_ptr = aligned_range.in_end + thread_rank; while (out_ptr < reinterpret_cast(dest) + num_bytes) { *out_ptr = *in_ptr; out_ptr += LOGICAL_WARP_SIZE; in_ptr += LOGICAL_WARP_SIZE; } } } template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void copy_items(InputBufferT input_buffer, OutputBufferT output_buffer, OffsetT num_bytes, OffsetT offset = 0) { VectorizedCopy( threadIdx.x % LOGICAL_WARP_SIZE, &reinterpret_cast(output_buffer)[offset], num_bytes, &reinterpret_cast(input_buffer)[offset]); } template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void copy_items(InputBufferT input_buffer, OutputBufferT output_buffer, OffsetT num_items, OffsetT offset = 0) { output_buffer += offset; input_buffer += offset; for (OffsetT i = threadIdx.x % LOGICAL_WARP_SIZE; i < num_items; i += LOGICAL_WARP_SIZE) { *(output_buffer + i) = *(input_buffer + i); } } template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE AliasT read_item(InputIt buffer_src, OffsetT offset) { return *(reinterpret_cast(buffer_src) + offset); } template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE AliasT read_item(InputIt buffer_src, OffsetT offset) { return *(buffer_src + offset); } template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value) { *(reinterpret_cast(buffer_dst) + offset) = value; } template ::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value) { *(buffer_dst + offset) = value; } /** * @brief A helper class that allows threads to maintain multiple counters, where the counter that * shall be incremented can be addressed dynamically without incurring register spillage. * * @tparam NUM_ITEMS The number of counters to allocate * @tparam MAX_ITEM_VALUE The maximum count that must be supported. * @tparam PREFER_POW2_BITS Whether the number of bits to dedicate to each counter should be a * power-of-two. If enabled, this allows replacing integer multiplication with a bit-shift in * exchange for higher register pressure. * @tparam BackingUnitT The data type that is used to provide the bits of all the counters that * shall be allocated. */ template class BitPackedCounter { private: /// The minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] static constexpr uint32_t MIN_BITS_PER_ITEM = (MAX_ITEM_VALUE == 0U) ? 1U : cub::Log2(MAX_ITEM_VALUE + 1U)>::VALUE; /// The number of bits allocated for each item. For pre-Volta, we prefer a power-of-2 here to /// have the compiler replace costly integer multiplication with bit-shifting. static constexpr uint32_t BITS_PER_ITEM = PREFER_POW2_BITS ? (0x01ULL << (cub::Log2(MIN_BITS_PER_ITEM)>::VALUE)) : MIN_BITS_PER_ITEM; /// The number of bits that each backing data type can store static constexpr uint32_t NUM_BITS_PER_UNIT = sizeof(BackingUnitT) * 8; /// The number of items that each backing data type can store static constexpr uint32_t ITEMS_PER_UNIT = NUM_BITS_PER_UNIT / BITS_PER_ITEM; /// The number of bits the backing data type is actually making use of static constexpr uint32_t USED_BITS_PER_UNIT = ITEMS_PER_UNIT * BITS_PER_ITEM; /// The number of backing data types required to store the given number of items static constexpr uint32_t NUM_TOTAL_UNITS = CUB_QUOTIENT_CEILING(NUM_ITEMS, ITEMS_PER_UNIT); /// This is the net number of bit-storage provided by each unit (remainder bits are unused) static constexpr uint32_t UNIT_MASK = (USED_BITS_PER_UNIT >= (8U * sizeof(uint32_t))) ? 
0xFFFFFFFF : (0x01U << USED_BITS_PER_UNIT) - 1; /// This is the bit-mask for each item static constexpr uint32_t ITEM_MASK = (BITS_PER_ITEM >= (8U * sizeof(uint32_t))) ? 0xFFFFFFFF : (0x01U << BITS_PER_ITEM) - 1; //------------------------------------------------------------------------------ // ACCESSORS //------------------------------------------------------------------------------ public: _CCCL_DEVICE _CCCL_FORCEINLINE uint32_t Get(uint32_t index) const { const uint32_t target_offset = index * BITS_PER_ITEM; uint32_t val = 0; #pragma unroll for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) { // In case the bit-offset of the counter at is larger than the bit range of the // current unit, the bit_shift amount will be larger than the bits provided by this unit. As // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width, // we use the PTX instruction `shr` to make sure behaviour is well-defined. // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped. const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT; val |= detail::LogicShiftRight(data[i], bit_shift) & ITEM_MASK; } return val; } _CCCL_DEVICE _CCCL_FORCEINLINE void Add(uint32_t index, uint32_t value) { const uint32_t target_offset = index * BITS_PER_ITEM; #pragma unroll for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) { // In case the bit-offset of the counter at is larger than the bit range of the // current unit, the bit_shift amount will be larger than the bits provided by this unit. As // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width, // we use the PTX instruction `shl` to make sure behaviour is well-defined. // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped. const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT; data[i] += detail::LogicShiftLeft(value, bit_shift) & UNIT_MASK; } } _CCCL_DEVICE BitPackedCounter operator+(const BitPackedCounter& rhs) const { BitPackedCounter result; #pragma unroll for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) { result.data[i] = data[i] + rhs.data[i]; } return result; } //------------------------------------------------------------------------------ // MEMBER VARIABLES //------------------------------------------------------------------------------ private: BackingUnitT data[NUM_TOTAL_UNITS] = {}; }; /** * Parameterizable tuning policy type for AgentBatchMemcpy */ template struct AgentBatchMemcpyPolicy { /// Threads per thread block static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS; /// Items per thread (per tile of input) static constexpr uint32_t BUFFERS_PER_THREAD = _BUFFERS_PER_THREAD; /// The number of bytes that each thread will work on with each iteration of reading in bytes /// from one or more // source-buffers and writing them out to the respective destination-buffers. 
static constexpr uint32_t TLEV_BYTES_PER_THREAD = _TLEV_BYTES_PER_THREAD; /// Whether the BitPackedCounter should prefer allocating a power-of-2 number of bits per /// counter static constexpr uint32_t PREFER_POW2_BITS = _PREFER_POW2_BITS; /// BLEV tile size granularity static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE = _BLOCK_LEVEL_TILE_SIZE; static constexpr uint32_t WARP_LEVEL_THRESHOLD = _WARP_LEVEL_THRESHOLD; static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = _BLOCK_LEVEL_THRESHOLD; using buff_delay_constructor = BuffDelayConstructor; using block_delay_constructor = BlockDelayConstructor; }; template class AgentBatchMemcpy { private: //--------------------------------------------------------------------- // CONFIGS / CONSTANTS //--------------------------------------------------------------------- // Tuning policy-based configurations static constexpr uint32_t BLOCK_THREADS = AgentMemcpySmallBuffersPolicyT::BLOCK_THREADS; static constexpr uint32_t BUFFERS_PER_THREAD = AgentMemcpySmallBuffersPolicyT::BUFFERS_PER_THREAD; static constexpr uint32_t TLEV_BYTES_PER_THREAD = AgentMemcpySmallBuffersPolicyT::TLEV_BYTES_PER_THREAD; static constexpr bool PREFER_POW2_BITS = AgentMemcpySmallBuffersPolicyT::PREFER_POW2_BITS; static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE = AgentMemcpySmallBuffersPolicyT::BLOCK_LEVEL_TILE_SIZE; // Derived configs static constexpr uint32_t BUFFERS_PER_BLOCK = BUFFERS_PER_THREAD * BLOCK_THREADS; static constexpr uint32_t TLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD; static constexpr uint32_t BLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD; static constexpr uint32_t WARP_LEVEL_THRESHOLD = AgentMemcpySmallBuffersPolicyT::WARP_LEVEL_THRESHOLD; static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = AgentMemcpySmallBuffersPolicyT::BLOCK_LEVEL_THRESHOLD; static constexpr uint32_t BUFFER_STABLE_PARTITION = false; // Constants enum : uint32_t { TLEV_SIZE_CLASS = 0, WLEV_SIZE_CLASS, BLEV_SIZE_CLASS, NUM_SIZE_CLASSES, }; //--------------------------------------------------------------------- // TYPE DECLARATIONS //--------------------------------------------------------------------- /// Internal load/store type. For byte-wise memcpy, a single-byte type using AliasT = typename ::cuda::std::conditional, std::iterator_traits>>::type::value_type; /// Types of the input and output buffers using InputBufferT = cub::detail::value_t; using OutputBufferT = cub::detail::value_t; /// Type that has to be sufficiently large to hold any of the buffers' sizes. /// The BufferSizeIteratorT's value type must be convertible to this type. using BufferSizeT = cub::detail::value_t; /// Type used to index into the tile of buffers that this thread block is assigned to. using BlockBufferOffsetT = uint16_t; /// Internal type used to index into the bytes of and represent size of a TLEV buffer using TLevBufferSizeT = uint16_t; /** * @brief Helper struct to simplify BlockExchange within a single four-byte word */ struct ZippedTLevByteAssignment { // The buffer id within this tile BlockBufferOffsetT tile_buffer_id; // Byte-offset within that buffer TLevBufferSizeT buffer_byte_offset; }; /** * POD to keep track of pairs after having partitioned this tile's * buffers by their size. 
*/ struct BufferTuple { // Size is only valid (and relevant) for buffers that are use thread-level collaboration TLevBufferSizeT size; // The buffer id relativ to this tile (i.e., the buffer id within this tile) BlockBufferOffsetT buffer_id; }; // Load buffers in a striped arrangement if we do not want to performa a stable partitioning into // small, medium, and large buffers, otherwise load them in a blocked arrangement using BufferLoadT = BlockLoad(BLOCK_THREADS), static_cast(BUFFERS_PER_THREAD), BUFFER_STABLE_PARTITION ? BLOCK_LOAD_WARP_TRANSPOSE : BLOCK_LOAD_STRIPED>; // A vectorized counter that will count the number of buffers that fall into each of the // size-classes. Where the size class representes the collaboration level that is required to // process a buffer. The collaboration level being either: //-> (1) TLEV (thread-level collaboration), requiring one or multiple threads but not a FULL warp // to collaborate //-> (2) WLEV (warp-level collaboration), requiring a full warp to collaborate on a buffer //-> (3) BLEV (block-level collaboration), requiring one or multiple thread blocks to collaborate // on a buffer */ using VectorizedSizeClassCounterT = BitPackedCounter; // Block-level scan used to compute the write offsets using BlockSizeClassScanT = cub::BlockScan(BLOCK_THREADS)>; // using BlockBLevTileCountScanT = cub::BlockScan(BLOCK_THREADS)>; // Block-level run-length decode algorithm to evenly distribute work of all buffers requiring // thread-level collaboration using BlockRunLengthDecodeT = cub::BlockRunLengthDecode(BLOCK_THREADS), static_cast(TLEV_BUFFERS_PER_THREAD), static_cast(TLEV_BYTES_PER_THREAD)>; using BlockExchangeTLevT = cub::BlockExchange(BLOCK_THREADS), static_cast(TLEV_BYTES_PER_THREAD)>; using BLevBuffScanPrefixCallbackOpT = TilePrefixCallbackOp; using BLevBlockScanPrefixCallbackOpT = TilePrefixCallbackOp; //----------------------------------------------------------------------------- // SHARED MEMORY DECLARATIONS //----------------------------------------------------------------------------- struct _TempStorage { union { typename BufferLoadT::TempStorage load_storage; // Stage 1: histogram over the size classes in preparation for partitioning buffers by size typename BlockSizeClassScanT::TempStorage size_scan_storage; // Stage 2: Communicate the number ofer buffers requiring block-level collaboration typename BLevBuffScanPrefixCallbackOpT::TempStorage buffer_scan_callback; // Stage 3; batch memcpy buffers that require only thread-level collaboration struct { BufferTuple buffers_by_size_class[BUFFERS_PER_BLOCK]; // Stage 3.1: Write buffers requiring block-level collaboration to queue union { struct { typename BLevBlockScanPrefixCallbackOpT::TempStorage block_scan_callback; typename BlockBLevTileCountScanT::TempStorage block_scan_storage; } blev; // Stage 3.3: run-length decode & block exchange for tlev // rld_state needs to be persistent across loop iterations (RunLengthDecode calls) and, // hence, cannot alias block_exchange_storage struct { typename BlockRunLengthDecodeT::TempStorage rld_state; typename BlockExchangeTLevT::TempStorage block_exchange_storage; } tlev; }; } staged; }; BufferOffsetT blev_buffer_offset; }; //----------------------------------------------------------------------------- // PUBLIC TYPE MEMBERS //----------------------------------------------------------------------------- public: struct TempStorage : Uninitialized<_TempStorage> {}; //----------------------------------------------------------------------------- // PRIVATE MEMBER 
FUNCTIONS //----------------------------------------------------------------------------- private: /// Shared storage reference _TempStorage& temp_storage; /** * @brief Loads this tile's buffers' sizes, without any guards (i.e., out-of-bounds checks) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadBufferSizesFullTile(BufferSizeIteratorT tile_buffer_sizes_it, BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD]) { BufferLoadT(temp_storage.load_storage).Load(tile_buffer_sizes_it, buffer_sizes); } /** * @brief Loads this tile's buffers' sizes, making sure to read at most \p num_valid items. */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadBufferSizesPartialTile( BufferSizeIteratorT tile_buffer_sizes_it, BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD], BufferOffsetT num_valid) { // Out-of-bounds buffer items are initialized to '0', so those buffers will simply be ignored // later on constexpr BufferSizeT OOB_DEFAULT_BUFFER_SIZE = 0U; BufferLoadT(temp_storage.load_storage).Load(tile_buffer_sizes_it, buffer_sizes, num_valid, OOB_DEFAULT_BUFFER_SIZE); } /** * @brief Computes the histogram over the number of buffers belonging to each of the three * size-classes (TLEV, WLEV, BLEV). */ _CCCL_DEVICE _CCCL_FORCEINLINE VectorizedSizeClassCounterT GetBufferSizeClassHistogram(const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD]) { VectorizedSizeClassCounterT vectorized_counters{}; #pragma unroll for (uint32_t i = 0; i < BUFFERS_PER_THREAD; i++) { // Whether to increment ANY of the buffer size classes at all const uint32_t increment = buffer_sizes[i] > 0 ? 1U : 0U; // Identify the buffer's size class uint32_t buffer_size_class = 0; buffer_size_class += buffer_sizes[i] > WARP_LEVEL_THRESHOLD ? 1U : 0U; buffer_size_class += buffer_sizes[i] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U; // Increment the count of the respective size class vectorized_counters.Add(buffer_size_class, increment); } return vectorized_counters; } /** * @brief Scatters the buffers into the respective buffer's size-class partition. */ _CCCL_DEVICE _CCCL_FORCEINLINE void PartitionBuffersBySize( const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD], VectorizedSizeClassCounterT& vectorized_offsets, BufferTuple (&buffers_by_size_class)[BUFFERS_PER_BLOCK]) { // If we intend to perform a stable partitioning, the thread's buffer are in a blocked // arrangement, otherwise they are in a striped arrangement BlockBufferOffsetT buffer_id = BUFFER_STABLE_PARTITION ? (BUFFERS_PER_THREAD * threadIdx.x) : (threadIdx.x); constexpr BlockBufferOffsetT BUFFER_STRIDE = BUFFER_STABLE_PARTITION ? static_cast(1) : static_cast(BLOCK_THREADS); #pragma unroll for (uint32_t i = 0; i < BUFFERS_PER_THREAD; i++) { if (buffer_sizes[i] > 0) { uint32_t buffer_size_class = 0; buffer_size_class += buffer_sizes[i] > WARP_LEVEL_THRESHOLD ? 1U : 0U; buffer_size_class += buffer_sizes[i] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U; const uint32_t write_offset = vectorized_offsets.Get(buffer_size_class); buffers_by_size_class[write_offset] = {static_cast(buffer_sizes[i]), buffer_id}; vectorized_offsets.Add(buffer_size_class, 1U); } buffer_id += BUFFER_STRIDE; } } /** * @brief Read in all the buffers that require block-level collaboration and put them to a queue * that will get picked up in a separate, subsequent kernel. 
*/ _CCCL_DEVICE _CCCL_FORCEINLINE void EnqueueBLEVBuffers( BufferTuple* buffers_by_size_class, InputBufferIt tile_buffer_srcs, OutputBufferIt tile_buffer_dsts, BufferSizeIteratorT tile_buffer_sizes, BlockBufferOffsetT num_blev_buffers, BufferOffsetT tile_buffer_offset, BufferOffsetT tile_id) { BlockOffsetT block_offset[BLEV_BUFFERS_PER_THREAD]; // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) uint32_t blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; #pragma unroll for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++) { if (blev_buffer_offset < num_blev_buffers) { BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id; block_offset[i] = CUB_QUOTIENT_CEILING(tile_buffer_sizes[tile_buffer_id], BLOCK_LEVEL_TILE_SIZE); } else { // Out-of-bounds buffers are assigned a tile count of '0' block_offset[i] = 0U; } blev_buffer_offset++; } if (tile_id == 0) { BlockOffsetT block_aggregate; BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) .ExclusiveSum(block_offset, block_offset, block_aggregate); if (threadIdx.x == 0) { blev_block_scan_state.SetInclusive(0, block_aggregate); } } else { BLevBlockScanPrefixCallbackOpT blev_tile_prefix_op( blev_block_scan_state, temp_storage.staged.blev.block_scan_callback, Sum(), tile_id); BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) .ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op); } CTA_SYNC(); // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; #pragma unroll for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++) { if (blev_buffer_offset < num_blev_buffers) { BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id; blev_buffer_srcs[tile_buffer_offset + blev_buffer_offset] = tile_buffer_srcs[tile_buffer_id]; blev_buffer_dsts[tile_buffer_offset + blev_buffer_offset] = tile_buffer_dsts[tile_buffer_id]; blev_buffer_sizes[tile_buffer_offset + blev_buffer_offset] = tile_buffer_sizes[tile_buffer_id]; blev_buffer_tile_offsets[tile_buffer_offset + blev_buffer_offset] = block_offset[i]; blev_buffer_offset++; } } } /** * @brief Read in all the buffers of this tile that require warp-level collaboration and copy * their bytes to the corresponding destination buffer */ _CCCL_DEVICE _CCCL_FORCEINLINE void BatchMemcpyWLEVBuffers( BufferTuple* buffers_by_size_class, InputBufferIt tile_buffer_srcs, OutputBufferIt tile_buffer_dsts, BufferSizeIteratorT tile_buffer_sizes, BlockBufferOffsetT num_wlev_buffers) { const int32_t warp_id = threadIdx.x / CUB_PTX_WARP_THREADS; constexpr uint32_t WARPS_PER_BLOCK = BLOCK_THREADS / CUB_PTX_WARP_THREADS; for (BlockBufferOffsetT buffer_offset = warp_id; buffer_offset < num_wlev_buffers; buffer_offset += WARPS_PER_BLOCK) { const auto buffer_id = buffers_by_size_class[buffer_offset].buffer_id; copy_items( tile_buffer_srcs[buffer_id], tile_buffer_dsts[buffer_id], tile_buffer_sizes[buffer_id]); } } /** * @brief Read in all the buffers of this tile that require thread-level collaboration and copy * their bytes to the corresponding destination buffer */ _CCCL_DEVICE _CCCL_FORCEINLINE void BatchMemcpyTLEVBuffers( BufferTuple* buffers_by_size_class, InputBufferIt tile_buffer_srcs, OutputBufferIt tile_buffer_dsts, BlockBufferOffsetT num_tlev_buffers) { // Read in the buffers' ids that require thread-level collaboration (where buffer id is the // buffer within this tile) 
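// Illustrative worked example (all sizes hypothetical): suppose a thread was assigned the TLEV
// buffer ids {5, 9} with byte counts {3, 2}. BlockRunLengthDecode treats each (id, size) pair as
// a run and decodes it into the per-byte assignments
//   (buffer 5, byte 0), (buffer 5, byte 1), (buffer 5, byte 2), (buffer 9, byte 0), (buffer 9, byte 1)
// which are then spread over the whole thread block, so each thread ends up copying at most
// TLEV_BYTES_PER_THREAD bytes per decoded window, regardless of how unevenly the buffer sizes
// were distributed across threads.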
BlockBufferOffsetT tlev_buffer_ids[TLEV_BUFFERS_PER_THREAD]; TLevBufferSizeT tlev_buffer_sizes[TLEV_BUFFERS_PER_THREAD]; // Currently we do not go over the TLEV buffers in multiple iterations, so we need to make sure // we are able to be covered for the case that all our buffers are TLEV buffers static_assert(TLEV_BUFFERS_PER_THREAD >= BUFFERS_PER_THREAD, "Unsupported confiugraiton: The number of 'thread-level buffers' must be at " "least as large as the number of overall buffers being processed by each " "thread."); // Read in the TLEV buffer partition (i.e., the buffers that require thread-level collaboration) uint32_t tlev_buffer_offset = threadIdx.x * TLEV_BUFFERS_PER_THREAD; // Pre-populate the buffer sizes to 0 (i.e. zero-padding towards the end) to ensure // out-of-bounds TLEV buffers will not be considered #pragma unroll for (uint32_t i = 0; i < TLEV_BUFFERS_PER_THREAD; i++) { tlev_buffer_sizes[i] = 0; } // Assign TLEV buffers in a blocked arrangement (each thread is assigned consecutive TLEV // buffers) #pragma unroll for (uint32_t i = 0; i < TLEV_BUFFERS_PER_THREAD; i++) { if (tlev_buffer_offset < num_tlev_buffers) { tlev_buffer_ids[i] = buffers_by_size_class[tlev_buffer_offset].buffer_id; tlev_buffer_sizes[i] = buffers_by_size_class[tlev_buffer_offset].size; } tlev_buffer_offset++; } // Evenly distribute all the bytes that have to be copied from all the buffers that require // thread-level collaboration using BlockRunLengthDecode uint32_t num_total_tlev_bytes = 0U; BlockRunLengthDecodeT block_run_length_decode( temp_storage.staged.tlev.rld_state, tlev_buffer_ids, tlev_buffer_sizes, num_total_tlev_bytes); // Run-length decode the buffers' sizes into a window buffer of limited size. This is repeated // until we were able to cover all the bytes of TLEV buffers uint32_t decoded_window_offset = 0U; while (decoded_window_offset < num_total_tlev_bytes) { BlockBufferOffsetT buffer_id[TLEV_BYTES_PER_THREAD]; TLevBufferSizeT buffer_byte_offset[TLEV_BYTES_PER_THREAD]; // Now we have a balanced assignment: buffer_id[i] will hold the tile's buffer id and // buffer_byte_offset[i] that buffer's byte that this thread supposed to copy block_run_length_decode.RunLengthDecode(buffer_id, buffer_byte_offset, decoded_window_offset); // Zip from SoA to AoS ZippedTLevByteAssignment zipped_byte_assignment[TLEV_BYTES_PER_THREAD]; #pragma unroll for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) { zipped_byte_assignment[i] = {buffer_id[i], buffer_byte_offset[i]}; } // Exchange from blocked to striped arrangement for coalesced memory reads and writes BlockExchangeTLevT(temp_storage.staged.tlev.block_exchange_storage) .BlockedToStriped(zipped_byte_assignment, zipped_byte_assignment); // Read in the bytes that this thread is assigned to constexpr uint32_t WINDOW_SIZE = (TLEV_BYTES_PER_THREAD * BLOCK_THREADS); const bool is_full_window = decoded_window_offset + WINDOW_SIZE < num_total_tlev_bytes; if (is_full_window) { uint32_t absolute_tlev_byte_offset = decoded_window_offset + threadIdx.x; AliasT src_byte[TLEV_BYTES_PER_THREAD]; #pragma unroll for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) { src_byte[i] = read_item( tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id], zipped_byte_assignment[i].buffer_byte_offset); absolute_tlev_byte_offset += BLOCK_THREADS; } #pragma unroll for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) { write_item( tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id], zipped_byte_assignment[i].buffer_byte_offset, src_byte[i]); } } else { uint32_t 
absolute_tlev_byte_offset = decoded_window_offset + threadIdx.x; #pragma unroll for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) { if (absolute_tlev_byte_offset < num_total_tlev_bytes) { const AliasT src_byte = read_item( tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id], zipped_byte_assignment[i].buffer_byte_offset); write_item( tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id], zipped_byte_assignment[i].buffer_byte_offset, src_byte); } absolute_tlev_byte_offset += BLOCK_THREADS; } } decoded_window_offset += WINDOW_SIZE; // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused // with next iteration CTA_SYNC(); } } //----------------------------------------------------------------------------- // PUBLIC MEMBER FUNCTIONS //----------------------------------------------------------------------------- public: _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(BufferOffsetT tile_id) { // Offset into this tile's buffers BufferOffsetT buffer_offset = tile_id * BUFFERS_PER_BLOCK; // Indicates whether all of this tiles items are within bounds bool is_full_tile = buffer_offset + BUFFERS_PER_BLOCK < num_buffers; // Load the buffer sizes of this tile's buffers BufferSizeIteratorT tile_buffer_sizes_it = buffer_sizes_it + buffer_offset; BufferSizeT buffer_sizes[BUFFERS_PER_THREAD]; if (is_full_tile) { LoadBufferSizesFullTile(tile_buffer_sizes_it, buffer_sizes); } else { LoadBufferSizesPartialTile(tile_buffer_sizes_it, buffer_sizes, num_buffers - buffer_offset); } // Ensure we can repurpose the BlockLoad's temporary storage CTA_SYNC(); // Count how many buffers fall into each size-class VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes); // Compute the prefix sum over the histogram VectorizedSizeClassCounterT size_class_agg = {}; BlockSizeClassScanT(temp_storage.size_scan_storage) .ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg); // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids CTA_SYNC(); // Factor in the per-size-class counts / offsets // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset // has to be offset by the TLEV+WLEV buffer count uint32_t buffer_count = 0U; for (uint32_t i = 0; i < NUM_SIZE_CLASSES; i++) { size_class_histogram.Add(i, buffer_count); buffer_count += size_class_agg.Get(i); } // Signal the number of BLEV buffers we're planning to write out BufferOffsetT buffer_exclusive_prefix = 0; if (tile_id == 0) { if (threadIdx.x == 0) { blev_buffer_scan_state.SetInclusive(tile_id, size_class_agg.Get(BLEV_SIZE_CLASS)); } buffer_exclusive_prefix = 0; } else { BLevBuffScanPrefixCallbackOpT blev_buffer_prefix_op( blev_buffer_scan_state, temp_storage.buffer_scan_callback, Sum(), tile_id); // Signal our partial prefix and wait for the inclusive prefix of previous tiles if (threadIdx.x < CUB_PTX_WARP_THREADS) { buffer_exclusive_prefix = blev_buffer_prefix_op(size_class_agg.Get(BLEV_SIZE_CLASS)); } } if (threadIdx.x == 0) { temp_storage.blev_buffer_offset = buffer_exclusive_prefix; } // Ensure the prefix callback has finished using its temporary storage and that it can be reused // in the next stage CTA_SYNC(); // Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their // size PartitionBuffersBySize(buffer_sizes, size_class_histogram, temp_storage.staged.buffers_by_size_class); // Ensure all buffers have been partitioned by their size class AND // ensure that 
blev_buffer_offset has been written to shared memory CTA_SYNC(); // TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset; OutputBufferIt tile_buffer_dsts = output_buffer_it + buffer_offset; BufferSizeIteratorT tile_buffer_sizes = buffer_sizes_it + buffer_offset; // Copy block-level buffers EnqueueBLEVBuffers( &temp_storage.staged .buffers_by_size_class[size_class_agg.Get(TLEV_SIZE_CLASS) + size_class_agg.Get(WLEV_SIZE_CLASS)], tile_buffer_srcs, tile_buffer_dsts, tile_buffer_sizes, size_class_agg.Get(BLEV_SIZE_CLASS), temp_storage.blev_buffer_offset, tile_id); // Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers CTA_SYNC(); // Copy warp-level buffers BatchMemcpyWLEVBuffers( &temp_storage.staged.buffers_by_size_class[size_class_agg.Get(TLEV_SIZE_CLASS)], tile_buffer_srcs, tile_buffer_dsts, tile_buffer_sizes, size_class_agg.Get(WLEV_SIZE_CLASS)); // Perform batch memcpy for all the buffers that require thread-level collaboration uint32_t num_tlev_buffers = size_class_agg.Get(TLEV_SIZE_CLASS); BatchMemcpyTLEVBuffers( temp_storage.staged.buffers_by_size_class, tile_buffer_srcs, tile_buffer_dsts, num_tlev_buffers); } //----------------------------------------------------------------------------- // CONSTRUCTOR //----------------------------------------------------------------------------- _CCCL_DEVICE _CCCL_FORCEINLINE AgentBatchMemcpy( TempStorage& temp_storage, InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes_it, BufferOffsetT num_buffers, BlevBufferSrcsOutItT blev_buffer_srcs, BlevBufferDstsOutItT blev_buffer_dsts, BlevBufferSizesOutItT blev_buffer_sizes, BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, BLevBufferOffsetTileState blev_buffer_scan_state, BLevBlockOffsetTileState blev_block_scan_state) : temp_storage(temp_storage.Alias()) , input_buffer_it(input_buffer_it) , output_buffer_it(output_buffer_it) , buffer_sizes_it(buffer_sizes_it) , num_buffers(num_buffers) , blev_buffer_srcs(blev_buffer_srcs) , blev_buffer_dsts(blev_buffer_dsts) , blev_buffer_sizes(blev_buffer_sizes) , blev_buffer_tile_offsets(blev_buffer_tile_offsets) , blev_buffer_scan_state(blev_buffer_scan_state) , blev_block_scan_state(blev_block_scan_state) {} private: // Iterator providing the pointers to the source memory buffers InputBufferIt input_buffer_it; // Iterator providing the pointers to the destination memory buffers OutputBufferIt output_buffer_it; // Iterator providing the number of bytes to be copied for each pair of buffers BufferSizeIteratorT buffer_sizes_it; // The total number of buffer pairs BufferOffsetT num_buffers; // Output iterator to which the source pointers of the BLEV buffers are written BlevBufferSrcsOutItT blev_buffer_srcs; // Output iterator to which the destination pointers of the BLEV buffers are written BlevBufferDstsOutItT blev_buffer_dsts; // Output iterator to which the number of bytes to be copied of the BLEV buffers are written BlevBufferSizesOutItT blev_buffer_sizes; // Output iterator to which the mapping of tiles to BLEV buffers is written BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets; // The single-pass prefix scan's tile state used for tracking the prefix sum over the number of // BLEV buffers BLevBufferOffsetTileState blev_buffer_scan_state; // The single-pass prefix scan's tile state used for tracking the prefix sum over tiles of BLEV // buffers BLevBlockOffsetTileState blev_block_scan_state; }; } // namespace detail 
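// A minimal host-side sketch of how this agent is usually reached through the device-level
// front end (assuming cub::DeviceMemcpy::Batched as the public entry point; array names and
// counts below are illustrative only):
//
//   #include <cub/device/device_memcpy.cuh>
//
//   // d_srcs / d_dsts hold num_buffers device pointers, d_sizes holds their byte counts
//   void* d_temp_storage = nullptr;
//   size_t temp_storage_bytes = 0;
//   cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, d_srcs, d_dsts, d_sizes, num_buffers);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, d_srcs, d_dsts, d_sizes, num_buffers);
//
// The dispatch layer behind that call runs AgentBatchMemcpy to classify buffers into the
// TLEV/WLEV/BLEV size classes and enqueues the block-level (BLEV) buffers for a follow-up kernel.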
CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_for.cuh000066400000000000000000000056501463375617100175320ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace for_each { template struct policy_t { static constexpr int block_threads = BlockThreads; static constexpr int items_per_thread = ItemsPerThread; }; template struct agent_block_striped_t { static constexpr int items_per_thread = PolicyT::items_per_thread; OffsetT tile_base; OpT op; template _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(int items_in_tile, int block_threads) { #pragma unroll for (int item = 0; item < items_per_thread; item++) { const auto idx = static_cast(block_threads * item + threadIdx.x); if (IsFullTile || idx < items_in_tile) { (void) op(tile_base + idx); } } } }; } // namespace for_each } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_histogram.cuh000066400000000000000000000760171463375617100207460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide * histogram . */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy ******************************************************************************/ /** * */ enum BlockHistogramMemoryPreference { GMEM, SMEM, BLEND }; /** * Parameterizable tuning policy type for AgentHistogram * * @tparam _BLOCK_THREADS * Threads per thread block * * @tparam _PIXELS_PER_THREAD * Pixels per thread (per tile of input) * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _RLE_COMPRESS * Whether to perform localized RLE to compress samples before histogramming * * @tparam _MEM_PREFERENCE * Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) * * @tparam _WORK_STEALING * Whether to dequeue tiles from a global work queue * * @tparam _VEC_SIZE * Vector size for samples loading (1, 2, 4) */ template struct AgentHistogramPolicy { enum { /// Threads per thread block BLOCK_THREADS = _BLOCK_THREADS, /// Pixels per thread (per tile of input) PIXELS_PER_THREAD = _PIXELS_PER_THREAD, /// Whether to perform localized RLE to compress samples before histogramming IS_RLE_COMPRESS = _RLE_COMPRESS, /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) MEM_PREFERENCE = _MEM_PREFERENCE, /// Whether to dequeue tiles from a global work queue IS_WORK_STEALING = _WORK_STEALING, }; /// Vector size for samples loading (1, 2, 4) static constexpr int VEC_SIZE = _VEC_SIZE; ///< The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; }; 
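// An illustrative instantiation of the policy above (tuning values are hypothetical; the
// arguments follow the template-parameter order documented in the comment block preceding the
// struct):
//
//   using ExampleHistogramPolicy =
//     AgentHistogramPolicy<384,               // _BLOCK_THREADS
//                          8,                 // _PIXELS_PER_THREAD
//                          BLOCK_LOAD_DIRECT, // _LOAD_ALGORITHM
//                          LOAD_LDG,          // _LOAD_MODIFIER
//                          true,              // _RLE_COMPRESS
//                          BLEND,             // _MEM_PREFERENCE
//                          true,              // _WORK_STEALING
//                          4>;                // _VEC_SIZE
//
// The dispatch layer selects a policy like this per target architecture and hands it to
// AgentHistogram below as its AgentHistogramPolicyT parameter.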
/****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating * in device-wide histogram . * * @tparam AgentHistogramPolicyT * Parameterized AgentHistogramPolicy tuning policy type * * @tparam PRIVATIZED_SMEM_BINS * Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized * counters to be maintained in device-accessible memory. * * @tparam NUM_CHANNELS * Number of channels interleaved in the input data. Supports up to four channels. * * @tparam NUM_ACTIVE_CHANNELS * Number of channels actively being histogrammed * * @tparam SampleIteratorT * Random-access input iterator type for reading samples * * @tparam CounterT * Integer type for counting sample occurrences per histogram bin * * @tparam PrivatizedDecodeOpT * The transform operator type for determining privatized counter indices from samples, one for * each channel * * @tparam OutputDecodeOpT * The transform operator type for determining output bin-ids from privatized counter indices, one * for each channel * * @tparam OffsetT * Signed integer type for global offsets * * @tparam LEGACY_PTX_ARCH * PTX compute capability (unused) */ template struct AgentHistogram { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The sample type of the input iterator using SampleT = cub::detail::value_t; /// The pixel type of SampleT using PixelT = typename CubVector::Type; /// The vec type of SampleT static constexpr int VecSize = AgentHistogramPolicyT::VEC_SIZE; using VecT = typename CubVector::Type; /// Constants enum { BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize, TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? 
AgentHistogramPolicyT::MEM_PREFERENCE : GMEM, IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, }; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; /// Input iterator wrapper type (for applying cache modifier) // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedSampleIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, SampleIteratorT>; /// Pixel input iterator type (for applying cache modifier) typedef CacheModifiedInputIterator WrappedPixelIteratorT; /// Qaud input iterator type (for applying cache modifier) typedef CacheModifiedInputIterator WrappedVecsIteratorT; /// Parameterized BlockLoad type for samples typedef BlockLoad BlockLoadSampleT; /// Parameterized BlockLoad type for pixels typedef BlockLoad BlockLoadPixelT; /// Parameterized BlockLoad type for vecs typedef BlockLoad BlockLoadVecT; /// Shared memory type required by this thread block struct _TempStorage { // Smem needed for block-privatized smem histogram (with 1 word of padding) CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; int tile_idx; // Aliasable storage layout union Aliasable { // Smem needed for loading a tile of samples typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of pixels typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of vecs typename BlockLoadVecT::TempStorage vec_load; } aliasable; }; /// Temporary storage type (unionable) struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- /// Reference to temp_storage _TempStorage& temp_storage; /// Sample input iterator (with cache modifier applied, if possible) WrappedSampleIteratorT d_wrapped_samples; /// Native pointer for input samples (possibly NULL if unavailable) SampleT* d_native_samples; /// The number of output bins for each channel int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; /// The number of privatized bins for each channel int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; /// Reference to gmem privatized histograms for each channel CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; /// Reference to final output histograms (gmem) CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; /// The transform operator for determining privatized counter indices from samples, one for each channel PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; /// Whether to prefer privatized smem counters vs privatized global counters bool prefer_smem; //--------------------------------------------------------------------- // Initialize privatized bin counters //--------------------------------------------------------------------- // Initialize privatized bin counters _CCCL_DEVICE _CCCL_FORCEINLINE void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Initialize histogram bin counts to zeros #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) { 
privatized_histograms[CHANNEL][privatized_bin] = 0; } } // Barrier to make sure all threads are done updating counters CTA_SYNC(); } // Initialize privatized bin counters. Specialized for privatized shared-memory counters _CCCL_DEVICE _CCCL_FORCEINLINE void InitSmemBinCounters() { CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; } InitBinCounters(privatized_histograms); } // Initialize privatized bin counters. Specialized for privatized global-memory counters _CCCL_DEVICE _CCCL_FORCEINLINE void InitGmemBinCounters() { InitBinCounters(d_privatized_histograms); } //--------------------------------------------------------------------- // Update final output histograms //--------------------------------------------------------------------- // Update final output histograms from privatized histograms _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) { // Barrier to make sure all threads are done updating counters CTA_SYNC(); // Apply privatized bin counts to output bin counts #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { int channel_bins = num_privatized_bins[CHANNEL]; for (int privatized_bin = threadIdx.x; privatized_bin < channel_bins; privatized_bin += BLOCK_THREADS) { int output_bin = -1; CounterT count = privatized_histograms[CHANNEL][privatized_bin]; bool is_valid = count > 0; output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); if (output_bin >= 0) { atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); } } } } // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters _CCCL_DEVICE _CCCL_FORCEINLINE void StoreSmemOutput() { CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; } StoreOutput(privatized_histograms); } // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters _CCCL_DEVICE _CCCL_FORCEINLINE void StoreGmemOutput() { StoreOutput(d_privatized_histograms); } //--------------------------------------------------------------------- // Tile accumulation //--------------------------------------------------------------------- // Accumulate pixels. Specialized for RLE compression. _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulatePixels( SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD], CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], Int2Type is_rle_compress) { #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { // Bin pixels int bins[PIXELS_PER_THREAD]; #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) { bins[PIXEL] = -1; privatized_decode_op[CHANNEL].template BinSelect( samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); } CounterT accumulator = 1; #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) { if (bins[PIXEL] != bins[PIXEL + 1]) { if (bins[PIXEL] >= 0) { atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); } accumulator = 0; } accumulator++; } // Last pixel if (bins[PIXELS_PER_THREAD - 1] >= 0) { atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); } } } // Accumulate pixels. 
Specialized for individual accumulation of each pixel. _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulatePixels( SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD], CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], Int2Type is_rle_compress) { #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) { #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { int bin = -1; privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); if (bin >= 0) { atomicAdd(privatized_histograms[CHANNEL] + bin, 1); } } } } /** * Accumulate pixel, specialized for smem privatized histogram */ _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateSmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD]) { CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; } AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); } /** * Accumulate pixel, specialized for gmem privatized histogram */ _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateGmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD]) { AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); } //--------------------------------------------------------------------- // Tile loading //--------------------------------------------------------------------- // Load full, aligned tile using pixel iterator (multi-channel) template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadFullAlignedTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) { typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); // Load using a wrapped pixel iterator BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(d_wrapped_pixels, reinterpret_cast(samples)); } // Load full, aligned tile using vec iterator (single-channel) _CCCL_DEVICE _CCCL_FORCEINLINE void LoadFullAlignedTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type<1> num_active_channels) { typedef VecT AliasedVecs[VECS_PER_THREAD]; WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset)); // Load using a wrapped vec iterator BlockLoadVecT(temp_storage.aliasable.vec_load).Load(d_wrapped_vecs, reinterpret_cast(samples)); } // Load full, aligned tile _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type is_full_tile, Int2Type is_aligned) { LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); } // Load full, mis-aligned tile using sample iterator _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type is_full_tile, Int2Type is_aligned) { typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; // Load using sample iterator BlockLoadSampleT(temp_storage.aliasable.sample_load) .Load(d_wrapped_samples + block_offset, reinterpret_cast(samples)); } // Load partially-full, aligned tile using the pixel iterator _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type 
is_full_tile, Int2Type is_aligned) { typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); int valid_pixels = valid_samples / NUM_CHANNELS; // Load using a wrapped pixel iterator BlockLoadPixelT(temp_storage.aliasable.pixel_load) .Load(d_wrapped_pixels, reinterpret_cast(samples), valid_pixels); } // Load partially-full, mis-aligned tile using sample iterator _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile( OffsetT block_offset, int valid_samples, SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], Int2Type is_full_tile, Int2Type is_aligned) { typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; BlockLoadSampleT(temp_storage.aliasable.sample_load) .Load(d_wrapped_samples + block_offset, reinterpret_cast(samples), valid_samples); } template _CCCL_DEVICE _CCCL_FORCEINLINE void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, Int2Type /* is_striped = false */) { #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) { is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); } } template _CCCL_DEVICE _CCCL_FORCEINLINE void MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, Int2Type /* is_striped = true */) { #pragma unroll for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) { is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x + BLOCK_THREADS * PIXEL) * NUM_CHANNELS) < valid_samples); } } //--------------------------------------------------------------------- // Tile processing //--------------------------------------------------------------------- /** * @brief Consume a tile of data samples * * @tparam IS_ALIGNED * Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel) * * @tparam IS_FULL_TILE Whether the tile is full */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(OffsetT block_offset, int valid_samples) { SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; bool is_valid[PIXELS_PER_THREAD]; // Load tile LoadTile(block_offset, valid_samples, samples, Int2Type(), Int2Type()); // Set valid flags MarkValid( is_valid, valid_samples, Int2Type{}); // Accumulate samples if (prefer_smem) { AccumulateSmemPixels(samples, is_valid); } else { AccumulateGmemPixels(samples, is_valid); } } /** * @brief Consume row tiles. 
Specialized for work-stealing from queue * * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param tiles_per_row * Number of image tiles per row */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue tile_queue, Int2Type is_work_stealing) { int num_tiles = num_rows * tiles_per_row; int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; OffsetT num_even_share_tiles = gridDim.x * gridDim.y; while (tile_idx < num_tiles) { int row = tile_idx / tiles_per_row; int col = tile_idx - (row * tiles_per_row); OffsetT row_offset = row * row_stride_samples; OffsetT col_offset = (col * TILE_SAMPLES); OffsetT tile_offset = row_offset + col_offset; if (col == tiles_per_row - 1) { // Consume a partially-full tile at the end of the row OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; ConsumeTile(tile_offset, num_remaining); } else { // Consume full tile ConsumeTile(tile_offset, TILE_SAMPLES); } CTA_SYNC(); // Get next tile if (threadIdx.x == 0) { temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; } CTA_SYNC(); tile_idx = temp_storage.tile_idx; } } /** * @brief Consume row tiles. Specialized for even-share (striped across thread blocks) * * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param tiles_per_row * Number of image tiles per row */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue tile_queue, Int2Type is_work_stealing) { for (int row = blockIdx.y; row < num_rows; row += gridDim.y) { OffsetT row_begin = row * row_stride_samples; OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); while (tile_offset < row_end) { OffsetT num_remaining = row_end - tile_offset; if (num_remaining < TILE_SAMPLES) { // Consume partial tile ConsumeTile(tile_offset, num_remaining); break; } // Consume full tile ConsumeTile(tile_offset, TILE_SAMPLES); tile_offset += gridDim.x * TILE_SAMPLES; } } } //--------------------------------------------------------------------- // Parameter extraction //--------------------------------------------------------------------- // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) template _CCCL_DEVICE _CCCL_FORCEINLINE SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) { return itr.ptr; } // Return a native pixel pointer (specialized for other types) template _CCCL_DEVICE _CCCL_FORCEINLINE SampleT* NativePointer(IteratorT itr) { return NULL; } //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** * @brief Constructor * * @param temp_storage * Reference to temp_storage * * @param d_samples * Input data to reduce * * @param num_output_bins * The number bins per final output histogram * * @param num_privatized_bins * The number bins per privatized 
histogram * * @param d_output_histograms * Reference to final output histograms * * @param d_privatized_histograms * Reference to privatized histograms * * @param output_decode_op * The transform operator for determining output bin-ids from privatized counter indices, one for each channel * * @param privatized_decode_op * The transform operator for determining privatized counter indices from samples, one for each channel */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram( TempStorage& temp_storage, SampleIteratorT d_samples, int (&num_output_bins)[NUM_ACTIVE_CHANNELS], int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) : temp_storage(temp_storage.Alias()) , d_wrapped_samples(d_samples) , d_native_samples(NativePointer(d_wrapped_samples)) , num_output_bins(num_output_bins) , num_privatized_bins(num_privatized_bins) , d_output_histograms(d_output_histograms) , output_decode_op(output_decode_op) , privatized_decode_op(privatized_decode_op) , prefer_smem((MEM_PREFERENCE == SMEM) ? true : // prefer smem privatized histograms (MEM_PREFERENCE == GMEM) ? false : // prefer gmem privatized histograms blockIdx.x & 1) // prefer blended privatized histograms { int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; // Initialize the locations of this block's privatized histograms for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); } } /** * @brief Consume image * * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param tiles_per_row * Number of image tiles per row * * @param tile_queue * Queue descriptor for assigning tiles of work to thread blocks */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue tile_queue) { // Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel) int vec_mask = AlignBytes::ALIGN_BYTES - 1; int pixel_mask = AlignBytes::ALIGN_BYTES - 1; size_t row_bytes = sizeof(SampleT) * row_stride_samples; bool vec_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % VecSize == 0) && // Single channel ((size_t(d_native_samples) & vec_mask) == 0) && // ptr is quad-aligned ((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the // quad bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel // Whether rows are aligned and can be vectorized if ((d_native_samples != NULL) && (vec_aligned_rows || pixel_aligned_rows)) { ConsumeTiles( num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); } else { ConsumeTiles( num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); } } /** * Initialize privatized bin counters. 
Specialized for privatized shared-memory counters */ _CCCL_DEVICE _CCCL_FORCEINLINE void InitBinCounters() { if (prefer_smem) { InitSmemBinCounters(); } else { InitGmemBinCounters(); } } /** * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters */ _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput() { if (prefer_smem) { StoreSmemOutput(); } else { StoreGmemOutput(); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_merge_sort.cuh000066400000000000000000000600401463375617100211040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template struct AgentMergeSortPolicy { static constexpr int BLOCK_THREADS = _BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; /// \brief This agent is responsible for the initial in-tile sorting. 
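// The merge sort built from the agents in this file proceeds in three phases: AgentBlockSort
// sorts each tile of ITEMS_PER_TILE elements, AgentPartition splits the merge path between pairs
// of sorted runs into equally sized pieces, and AgentMerge repeatedly merges N runs into N/2,
// ping-ponging between the user-provided iterators and temporary "raw" storage until one sorted
// run remains. A minimal host-side sketch of the public entry point assumed to drive these
// agents (cub::DeviceMergeSort; int keys and the array names below are illustrative only):
//
//   #include <cub/device/device_merge_sort.cuh>
//
//   auto compare_op = [] __device__ (int a, int b) { return a < b; };
//   void* d_temp_storage = nullptr;
//   size_t temp_storage_bytes = 0;
//   cub::DeviceMergeSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   cub::DeviceMergeSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op);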
template struct AgentBlockSort { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- static constexpr bool KEYS_ONLY = std::is_same::value; using BlockMergeSortT = BlockMergeSort; using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using BlockLoadKeys = typename cub::BlockLoadType::type; using BlockLoadItems = typename cub::BlockLoadType::type; using BlockStoreKeysIt = typename cub::BlockStoreType::type; using BlockStoreItemsIt = typename cub::BlockStoreType::type; using BlockStoreKeysRaw = typename cub::BlockStoreType::type; using BlockStoreItemsRaw = typename cub::BlockStoreType::type; union _TempStorage { typename BlockLoadKeys::TempStorage load_keys; typename BlockLoadItems::TempStorage load_items; typename BlockStoreKeysIt::TempStorage store_keys_it; typename BlockStoreItemsIt::TempStorage store_items_it; typename BlockStoreKeysRaw::TempStorage store_keys_raw; typename BlockStoreItemsRaw::TempStorage store_items_raw; typename BlockMergeSortT::TempStorage block_merge; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE; //--------------------------------------------------------------------- // Per thread data //--------------------------------------------------------------------- bool ping; _TempStorage& storage; KeysLoadIt keys_in; ItemsLoadIt items_in; OffsetT keys_count; KeyIteratorT keys_out_it; ValueIteratorT items_out_it; KeyT* keys_out_raw; ValueT* items_out_raw; CompareOpT compare_op; _CCCL_DEVICE _CCCL_FORCEINLINE AgentBlockSort( bool ping_, TempStorage& storage_, KeysLoadIt keys_in_, ItemsLoadIt items_in_, OffsetT keys_count_, KeyIteratorT keys_out_it_, ValueIteratorT items_out_it_, KeyT* keys_out_raw_, ValueT* items_out_raw_, CompareOpT compare_op_) : ping(ping_) , storage(storage_.Alias()) , keys_in(keys_in_) , items_in(items_in_) , keys_count(keys_count_) , keys_out_it(keys_out_it_) , items_out_it(items_out_it_) , keys_out_raw(keys_out_raw_) , items_out_raw(items_out_raw_) , compare_op(compare_op_) {} _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { auto tile_idx = static_cast(blockIdx.x); auto num_tiles = static_cast(gridDim.x); auto tile_base = tile_idx * ITEMS_PER_TILE; int items_in_tile = (cub::min)(keys_count - tile_base, int{ITEMS_PER_TILE}); if (tile_idx < num_tiles - 1) { consume_tile(tile_base, ITEMS_PER_TILE); } else { consume_tile(tile_base, items_in_tile); } } template _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(OffsetT tile_base, int num_remaining) { ValueT items_local[ITEMS_PER_THREAD]; if (!KEYS_ONLY) { if (IS_LAST_TILE) { BlockLoadItems(storage.load_items) .Load(items_in + tile_base, items_local, num_remaining, *(items_in + tile_base)); } else { BlockLoadItems(storage.load_items).Load(items_in + tile_base, items_local); } CTA_SYNC(); } KeyT keys_local[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local, num_remaining, *(keys_in + tile_base)); } else { BlockLoadKeys(storage.load_keys).Load(keys_in + tile_base, keys_local); } CTA_SYNC(); if (IS_LAST_TILE) { BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op, 
num_remaining, keys_local[0]); } else { BlockMergeSortT(storage.block_merge).Sort(keys_local, items_local, compare_op); } CTA_SYNC(); if (ping) { if (IS_LAST_TILE) { BlockStoreKeysIt(storage.store_keys_it).Store(keys_out_it + tile_base, keys_local, num_remaining); } else { BlockStoreKeysIt(storage.store_keys_it).Store(keys_out_it + tile_base, keys_local); } if (!KEYS_ONLY) { CTA_SYNC(); if (IS_LAST_TILE) { BlockStoreItemsIt(storage.store_items_it).Store(items_out_it + tile_base, items_local, num_remaining); } else { BlockStoreItemsIt(storage.store_items_it).Store(items_out_it + tile_base, items_local); } } } else { if (IS_LAST_TILE) { BlockStoreKeysRaw(storage.store_keys_raw).Store(keys_out_raw + tile_base, keys_local, num_remaining); } else { BlockStoreKeysRaw(storage.store_keys_raw).Store(keys_out_raw + tile_base, keys_local); } if (!KEYS_ONLY) { CTA_SYNC(); if (IS_LAST_TILE) { BlockStoreItemsRaw(storage.store_items_raw).Store(items_out_raw + tile_base, items_local, num_remaining); } else { BlockStoreItemsRaw(storage.store_items_raw).Store(items_out_raw + tile_base, items_local); } } } } }; /** * \brief This agent is responsible for partitioning a merge path into equal segments * * There are two sorted arrays to be merged into one array. If the first array * is partitioned between parallel workers by slicing it into ranges of equal * size, there could be a significant workload imbalance. The imbalance is * caused by the fact that the distribution of elements from the second array * is unknown beforehand. Instead, the MergePath is partitioned between workers. * This approach guarantees an equal amount of work being assigned to each worker. * * This approach is outlined in the paper: * Odeh et al, "Merge Path - Parallel Merging Made Simple" * doi:10.1109/IPDPSW.2012.202 */ template struct AgentPartition { bool ping; KeyIteratorT keys_ping; KeyT* keys_pong; OffsetT keys_count; OffsetT partition_idx; OffsetT* merge_partitions; CompareOpT compare_op; OffsetT target_merged_tiles_number; int items_per_tile; OffsetT num_partitions; _CCCL_DEVICE _CCCL_FORCEINLINE AgentPartition( bool ping, KeyIteratorT keys_ping, KeyT* keys_pong, OffsetT keys_count, OffsetT partition_idx, OffsetT* merge_partitions, CompareOpT compare_op, OffsetT target_merged_tiles_number, int items_per_tile, OffsetT num_partitions) : ping(ping) , keys_ping(keys_ping) , keys_pong(keys_pong) , keys_count(keys_count) , partition_idx(partition_idx) , merge_partitions(merge_partitions) , compare_op(compare_op) , target_merged_tiles_number(target_merged_tiles_number) , items_per_tile(items_per_tile) , num_partitions(num_partitions) {} _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { OffsetT merged_tiles_number = target_merged_tiles_number / 2; // target_merged_tiles_number is a power of two. 
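// Worked example (hypothetical values): with items_per_tile = 4, target_merged_tiles_number = 2
// and partition_idx = 5, the quantities computed below are mask = 1, list = 4, start = 16,
// size = 4 and local_tile_idx = 1, so keys1 spans [16, 20) and keys2 spans [20, 24) (both clamped
// to keys_count). This partition then searches the merge-path diagonal at
// partition_at = min(keys2_end - keys1_beg, items_per_tile * local_tile_idx) = min(8, 4) = 4,
// i.e. the point on the merge path where the second tile of merged output in this group begins.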
OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (partition_idx / target_merged_tiles_number) OffsetT list = ~mask & partition_idx; OffsetT start = items_per_tile * list; OffsetT size = items_per_tile * merged_tiles_number; // Tile number within the tile group being merged, equal to: // partition_idx / target_merged_tiles_number OffsetT local_tile_idx = mask & partition_idx; OffsetT keys1_beg = (cub::min)(keys_count, start); OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size)); OffsetT keys2_beg = keys1_end; OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size)); // The last partition (which is one-past-the-last-tile) is only to mark the end of keys1_end for the merge stage if (partition_idx + 1 == num_partitions) { merge_partitions[partition_idx] = keys1_end; } else { OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx); OffsetT partition_diag = ping ? MergePath( keys_ping + keys1_beg, keys_ping + keys2_beg, keys1_end - keys1_beg, keys2_end - keys2_beg, partition_at, compare_op) : MergePath( keys_pong + keys1_beg, keys_pong + keys2_beg, keys1_end - keys1_beg, keys2_end - keys2_beg, partition_at, compare_op); merge_partitions[partition_idx] = keys1_beg + partition_diag; } } }; /// \brief The agent is responsible for merging N consecutive sorted arrays into N/2 sorted arrays. template struct AgentMerge { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using KeysOutputPongIt = KeyIteratorT; using ItemsOutputPongIt = ValueIteratorT; using KeysOutputPingIt = KeyT*; using ItemsOutputPingIt = ValueT*; using BlockStoreKeysPong = typename BlockStoreType::type; using BlockStoreItemsPong = typename BlockStoreType::type; using BlockStoreKeysPing = typename BlockStoreType::type; using BlockStoreItemsPing = typename BlockStoreType::type; /// Parameterized BlockReduce primitive union _TempStorage { typename BlockStoreKeysPing::TempStorage store_keys_ping; typename BlockStoreItemsPing::TempStorage store_items_ping; typename BlockStoreKeysPong::TempStorage store_keys_pong; typename BlockStoreItemsPong::TempStorage store_items_pong; KeyT keys_shared[Policy::ITEMS_PER_TILE + 1]; ValueT items_shared[Policy::ITEMS_PER_TILE + 1]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; static constexpr bool KEYS_ONLY = std::is_same::value; static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE; //--------------------------------------------------------------------- // Per thread data //--------------------------------------------------------------------- bool ping; _TempStorage& storage; KeysLoadPingIt keys_in_ping; ItemsLoadPingIt items_in_ping; KeysLoadPongIt keys_in_pong; ItemsLoadPongIt items_in_pong; OffsetT keys_count; KeysOutputPongIt keys_out_pong; 
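// Depending on the pass parity ("ping"), consume_tile below writes the merged results either to
// the raw temporary arrays (keys_out_ping / items_out_ping, plain KeyT* / ValueT*) or to the
// user-facing iterators (keys_out_pong / items_out_pong).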
ItemsOutputPongIt items_out_pong; KeysOutputPingIt keys_out_ping; ItemsOutputPingIt items_out_ping; CompareOpT compare_op; OffsetT* merge_partitions; OffsetT target_merged_tiles_number; //--------------------------------------------------------------------- // Utility functions //--------------------------------------------------------------------- /** * \brief Concatenates up to ITEMS_PER_THREAD elements from input{1,2} into output array * * Reads data in a coalesced fashion [BLOCK_THREADS * item + tid] and * stores the result in output[item]. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void gmem_to_reg(T (&output)[ITEMS_PER_THREAD], It1 input1, It2 input2, int count1, int count2) { if (IS_FULL_TILE) { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = BLOCK_THREADS * item + threadIdx.x; output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; } } else { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = BLOCK_THREADS * item + threadIdx.x; if (idx < count1 + count2) { output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; } } } } /// \brief Stores data in a coalesced fashion in[item] -> out[BLOCK_THREADS * item + tid] template _CCCL_DEVICE _CCCL_FORCEINLINE void reg_to_shared(It output, T (&input)[ITEMS_PER_THREAD]) { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = BLOCK_THREADS * item + threadIdx.x; output[idx] = input[item]; } } template _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(int tid, OffsetT tile_idx, OffsetT tile_base, int count) { OffsetT partition_beg = merge_partitions[tile_idx + 0]; OffsetT partition_end = merge_partitions[tile_idx + 1]; // target_merged_tiles_number is a power of two. OffsetT merged_tiles_number = target_merged_tiles_number / 2; OffsetT mask = target_merged_tiles_number - 1; // The first tile number in the tiles group being merged, equal to: // target_merged_tiles_number * (tile_idx / target_merged_tiles_number) OffsetT list = ~mask & tile_idx; OffsetT start = ITEMS_PER_TILE * list; OffsetT size = ITEMS_PER_TILE * merged_tiles_number; OffsetT diag = ITEMS_PER_TILE * tile_idx - start; OffsetT keys1_beg = partition_beg - start; OffsetT keys1_end = partition_end - start; OffsetT keys_end_dist_from_start = keys_count - start; OffsetT max_keys2 = (keys_end_dist_from_start > size) ? 
(keys_end_dist_from_start - size) : 0; // We have the following invariants: // diag >= keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2) // diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile // and keys1_end is key1's component of that path OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg); OffsetT keys2_end = (cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast(ITEMS_PER_TILE)) - keys1_end); // Check if it's the last tile in the tile group being merged if (mask == (mask & tile_idx)) { keys1_end = (cub::min)(keys_count - start, size); keys2_end = (cub::min)(max_keys2, size); } // number of keys per tile // int num_keys1 = static_cast(keys1_end - keys1_beg); int num_keys2 = static_cast(keys2_end - keys2_beg); // load keys1 & keys2 KeyT keys_local[ITEMS_PER_THREAD]; if (ping) { gmem_to_reg( keys_local, keys_in_ping + start + keys1_beg, keys_in_ping + start + size + keys2_beg, num_keys1, num_keys2); } else { gmem_to_reg( keys_local, keys_in_pong + start + keys1_beg, keys_in_pong + start + size + keys2_beg, num_keys1, num_keys2); } reg_to_shared(&storage.keys_shared[0], keys_local); // preload items into registers already // ValueT items_local[ITEMS_PER_THREAD]; if (!KEYS_ONLY) { if (ping) { gmem_to_reg( items_local, items_in_ping + start + keys1_beg, items_in_ping + start + size + keys2_beg, num_keys1, num_keys2); } else { gmem_to_reg( items_local, items_in_pong + start + keys1_beg, items_in_pong + start + size + keys2_beg, num_keys1, num_keys2); } } CTA_SYNC(); // use binary search in shared memory // to find merge path for each of thread // we can use int type here, because the number of // items in shared memory is limited // int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid); int keys1_beg_local = MergePath( &storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op); int keys1_end_local = num_keys1; int keys2_beg_local = diag0_local - keys1_beg_local; int keys2_end_local = num_keys2; int num_keys1_local = keys1_end_local - keys1_beg_local; int num_keys2_local = keys2_end_local - keys2_beg_local; // perform serial merge // int indices[ITEMS_PER_THREAD]; SerialMerge( &storage.keys_shared[0], keys1_beg_local, keys2_beg_local + num_keys1, num_keys1_local, num_keys2_local, keys_local, indices, compare_op); CTA_SYNC(); // write keys // if (ping) { if (IS_FULL_TILE) { BlockStoreKeysPing(storage.store_keys_ping).Store(keys_out_ping + tile_base, keys_local); } else { BlockStoreKeysPing(storage.store_keys_ping).Store(keys_out_ping + tile_base, keys_local, num_keys1 + num_keys2); } } else { if (IS_FULL_TILE) { BlockStoreKeysPong(storage.store_keys_pong).Store(keys_out_pong + tile_base, keys_local); } else { BlockStoreKeysPong(storage.store_keys_pong).Store(keys_out_pong + tile_base, keys_local, num_keys1 + num_keys2); } } // if items are provided, merge them if (!KEYS_ONLY) { CTA_SYNC(); reg_to_shared(&storage.items_shared[0], items_local); CTA_SYNC(); // gather items from shared mem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { items_local[item] = storage.items_shared[indices[item]]; } CTA_SYNC(); // write from reg to gmem // if (ping) { if (IS_FULL_TILE) { BlockStoreItemsPing(storage.store_items_ping).Store(items_out_ping + tile_base, items_local); } else { BlockStoreItemsPing(storage.store_items_ping).Store(items_out_ping + tile_base, items_local, count); } } else { if 
(IS_FULL_TILE) { BlockStoreItemsPong(storage.store_items_pong).Store(items_out_pong + tile_base, items_local); } else { BlockStoreItemsPong(storage.store_items_pong).Store(items_out_pong + tile_base, items_local, count); } } } } _CCCL_DEVICE _CCCL_FORCEINLINE AgentMerge( bool ping_, TempStorage& storage_, KeysLoadPingIt keys_in_ping_, ItemsLoadPingIt items_in_ping_, KeysLoadPongIt keys_in_pong_, ItemsLoadPongIt items_in_pong_, OffsetT keys_count_, KeysOutputPingIt keys_out_ping_, ItemsOutputPingIt items_out_ping_, KeysOutputPongIt keys_out_pong_, ItemsOutputPongIt items_out_pong_, CompareOpT compare_op_, OffsetT* merge_partitions_, OffsetT target_merged_tiles_number_) : ping(ping_) , storage(storage_.Alias()) , keys_in_ping(keys_in_ping_) , items_in_ping(items_in_ping_) , keys_in_pong(keys_in_pong_) , items_in_pong(items_in_pong_) , keys_count(keys_count_) , keys_out_pong(keys_out_pong_) , items_out_pong(items_out_pong_) , keys_out_ping(keys_out_ping_) , items_out_ping(items_out_ping_) , compare_op(compare_op_) , merge_partitions(merge_partitions_) , target_merged_tiles_number(target_merged_tiles_number_) {} _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { int tile_idx = static_cast(blockIdx.x); int num_tiles = static_cast(gridDim.x); OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE; int tid = static_cast(threadIdx.x); int items_in_tile = static_cast((cub::min)(static_cast(ITEMS_PER_TILE), keys_count - tile_base)); if (tile_idx < num_tiles - 1) { consume_tile(tid, tile_idx, tile_base, ITEMS_PER_TILE); } else { consume_tile(tid, tile_idx, tile_base, items_in_tile); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_radix_sort_downsweep.cuh000066400000000000000000000614741463375617100232230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread * blocks for participating in device-wide radix sort downsweep . */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * @brief Parameterizable tuning policy type for AgentRadixSortDownsweep * * @tparam NOMINAL_BLOCK_THREADS_4B * Threads per thread block * * @tparam NOMINAL_ITEMS_PER_THREAD_4B * Items per thread (per tile of input) * * @tparam ComputeT * Dominant compute type * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading keys (and values) * * @tparam _RANK_ALGORITHM * The radix ranking algorithm to use * * @tparam _SCAN_ALGORITHM * The block scan algorithm to use * * @tparam _RADIX_BITS * The number of radix bits, i.e., log2(bins) */ template > struct AgentRadixSortDownsweepPolicy : ScalingType { enum { /// The number of radix bits, i.e., log2(bins) RADIX_BITS = _RADIX_BITS, }; /// The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; /// Cache load modifier for reading keys (and values) static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; /// The radix ranking algorithm to use static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; /// The BlockScan algorithm to use static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in * device-wide radix sort downsweep . 
* * @tparam AgentRadixSortDownsweepPolicy * Parameterized AgentRadixSortDownsweepPolicy tuning policy type * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam KeyT * KeyT type * * @tparam ValueT * ValueT type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentRadixSortDownsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; static constexpr CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; static constexpr RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; enum { BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_DIGITS = 1 << RADIX_BITS, KEYS_ONLY = std::is_same::value, LOAD_WARP_STRIPED = RANK_ALGORITHM == RADIX_RANK_MATCH || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, }; // Input iterator wrapper type (for applying cache modifier)s using KeysItr = CacheModifiedInputIterator; using ValuesItr = CacheModifiedInputIterator; // Radix ranking type to use using BlockRadixRankT = cub::detail::block_radix_rank_t; // Digit extractor type using fundamental_digit_extractor_t = BFEDigitExtractor; using digit_extractor_t = typename traits::template digit_extractor_t; enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD }; // BlockLoad type (keys) using BlockLoadKeysT = BlockLoad; // BlockLoad type (values) using BlockLoadValuesT = BlockLoad; // Value exchange array type typedef ValueT ValueExchangeT[TILE_ITEMS]; /** * Shared memory storage layout */ union __align__(16) _TempStorage { typename BlockLoadKeysT::TempStorage load_keys; typename BlockLoadValuesT::TempStorage load_values; typename BlockRadixRankT::TempStorage radix_rank; struct KeysAndOffsets { bit_ordered_type exchange_keys[TILE_ITEMS]; OffsetT relative_bin_offsets[RADIX_DIGITS]; } keys_and_offsets; Uninitialized exchange_values; OffsetT exclusive_digit_prefix[RADIX_DIGITS]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Thread fields //--------------------------------------------------------------------- // Shared storage for this CTA _TempStorage& temp_storage; // Input and output device pointers KeysItr d_keys_in; ValuesItr d_values_in; bit_ordered_type* d_keys_out; ValueT* d_values_out; // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; std::uint32_t current_bit; std::uint32_t num_bits; // Whether to short-cirucit int short_circuit; DecomposerT decomposer; //--------------------------------------------------------------------- // Utility methods 
//--------------------------------------------------------------------- _CCCL_DEVICE _CCCL_FORCEINLINE digit_extractor_t digit_extractor() { return traits::template digit_extractor(current_bit, num_bits, decomposer); } /** * Scatter ranked keys through shared memory, then to device-accessible memory */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterKeys( bit_ordered_type (&twiddled_keys)[ITEMS_PER_THREAD], OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { bit_ordered_type key = temp_storage.keys_and_offsets.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; std::uint32_t digit = digit_extractor().Digit(key); relative_bin_offsets[ITEM] = temp_storage.keys_and_offsets.relative_bin_offsets[digit]; key = bit_ordered_conversion::from_bit_ordered(decomposer, key); if (FULL_TILE || (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) { d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; } } } /** * Scatter ranked values through shared memory, then to device-accessible memory */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], OffsetT valid_items) { CTA_SYNC(); ValueExchangeT& exchange_values = temp_storage.exchange_values.Alias(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { exchange_values[ranks[ITEM]] = values[ITEM]; } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; if (FULL_TILE || (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) { d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; } } } /** * Load a tile of keys (specialized for full tile, block load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadKeys( bit_ordered_type (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, bit_ordered_type oob_item, Int2Type is_full_tile, Int2Type warp_striped) { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); CTA_SYNC(); } /** * Load a tile of keys (specialized for partial tile, block load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadKeys( bit_ordered_type (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, bit_ordered_type oob_item, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items, oob_item); CTA_SYNC(); } /** * Load a tile of keys (specialized for full tile, warp-striped load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadKeys( bit_ordered_type (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, bit_ordered_type oob_item, Int2Type is_full_tile, Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); } /** * Load a tile of keys (specialized for partial tile, warp-striped load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadKeys( bit_ordered_type (&keys)[ITEMS_PER_THREAD], OffsetT 
block_offset, OffsetT valid_items, bit_ordered_type oob_item, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); } /** * Load a tile of values (specialized for full tile, block load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values); CTA_SYNC(); } /** * Load a tile of values (specialized for partial tile, block load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items); CTA_SYNC(); } /** * Load a tile of items (specialized for full tile, warp-striped load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); } /** * Load a tile of items (specialized for partial tile, warp-striped load) */ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); } /** * Truck along associated values */ template _CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues( OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type /*is_keys_only*/) { ValueT values[ITEMS_PER_THREAD]; CTA_SYNC(); LoadValues(values, block_offset, valid_items, Int2Type(), Int2Type()); ScatterValues(values, relative_bin_offsets, ranks, valid_items); } /** * Truck along associated values (specialized for key-only sorting) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues( OffsetT (& /*relative_bin_offsets*/)[ITEMS_PER_THREAD], int (& /*ranks*/)[ITEMS_PER_THREAD], OffsetT /*block_offset*/, OffsetT /*valid_items*/, Int2Type /*is_keys_only*/) {} /** * Process tile */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessTile(OffsetT block_offset, const OffsetT& valid_items = TILE_ITEMS) { bit_ordered_type keys[ITEMS_PER_THREAD]; int ranks[ITEMS_PER_THREAD]; OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; // Assign default (min/max) value to all keys bit_ordered_type default_key = IS_DESCENDING ? 
traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer); // Load tile of keys LoadKeys(keys, block_offset, valid_items, default_key, Int2Type(), Int2Type()); #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, keys[KEY]); } // Rank the twiddled keys int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; BlockRadixRankT(temp_storage.radix_rank).RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix); CTA_SYNC(); // Share exclusive digit prefix #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { // Store exclusive prefix temp_storage.exclusive_digit_prefix[bin_idx] = exclusive_digit_prefix[track]; } } CTA_SYNC(); // Get inclusive digit prefix int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) { // Get inclusive digit prefix from exclusive prefix (higher bins come first) inclusive_digit_prefix[track] = (bin_idx == 0) ? (BLOCK_THREADS * ITEMS_PER_THREAD) : temp_storage.exclusive_digit_prefix[bin_idx - 1]; } else { // Get inclusive digit prefix from exclusive prefix (lower bins come first) inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? (BLOCK_THREADS * ITEMS_PER_THREAD) : temp_storage.exclusive_digit_prefix[bin_idx + 1]; } } } CTA_SYNC(); // Update global scatter base offsets for each digit #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_offset[track] -= exclusive_digit_prefix[track]; temp_storage.keys_and_offsets.relative_bin_offsets[bin_idx] = bin_offset[track]; bin_offset[track] += inclusive_digit_prefix[track]; } } CTA_SYNC(); // Scatter keys ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); // Gather/scatter values GatherScatterValues(relative_bin_offsets, ranks, block_offset, valid_items, Int2Type()); } //--------------------------------------------------------------------- // Copy shortcut //--------------------------------------------------------------------- /** * Copy tiles within the range of input */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Copy(InputIteratorT d_in, T* d_out, OffsetT block_offset, OffsetT block_end) { // Simply copy the input while (block_end - block_offset >= TILE_ITEMS) { T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items); CTA_SYNC(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items); block_offset += TILE_ITEMS; } // Clean up last partial tile with guarded-I/O if (block_offset < block_end) { OffsetT valid_items = block_end - block_offset; T items[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); CTA_SYNC(); StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); } } /** * Copy tiles within the range of input (specialized for NullType) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Copy(InputIteratorT /*d_in*/, NullType* /*d_out*/, OffsetT /*block_offset*/, OffsetT /*block_end*/) {} //--------------------------------------------------------------------- // Interface 
//--------------------------------------------------------------------- /** * Constructor */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentRadixSortDownsweep( TempStorage& temp_storage, OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], OffsetT num_items, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int current_bit, int num_bits, DecomposerT decomposer = {}) : temp_storage(temp_storage.Alias()) , d_keys_in(reinterpret_cast(d_keys_in)) , d_values_in(d_values_in) , d_keys_out(reinterpret_cast(d_keys_out)) , d_values_out(d_values_out) , current_bit(current_bit) , num_bits(num_bits) , short_circuit(1) , decomposer(decomposer) { #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { this->bin_offset[track] = bin_offset[track]; int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { // Short circuit if the histogram has only bin counts of only zeros or problem-size short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); } } short_circuit = CTA_SYNC_AND(short_circuit); } /** * Constructor */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentRadixSortDownsweep( TempStorage& temp_storage, OffsetT num_items, OffsetT* d_spine, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int current_bit, int num_bits, DecomposerT decomposer = {}) : temp_storage(temp_storage.Alias()) , d_keys_in(reinterpret_cast(d_keys_in)) , d_values_in(d_values_in) , d_keys_out(reinterpret_cast(d_keys_out)) , d_values_out(d_values_out) , current_bit(current_bit) , num_bits(num_bits) , short_circuit(1) , decomposer(decomposer) { #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) { bin_idx = RADIX_DIGITS - bin_idx - 1; } // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); // Load my block's bin offset for my bin bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; } } short_circuit = CTA_SYNC_AND(short_circuit); } /** * Distribute keys from a segment of input tiles. */ _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessRegion(OffsetT block_offset, OffsetT block_end) { if (short_circuit) { // Copy keys Copy(d_keys_in, d_keys_out, block_offset, block_end); // Copy values Copy(d_values_in, d_values_out, block_offset, block_end); } else { // Process full tiles of tile_items #pragma unroll 1 while (block_end - block_offset >= TILE_ITEMS) { ProcessTile(block_offset); block_offset += TILE_ITEMS; CTA_SYNC(); } // Clean up last partial tile with guarded-I/O if (block_offset < block_end) { ProcessTile(block_offset, block_end - block_offset); } } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_radix_sort_histogram.cuh000066400000000000000000000225001463375617100231700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * agent_radix_sort_histogram.cuh implements a stateful abstraction of CUDA * thread blocks for participating in the device histogram kernel used for * one-sweep radix sorting. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN template struct AgentRadixSortHistogramPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ITEMS_PER_THREAD = _ITEMS_PER_THREAD, /** NUM_PARTS is the number of private histograms (parts) each histogram is split * into. Each warp lane is assigned to a specific part based on the lane * ID. However, lanes with the same ID in different warp use the same private * histogram. This arrangement helps reduce the degree of conflicts in atomic * operations. 
*/ NUM_PARTS = CUB_MAX(1, NOMINAL_4B_NUM_PARTS * 4 / CUB_MAX(sizeof(ComputeT), 4)), RADIX_BITS = _RADIX_BITS, }; }; template struct AgentRadixSortExclusiveSumPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, RADIX_BITS = _RADIX_BITS, }; }; template struct AgentRadixSortHistogram { // constants enum { ITEMS_PER_THREAD = AgentRadixSortHistogramPolicy::ITEMS_PER_THREAD, BLOCK_THREADS = AgentRadixSortHistogramPolicy::BLOCK_THREADS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_BITS = AgentRadixSortHistogramPolicy::RADIX_BITS, RADIX_DIGITS = 1 << RADIX_BITS, MAX_NUM_PASSES = (sizeof(KeyT) * 8 + RADIX_BITS - 1) / RADIX_BITS, NUM_PARTS = AgentRadixSortHistogramPolicy::NUM_PARTS, }; using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; typedef RadixSortTwiddle Twiddle; typedef std::uint32_t ShmemCounterT; typedef ShmemCounterT ShmemAtomicCounterT; using fundamental_digit_extractor_t = ShiftDigitExtractor; using digit_extractor_t = typename traits::template digit_extractor_t; struct _TempStorage { ShmemAtomicCounterT bins[MAX_NUM_PASSES][RADIX_DIGITS][NUM_PARTS]; }; struct TempStorage : Uninitialized<_TempStorage> {}; // thread fields // shared memory storage _TempStorage& s; // bins for the histogram OffsetT* d_bins_out; // data to compute the histogram const bit_ordered_type* d_keys_in; // number of data items OffsetT num_items; // begin and end bits for sorting int begin_bit, end_bit; // number of sorting passes int num_passes; DecomposerT decomposer; _CCCL_DEVICE _CCCL_FORCEINLINE AgentRadixSortHistogram( TempStorage& temp_storage, OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int begin_bit, int end_bit, DecomposerT decomposer = {}) : s(temp_storage.Alias()) , d_bins_out(d_bins_out) , d_keys_in(reinterpret_cast(d_keys_in)) , num_items(num_items) , begin_bit(begin_bit) , end_bit(end_bit) , num_passes((end_bit - begin_bit + RADIX_BITS - 1) / RADIX_BITS) , decomposer(decomposer) {} _CCCL_DEVICE _CCCL_FORCEINLINE void Init() { // Initialize bins to 0. #pragma unroll for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS) { #pragma unroll for (int pass = 0; pass < num_passes; ++pass) { #pragma unroll for (int part = 0; part < NUM_PARTS; ++part) { s.bins[pass][bin][part] = 0; } } } CTA_SYNC(); } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { // tile_offset < num_items always, hence the line below works bool full_tile = num_items - tile_offset >= TILE_ITEMS; if (full_tile) { LoadDirectStriped(threadIdx.x, d_keys_in + tile_offset, keys); } else { LoadDirectStriped( threadIdx.x, d_keys_in + tile_offset, keys, num_items - tile_offset, Twiddle::DefaultKey(decomposer)); } #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { keys[u] = Twiddle::In(keys[u], decomposer); } } _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateSharedHistograms(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { int part = LaneId() % NUM_PARTS; #pragma unroll for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { int num_bits = CUB_MIN(RADIX_BITS, end_bit - current_bit); #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { std::uint32_t bin = digit_extractor(current_bit, num_bits).Digit(keys[u]); // Using cuda::atomic<> results in lower performance on GP100, // so atomicAdd() is used instead. 
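        /* A standalone illustration of the same privatization idea (hypothetical
         * example, independent of this agent): each lane updates one of NUM_PARTS
         * replicas of the shared-memory histogram, so fewer lanes contend on the
         * same counter, and the replicas are reduced once at the end.
         *
         * \code
         * __global__ void histogram_256(const unsigned char* in, int n, unsigned int* out)
         * {
         *   constexpr int NUM_BINS  = 256;
         *   constexpr int NUM_PARTS = 4;
         *   __shared__ unsigned int bins[NUM_BINS][NUM_PARTS];
         *   for (int i = threadIdx.x; i < NUM_BINS * NUM_PARTS; i += blockDim.x)
         *   {
         *     (&bins[0][0])[i] = 0;
         *   }
         *   __syncthreads();
         *   int part = (threadIdx.x % 32) % NUM_PARTS; // replica chosen by lane id
         *   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
         *   {
         *     atomicAdd(&bins[in[i]][part], 1u);
         *   }
         *   __syncthreads();
         *   for (int bin = threadIdx.x; bin < NUM_BINS; bin += blockDim.x)
         *   {
         *     unsigned int sum = 0;
         *     for (int p = 0; p < NUM_PARTS; ++p)
         *     {
         *       sum += bins[bin][p];
         *     }
         *     atomicAdd(&out[bin], sum);
         *   }
         * }
         * \endcode
         */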
atomicAdd(&s.bins[pass][bin][part], 1); } } } _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateGlobalHistograms() { #pragma unroll for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS) { #pragma unroll for (int pass = 0; pass < num_passes; ++pass) { OffsetT count = internal::ThreadReduce(s.bins[pass][bin], Sum()); if (count > 0) { // Using cuda::atomic<> here would also require using it in // other kernels. However, other kernels of onesweep sorting // (ExclusiveSum, Onesweep) don't need atomic // access. Therefore, atomicAdd() is used, until // cuda::atomic_ref<> becomes available. atomicAdd(&d_bins_out[pass * RADIX_DIGITS + bin], count); } } } } _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { // Within a portion, avoid overflowing (u)int32 counters. // Between portions, accumulate results in global memory. constexpr OffsetT MAX_PORTION_SIZE = 1 << 30; OffsetT num_portions = cub::DivideAndRoundUp(num_items, MAX_PORTION_SIZE); for (OffsetT portion = 0; portion < num_portions; ++portion) { // Reset the counters. Init(); CTA_SYNC(); // Process the tiles. OffsetT portion_offset = portion * MAX_PORTION_SIZE; OffsetT portion_size = CUB_MIN(MAX_PORTION_SIZE, num_items - portion_offset); for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x) { OffsetT tile_offset = portion_offset + offset; bit_ordered_type keys[ITEMS_PER_THREAD]; LoadTileKeys(tile_offset, keys); AccumulateSharedHistograms(tile_offset, keys); } CTA_SYNC(); // Accumulate the result in global memory. AccumulateGlobalHistograms(); CTA_SYNC(); } } _CCCL_DEVICE _CCCL_FORCEINLINE digit_extractor_t digit_extractor(int current_bit, int num_bits) { return traits::template digit_extractor(current_bit, num_bits, decomposer); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_radix_sort_onesweep.cuh000066400000000000000000000554041463375617100230310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * agent_radix_sort_onesweep.cuh implements a stateful abstraction of CUDA * thread blocks for participating in the device one-sweep radix sort kernel. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN /** \brief cub::RadixSortStoreAlgorithm enumerates different algorithms to write * partitioned elements (keys, values) stored in shared memory into global * memory. Currently applies only to writing 4B keys in full tiles; in all other cases, * RADIX_SORT_STORE_DIRECT is used. */ enum RadixSortStoreAlgorithm { /** \brief Elements are statically distributed among block threads, which write them * into the appropriate partition in global memory. This results in fewer instructions * and more writes in flight at a given moment, but may generate more transactions. */ RADIX_SORT_STORE_DIRECT, /** \brief Elements are distributed among warps in a block distribution. Each warp * goes through its elements and tries to write them while minimizing the number of * memory transactions. This results in fewer memory transactions, but more * instructions and less writes in flight at a given moment. */ RADIX_SORT_STORE_ALIGNED }; template > struct AgentRadixSortOnesweepPolicy : ScalingType { enum { RANK_NUM_PARTS = _RANK_NUM_PARTS, RADIX_BITS = _RADIX_BITS, }; static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; template struct AgentRadixSortOnesweep { // constants enum { ITEMS_PER_THREAD = AgentRadixSortOnesweepPolicy::ITEMS_PER_THREAD, KEYS_ONLY = std::is_same::value, BLOCK_THREADS = AgentRadixSortOnesweepPolicy::BLOCK_THREADS, RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS, RADIX_DIGITS = 1 << RADIX_BITS, BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS, FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS, WARP_THREADS = CUB_PTX_WARP_THREADS, BLOCK_WARPS = BLOCK_THREADS / WARP_THREADS, WARP_MASK = ~0, LOOKBACK_PARTIAL_MASK = 1 << (PortionOffsetT(sizeof(PortionOffsetT)) * 8 - 2), LOOKBACK_GLOBAL_MASK = 1 << (PortionOffsetT(sizeof(PortionOffsetT)) * 8 - 1), LOOKBACK_KIND_MASK = LOOKBACK_PARTIAL_MASK | LOOKBACK_GLOBAL_MASK, LOOKBACK_VALUE_MASK = ~LOOKBACK_KIND_MASK, }; using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; using fundamental_digit_extractor_t = ShiftDigitExtractor; using digit_extractor_t = typename traits::template digit_extractor_t; typedef PortionOffsetT AtomicOffsetT; static constexpr RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortOnesweepPolicy::RANK_ALGORITHM; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortOnesweepPolicy::SCAN_ALGORITHM; static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = sizeof(bit_ordered_type) == sizeof(uint32_t) ? 
AgentRadixSortOnesweepPolicy::STORE_ALGORITHM : RADIX_SORT_STORE_DIRECT; typedef RadixSortTwiddle Twiddle; static_assert(RANK_ALGORITHM == RADIX_RANK_MATCH || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY || RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, "for onesweep agent, the ranking algorithm must warp-strided key arrangement"); using BlockRadixRankT = cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, BlockRadixRankMatchEarlyCounts, cub::detail::conditional_t< RANK_ALGORITHM == RADIX_RANK_MATCH, BlockRadixRankMatch, BlockRadixRankMatchEarlyCounts>>; // temporary storage struct TempStorage_ { union { bit_ordered_type keys_out[TILE_ITEMS]; ValueT values_out[TILE_ITEMS]; typename BlockRadixRankT::TempStorage rank_temp_storage; }; union { OffsetT global_offsets[RADIX_DIGITS]; PortionOffsetT block_idx; }; }; using TempStorage = Uninitialized; // thread variables TempStorage_& s; // kernel parameters AtomicOffsetT* d_lookback; AtomicOffsetT* d_ctrs; OffsetT* d_bins_out; const OffsetT* d_bins_in; bit_ordered_type* d_keys_out; const bit_ordered_type* d_keys_in; ValueT* d_values_out; const ValueT* d_values_in; PortionOffsetT num_items; int current_bit; int num_bits; // other thread variables int warp; int lane; DecomposerT decomposer; PortionOffsetT block_idx; bool full_block; _CCCL_DEVICE _CCCL_FORCEINLINE digit_extractor_t digit_extractor() { return traits::template digit_extractor(current_bit, num_bits, decomposer); } // helper methods _CCCL_DEVICE _CCCL_FORCEINLINE std::uint32_t Digit(bit_ordered_type key) { return digit_extractor().Digit(key); } _CCCL_DEVICE _CCCL_FORCEINLINE int ThreadBin(int u) { return threadIdx.x * BINS_PER_THREAD + u; } _CCCL_DEVICE _CCCL_FORCEINLINE void LookbackPartial(int (&bins)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { // write the local sum into the bin AtomicOffsetT& loc = d_lookback[block_idx * RADIX_DIGITS + bin]; PortionOffsetT value = bins[u] | LOOKBACK_PARTIAL_MASK; ThreadStore(&loc, value); } } } struct CountsCallback { typedef AgentRadixSortOnesweep AgentT; AgentT& agent; int (&bins)[BINS_PER_THREAD]; bit_ordered_type (&keys)[ITEMS_PER_THREAD]; static constexpr bool EMPTY = false; _CCCL_DEVICE _CCCL_FORCEINLINE CountsCallback(AgentT& agent, int (&bins)[BINS_PER_THREAD], bit_ordered_type (&keys)[ITEMS_PER_THREAD]) : agent(agent) , bins(bins) , keys(keys) {} _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(int (&other_bins)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { bins[u] = other_bins[u]; } agent.LookbackPartial(bins); agent.TryShortCircuit(keys, bins); } }; _CCCL_DEVICE _CCCL_FORCEINLINE void LookbackGlobal(int (&bins)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { PortionOffsetT inc_sum = bins[u]; int want_mask = ~0; // backtrack as long as necessary for (PortionOffsetT block_jdx = block_idx - 1; block_jdx >= 0; --block_jdx) { // wait for some value to appear PortionOffsetT value_j = 0; AtomicOffsetT& loc_j = d_lookback[block_jdx * RADIX_DIGITS + bin]; do { __threadfence_block(); // prevent hoisting loads from loop value_j = ThreadLoad(&loc_j); } while (value_j == 0); inc_sum += value_j & LOOKBACK_VALUE_MASK; want_mask = WARP_BALLOT((value_j & LOOKBACK_GLOBAL_MASK) == 0, want_mask); if (value_j & LOOKBACK_GLOBAL_MASK) { break; } } AtomicOffsetT& loc_i = d_lookback[block_idx 
* RADIX_DIGITS + bin]; PortionOffsetT value_i = inc_sum | LOOKBACK_GLOBAL_MASK; ThreadStore(&loc_i, value_i); s.global_offsets[bin] += inc_sum - bins[u]; } } } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { if (full_block) { LoadDirectWarpStriped(threadIdx.x, d_keys_in + tile_offset, keys); } else { LoadDirectWarpStriped( threadIdx.x, d_keys_in + tile_offset, keys, num_items - tile_offset, Twiddle::DefaultKey(decomposer)); } #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { keys[u] = Twiddle::In(keys[u], decomposer); } } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadValues(OffsetT tile_offset, ValueT (&values)[ITEMS_PER_THREAD]) { if (full_block) { LoadDirectWarpStriped(threadIdx.x, d_values_in + tile_offset, values); } else { int tile_items = num_items - tile_offset; LoadDirectWarpStriped(threadIdx.x, d_values_in + tile_offset, values, tile_items); } } /** Checks whether "short-circuiting" is possible. Short-circuiting happens * if all TILE_ITEMS keys fall into the same bin, i.e. have the same digit * value (note that it only happens for full tiles). If short-circuiting is * performed, the part of the ranking algorithm after the CountsCallback, as * well as the rest of the sorting (e.g. scattering keys and values to * shared and global memory) are skipped; updates related to decoupled * look-back are still performed. Instead, the keys assigned to the current * thread block are written cooperatively into a contiguous location in * d_keys_out corresponding to their digit. The values (if also sorting * values) assigned to the current thread block are similarly copied from * d_values_in to d_values_out. */ _CCCL_DEVICE _CCCL_FORCEINLINE void TryShortCircuit(bit_ordered_type (&keys)[ITEMS_PER_THREAD], int (&bins)[BINS_PER_THREAD]) { // check if any bin can be short-circuited bool short_circuit = false; #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { if (FULL_BINS || ThreadBin(u) < RADIX_DIGITS) { short_circuit = short_circuit || bins[u] == TILE_ITEMS; } } short_circuit = CTA_SYNC_OR(short_circuit); if (!short_circuit) { return; } ShortCircuitCopy(keys, bins); } _CCCL_DEVICE _CCCL_FORCEINLINE void ShortCircuitCopy(bit_ordered_type (&keys)[ITEMS_PER_THREAD], int (&bins)[BINS_PER_THREAD]) { // short-circuit handling; note that global look-back is still required // compute offsets std::uint32_t common_bin = Digit(keys[0]); int offsets[BINS_PER_THREAD]; #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); offsets[u] = bin > common_bin ? 
TILE_ITEMS : 0; } // global lookback LoadBinsToOffsetsGlobal(offsets); LookbackGlobal(bins); UpdateBinsGlobal(bins, offsets); CTA_SYNC(); // scatter the keys OffsetT global_offset = s.global_offsets[common_bin]; #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { keys[u] = Twiddle::Out(keys[u], decomposer); } if (full_block) { StoreDirectWarpStriped(threadIdx.x, d_keys_out + global_offset, keys); } else { int tile_items = num_items - block_idx * TILE_ITEMS; StoreDirectWarpStriped(threadIdx.x, d_keys_out + global_offset, keys, tile_items); } if (!KEYS_ONLY) { // gather and scatter the values ValueT values[ITEMS_PER_THREAD]; LoadValues(block_idx * TILE_ITEMS, values); if (full_block) { StoreDirectWarpStriped(threadIdx.x, d_values_out + global_offset, values); } else { int tile_items = num_items - block_idx * TILE_ITEMS; StoreDirectWarpStriped(threadIdx.x, d_values_out + global_offset, values, tile_items); } } // exit early ThreadExit(); } _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterKeysShared(bit_ordered_type (&keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD]) { // write to shared memory #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { s.keys_out[ranks[u]] = keys[u]; } } _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterValuesShared(ValueT (&values)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD]) { // write to shared memory #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { s.values_out[ranks[u]] = values[u]; } } _CCCL_DEVICE _CCCL_FORCEINLINE void LoadBinsToOffsetsGlobal(int (&offsets)[BINS_PER_THREAD]) { // global offset - global part #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { s.global_offsets[bin] = d_bins_in[bin] - offsets[u]; } } } _CCCL_DEVICE _CCCL_FORCEINLINE void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD], int (&offsets)[BINS_PER_THREAD]) { bool last_block = (block_idx + 1) * TILE_ITEMS >= num_items; if (d_bins_out != NULL && last_block) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) { d_bins_out[bin] = s.global_offsets[bin] + offsets[u] + bins[u]; } } } } template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterKeysGlobalDirect() { int tile_items = FULL_TILE ? TILE_ITEMS : num_items - block_idx * TILE_ITEMS; #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { int idx = threadIdx.x + u * BLOCK_THREADS; bit_ordered_type key = s.keys_out[idx]; OffsetT global_idx = idx + s.global_offsets[Digit(key)]; if (FULL_TILE || idx < tile_items) { d_keys_out[global_idx] = Twiddle::Out(key, decomposer); } WARP_SYNC(WARP_MASK); } } template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterValuesGlobalDirect(int (&digits)[ITEMS_PER_THREAD]) { int tile_items = FULL_TILE ? 
TILE_ITEMS : num_items - block_idx * TILE_ITEMS; #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { int idx = threadIdx.x + u * BLOCK_THREADS; ValueT value = s.values_out[idx]; OffsetT global_idx = idx + s.global_offsets[digits[u]]; if (FULL_TILE || idx < tile_items) { d_values_out[global_idx] = value; } WARP_SYNC(WARP_MASK); } } _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterKeysGlobalAligned() { // this only works with full tiles constexpr int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS; constexpr int ALIGN = 8; constexpr auto CACHE_MODIFIER = STORE_CG; int warp_start = warp * ITEMS_PER_WARP; int warp_end = (warp + 1) * ITEMS_PER_WARP; int warp_offset = warp_start; while (warp_offset < warp_end - WARP_THREADS) { int idx = warp_offset + lane; bit_ordered_type key = s.keys_out[idx]; bit_ordered_type key_out = Twiddle::Out(key, decomposer); OffsetT global_idx = idx + s.global_offsets[Digit(key)]; int last_lane = WARP_THREADS - 1; int num_writes = WARP_THREADS; if (lane == last_lane) { num_writes -= int(global_idx + 1) % ALIGN; } num_writes = SHFL_IDX_SYNC(num_writes, last_lane, WARP_MASK); if (lane < num_writes) { ThreadStore(&d_keys_out[global_idx], key_out); } warp_offset += num_writes; } { int num_writes = warp_end - warp_offset; if (lane < num_writes) { int idx = warp_offset + lane; bit_ordered_type key = s.keys_out[idx]; OffsetT global_idx = idx + s.global_offsets[Digit(key)]; ThreadStore(&d_keys_out[global_idx], Twiddle::Out(key, decomposer)); } } } _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterKeysGlobal() { // write block data to global memory if (full_block) { if (STORE_ALGORITHM == RADIX_SORT_STORE_ALIGNED) { ScatterKeysGlobalAligned(); } else { ScatterKeysGlobalDirect(); } } else { ScatterKeysGlobalDirect(); } } _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterValuesGlobal(int (&digits)[ITEMS_PER_THREAD]) { // write block data to global memory if (full_block) { ScatterValuesGlobalDirect(digits); } else { ScatterValuesGlobalDirect(digits); } } _CCCL_DEVICE _CCCL_FORCEINLINE void ComputeKeyDigits(int (&digits)[ITEMS_PER_THREAD]) { #pragma unroll for (int u = 0; u < ITEMS_PER_THREAD; ++u) { int idx = threadIdx.x + u * BLOCK_THREADS; digits[u] = Digit(s.keys_out[idx]); } } _CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], Int2Type keys_only) { // compute digits corresponding to the keys int digits[ITEMS_PER_THREAD]; ComputeKeyDigits(digits); // load values ValueT values[ITEMS_PER_THREAD]; LoadValues(block_idx * TILE_ITEMS, values); // scatter values CTA_SYNC(); ScatterValuesShared(values, ranks); CTA_SYNC(); ScatterValuesGlobal(digits); } _CCCL_DEVICE _CCCL_FORCEINLINE void GatherScatterValues(int (&ranks)[ITEMS_PER_THREAD], Int2Type keys_only) {} _CCCL_DEVICE _CCCL_FORCEINLINE void Process() { // load keys // if warp1 < warp2, all elements of warp1 occur before those of warp2 // in the source array bit_ordered_type keys[ITEMS_PER_THREAD]; LoadKeys(block_idx * TILE_ITEMS, keys); // rank keys int ranks[ITEMS_PER_THREAD]; int exclusive_digit_prefix[BINS_PER_THREAD]; int bins[BINS_PER_THREAD]; BlockRadixRankT(s.rank_temp_storage) .RankKeys(keys, ranks, digit_extractor(), exclusive_digit_prefix, CountsCallback(*this, bins, keys)); // scatter keys in shared memory CTA_SYNC(); ScatterKeysShared(keys, ranks); // compute global offsets LoadBinsToOffsetsGlobal(exclusive_digit_prefix); LookbackGlobal(bins); UpdateBinsGlobal(bins, exclusive_digit_prefix); // scatter keys in global memory CTA_SYNC(); ScatterKeysGlobal(); // scatter values if necessary 
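    /* The global digit offsets used above (LoadBinsToOffsetsGlobal followed by
     * LookbackGlobal) follow a decoupled look-back scheme: each block first
     * publishes its per-digit count tagged as partial, then walks backwards over
     * preceding blocks, accumulating partial values until it reaches an inclusive
     * one, and finally publishes its own inclusive prefix. A schematic,
     * single-digit sketch of that protocol (illustrative only; the helper names
     * and flag constants are hypothetical):
     *
     * \code
     * d_lookback[block] = my_count | PARTIAL;                     // publish the local count
     * OffsetT exclusive = 0;
     * for (int b = block - 1; b >= 0; --b)                        // look back over predecessors
     * {
     *   OffsetT v;
     *   do { v = volatile_load(&d_lookback[b]); } while (v == 0); // spin until the value is published
     *   exclusive += v & VALUE_MASK;
     *   if (v & INCLUSIVE) { break; }                             // predecessor already holds a full prefix
     * }
     * d_lookback[block] = (exclusive + my_count) | INCLUSIVE;     // publish the inclusive prefix
     * \endcode
     */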
GatherScatterValues(ranks, Int2Type()); } _CCCL_DEVICE _CCCL_FORCEINLINE // AgentRadixSortOnesweep( TempStorage& temp_storage, AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, const ValueT* d_values_in, PortionOffsetT num_items, int current_bit, int num_bits, DecomposerT decomposer = {}) : s(temp_storage.Alias()) , d_lookback(d_lookback) , d_ctrs(d_ctrs) , d_bins_out(d_bins_out) , d_bins_in(d_bins_in) , d_keys_out(reinterpret_cast(d_keys_out)) , d_keys_in(reinterpret_cast(d_keys_in)) , d_values_out(d_values_out) , d_values_in(d_values_in) , num_items(num_items) , current_bit(current_bit) , num_bits(num_bits) , warp(threadIdx.x / WARP_THREADS) , lane(LaneId()) , decomposer(decomposer) { // initialization if (threadIdx.x == 0) { s.block_idx = atomicAdd(d_ctrs, 1); } CTA_SYNC(); block_idx = s.block_idx; full_block = (block_idx + 1) * TILE_ITEMS <= num_items; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_radix_sort_upsweep.cuh000066400000000000000000000415741463375617100226770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix * sort upsweep . 
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * @brief Parameterizable tuning policy type for AgentRadixSortUpsweep * * @tparam NOMINAL_BLOCK_THREADS_4B * Threads per thread block * * @tparam NOMINAL_ITEMS_PER_THREAD_4B * Items per thread (per tile of input) * * @tparam ComputeT * Dominant compute type * * @tparam _LOAD_MODIFIER * Cache load modifier for reading keys * * @tparam _RADIX_BITS * The number of radix bits, i.e., log2(bins) */ template > struct AgentRadixSortUpsweepPolicy : ScalingType { enum { /// The number of radix bits, i.e., log2(bins) RADIX_BITS = _RADIX_BITS, }; /// Cache load modifier for reading keys static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for * participating in device-wide radix sort upsweep . * * @tparam AgentRadixSortUpsweepPolicy * Parameterized AgentRadixSortUpsweepPolicy tuning policy type * * @tparam KeyT * KeyT type * * @tparam DecomposerT = detail::identity_decomposer_t * Signed integer type for global offsets */ template struct AgentRadixSortUpsweep { //--------------------------------------------------------------------- // Type definitions and constants //--------------------------------------------------------------------- using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; // Integer type for digit counters (to be packed into words of PackedCounters) typedef unsigned char DigitCounter; // Integer type for packing DigitCounters into columns of shared memory banks typedef unsigned int PackedCounter; static constexpr CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; enum { RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, RADIX_DIGITS = 1 << RADIX_BITS, LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, BYTES_PER_COUNTER = sizeof(DigitCounter), LOG_BYTES_PER_COUNTER = Log2::VALUE, PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), LOG_PACKING_RATIO = Log2::VALUE, LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // To prevent counter overflow, we must periodically unpack and aggregate the // digit counters back into registers. Each counter lane is assigned to a // warp for aggregation. 
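    // Example of the packing arithmetic (illustrative): with 1-byte DigitCounter
    // values packed into 4-byte PackedCounter words, PACKING_RATIO == 4 and
    // LOG_PACKING_RATIO == 2, so an 8-bit digit (RADIX_BITS == 8, 256 bins)
    // requires COUNTER_LANES == 1 << (8 - 2) == 64 lanes of packed counters.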
LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), // Unroll tiles in batches without risk of counter overflow UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, }; // Input iterator wrapper type (for applying cache modifier)s typedef CacheModifiedInputIterator KeysItr; // Digit extractor type using fundamental_digit_extractor_t = BFEDigitExtractor; using digit_extractor_t = typename traits::template digit_extractor_t; /** * Shared memory storage layout */ union __align__(16) _TempStorage { DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Thread fields (aggregate state bundle) //--------------------------------------------------------------------- // Shared storage for this CTA _TempStorage& temp_storage; // Thread-local counters for periodically aggregating composite-counter lanes OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; // Input and output device pointers KeysItr d_keys_in; // Target bits int current_bit; int num_bits; DecomposerT decomposer; //--------------------------------------------------------------------- // Helper structure for templated iteration //--------------------------------------------------------------------- // Iterate template struct Iterate { // BucketKeys static _CCCL_DEVICE _CCCL_FORCEINLINE void BucketKeys(AgentRadixSortUpsweep& cta, bit_ordered_type keys[KEYS_PER_THREAD]) { cta.Bucket(keys[COUNT]); // Next Iterate::BucketKeys(cta, keys); } }; // Terminate template struct Iterate { // BucketKeys static _CCCL_DEVICE _CCCL_FORCEINLINE void BucketKeys(AgentRadixSortUpsweep& /*cta*/, bit_ordered_type /*keys*/[KEYS_PER_THREAD]) {} }; //--------------------------------------------------------------------- // Utility methods //--------------------------------------------------------------------- _CCCL_DEVICE _CCCL_FORCEINLINE digit_extractor_t digit_extractor() { return traits::template digit_extractor(current_bit, num_bits, decomposer); } /** * Decode a key and increment corresponding smem digit counter */ _CCCL_DEVICE _CCCL_FORCEINLINE void Bucket(bit_ordered_type key) { // Perform transform op bit_ordered_type converted_key = bit_ordered_conversion::to_bit_ordered(decomposer, key); // Extract current digit bits std::uint32_t digit = digit_extractor().Digit(converted_key); // Get sub-counter offset std::uint32_t sub_counter = digit & (PACKING_RATIO - 1); // Get row offset std::uint32_t row_offset = digit >> LOG_PACKING_RATIO; // Increment counter temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; } /** * Reset composite counters */ _CCCL_DEVICE _CCCL_FORCEINLINE void ResetDigitCounters() { #pragma unroll for (int LANE = 0; LANE < COUNTER_LANES; LANE++) { temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; } } /** * Reset the unpacked counters in each thread */ _CCCL_DEVICE _CCCL_FORCEINLINE void ResetUnpackedCounters() { #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { local_counts[LANE][UNPACKED_COUNTER] = 0; } } } /** * Extracts and aggregates the digit counters for each counter lane * owned by this warp */ _CCCL_DEVICE 
_CCCL_FORCEINLINE void UnpackDigitCounts() { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { const int counter_lane = (LANE * WARPS) + warp_id; if (counter_lane < COUNTER_LANES) { #pragma unroll for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) { #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; local_counts[LANE][UNPACKED_COUNTER] += counter; } } } } } /** * Processes a single, full tile */ _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessFullTile(OffsetT block_offset) { // Tile of keys bit_ordered_type keys[KEYS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); // Prevent hoisting CTA_SYNC(); // Bucket tile of keys Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); } /** * Processes a single load (may have some threads masked off) */ _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessPartialTile(OffsetT block_offset, const OffsetT& block_end) { // Process partial tile if necessary using single loads for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS) { // Load and bucket key bit_ordered_type key = d_keys_in[block_offset + offset]; Bucket(key); } } //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** * Constructor */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentRadixSortUpsweep( TempStorage& temp_storage, const KeyT* d_keys_in, int current_bit, int num_bits, DecomposerT decomposer = {}) : temp_storage(temp_storage.Alias()) , d_keys_in(reinterpret_cast(d_keys_in)) , current_bit(current_bit) , num_bits(num_bits) , decomposer(decomposer) {} /** * Compute radix digit histograms from a segment of input tiles. 
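   * Full tiles are consumed in unrolled batches of UNROLL_COUNT tiles; after each
   * batch the packed byte-sized shared-memory counters are drained into the
   * per-thread local_counts registers (UnpackDigitCounts) and reset. Because
   * UNROLL_COUNT * KEYS_PER_THREAD <= 255, no per-thread byte counter can
   * overflow within a batch.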
*/ _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessRegion(OffsetT block_offset, const OffsetT& block_end) { // Reset digit counters in smem and unpacked counters in registers ResetDigitCounters(); ResetUnpackedCounters(); // Unroll batches of full tiles while (block_end - block_offset >= UNROLLED_ELEMENTS) { for (int i = 0; i < UNROLL_COUNT; ++i) { ProcessFullTile(block_offset); block_offset += TILE_ITEMS; } CTA_SYNC(); // Aggregate back into local_count registers to prevent overflow UnpackDigitCounts(); CTA_SYNC(); // Reset composite counters in lanes ResetDigitCounters(); } // Unroll single full tiles while (block_end - block_offset >= TILE_ITEMS) { ProcessFullTile(block_offset); block_offset += TILE_ITEMS; } // Process partial tile if necessary ProcessPartialTile(block_offset, block_end); CTA_SYNC(); // Aggregate back into local_count registers UnpackDigitCounts(); } /** * Extract counts (saving them to the external array) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT* counters, int bin_stride = 1, int bin_offset = 0) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); // Place unpacked digit counters in shared memory #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { int counter_lane = (LANE * WARPS) + warp_id; if (counter_lane < COUNTER_LANES) { int digit_row = counter_lane << LOG_PACKING_RATIO; #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { int bin_idx = digit_row + UNPACKED_COUNTER; temp_storage.block_counters[warp_tid][bin_idx] = local_counts[LANE][UNPACKED_COUNTER]; } } } CTA_SYNC(); // Rake-reduce bin_count reductions // Whole blocks #pragma unroll for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; BIN_BASE += BLOCK_THREADS) { int bin_idx = BIN_BASE + threadIdx.x; OffsetT bin_count = 0; #pragma unroll for (int i = 0; i < WARP_THREADS; ++i) { bin_count += temp_storage.block_counters[i][bin_idx]; } if (IS_DESCENDING) { bin_idx = RADIX_DIGITS - bin_idx - 1; } counters[(bin_stride * bin_idx) + bin_offset] = bin_count; } // Remainder if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) { int bin_idx = threadIdx.x; OffsetT bin_count = 0; #pragma unroll for (int i = 0; i < WARP_THREADS; ++i) { bin_count += temp_storage.block_counters[i][bin_idx]; } if (IS_DESCENDING) { bin_idx = RADIX_DIGITS - bin_idx - 1; } counters[(bin_stride * bin_idx) + bin_offset] = bin_count; } } /** * @brief Extract counts * * @param[out] bin_count * The exclusive prefix sum for the digits * [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - * 1] */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); // Place unpacked digit counters in shared memory #pragma unroll for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) { int counter_lane = (LANE * WARPS) + warp_id; if (counter_lane < COUNTER_LANES) { int digit_row = counter_lane << LOG_PACKING_RATIO; #pragma unroll for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) { int bin_idx = digit_row + UNPACKED_COUNTER; temp_storage.block_counters[warp_tid][bin_idx] = local_counts[LANE][UNPACKED_COUNTER]; } } } CTA_SYNC(); // Rake-reduce bin_count reductions #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_count[track] = 0; #pragma unroll for (int i = 0; i < WARP_THREADS; ++i) { bin_count[track] += temp_storage.block_counters[i][bin_idx]; } } } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_reduce.cuh000066400000000000000000000423071463375617100202130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::AgentReduce implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduction. 
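 *
 * A minimal sketch of how a reduction kernel drives this agent (illustrative
 * only; the policy, iterator, and accumulator names are placeholders and the
 * template argument list is abbreviated):
 *
 * @code
 * using AgentReduceT = AgentReduce<ReducePolicyT, InputIteratorT, OutputIteratorT,
 *                                  OffsetT, cub::Sum, AccumT>;
 * __shared__ typename AgentReduceT::TempStorage temp_storage;
 *
 * // Each thread block cooperatively reduces its even-share of the input
 * AccumT block_aggregate =
 *   AgentReduceT(temp_storage, d_in, cub::Sum{}).ConsumeTiles(even_share);
 *
 * if (threadIdx.x == 0)
 * {
 *   d_block_aggregates[blockIdx.x] = block_aggregate; // only thread 0 holds a valid result
 * }
 * @endcode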
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include _CCCL_SUPPRESS_DEPRECATED_PUSH #include _CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentReduce * @tparam NOMINAL_BLOCK_THREADS_4B Threads per thread block * @tparam NOMINAL_ITEMS_PER_THREAD_4B Items per thread (per tile of input) * @tparam ComputeT Dominant compute type * @tparam _VECTOR_LOAD_LENGTH Number of items per vectorized load * @tparam _BLOCK_ALGORITHM Cooperative block-wide reduction algorithm to use * @tparam _LOAD_MODIFIER Cache load modifier for reading input elements */ template > struct AgentReducePolicy : ScalingType { /// Number of items per vectorized load static constexpr int VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH; /// Cooperative block-wide reduction algorithm to use static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentReduce implements a stateful abstraction of CUDA thread blocks * for participating in device-wide reduction . * * Each thread reduces only the values it loads. If `FIRST_TILE`, this partial * reduction is stored into `thread_aggregate`. Otherwise it is accumulated * into `thread_aggregate`. 
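 *
 * For example (illustrative), with ITEMS_PER_THREAD == 4 a thread computes
 * thread_aggregate = reduce(items[0..3]) on its first tile, and
 * thread_aggregate = reduce(items[0..3], thread_aggregate) on every subsequent
 * tile, so the reduction operator never needs an identity value for the
 * per-thread pass.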
* * @tparam AgentReducePolicy * Parameterized AgentReducePolicy tuning policy type * * @tparam InputIteratorT * Random-access iterator type for input * * @tparam OutputIteratorT * Random-access iterator type for output * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOp * Binary reduction operator type having member * `auto operator()(T &&a, U &&b)` * * @tparam AccumT * The type of intermediate accumulator (according to P2322R6) */ template struct AgentReduce { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The input value type using InputT = cub::detail::value_t; /// Vector type of InputT for data movement using VectorT = typename CubVector::Type; /// Input iterator wrapper type (for applying cache modifier) // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, InputIteratorT>; /// Constants static constexpr int BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD; static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH); // Can vectorize according to the policy if the input iterator is a native // pointer to a primitive type static constexpr bool ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && (std::is_pointer::value) && Traits::PRIMITIVE; static constexpr CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; static constexpr BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; /// Parameterized BlockReduce primitive using BlockReduceT = BlockReduce; /// Shared memory type required by this thread block struct _TempStorage { typename BlockReduceT::TempStorage reduce; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage InputIteratorT d_in; ///< Input data to reduce WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce ReductionOp reduction_op; ///< Binary reduction operator TransformOp transform_op; ///< Transform operator //--------------------------------------------------------------------- // Utility //--------------------------------------------------------------------- // Whether or not the input is aligned with the vector type (specialized for // types we can vectorize) template static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAligned(Iterator d_in, Int2Type /*can_vectorize*/) { return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; } // Whether or not the input is aligned with the vector type (specialized for // types we cannot vectorize) template static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAligned(Iterator /*d_in*/, Int2Type /*can_vectorize*/) { return false; } //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @brief Constructor * @param temp_storage Reference to temp_storage * @param d_in Input data to 
reduce * @param reduction_op Binary reduction operator */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentReduce(TempStorage& temp_storage, InputIteratorT d_in, ReductionOp reduction_op, TransformOp transform_op = {}) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_wrapped_in(d_in) , reduction_op(reduction_op) , transform_op(transform_op) {} //--------------------------------------------------------------------- // Tile consumption //--------------------------------------------------------------------- /** * @brief Consume a full tile of input (non-vectorized) * @param block_offset The offset the tile to consume * @param valid_items The number of valid items in the tile * @param is_full_tile Whether or not this is a full tile * @param can_vectorize Whether or not we can vectorize loads */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile( AccumT& thread_aggregate, OffsetT block_offset, int /*valid_items*/, Int2Type /*is_full_tile*/, Int2Type /*can_vectorize*/) { AccumT items[ITEMS_PER_THREAD]; // Load items in striped fashion cub::detail::load_transform_direct_striped( threadIdx.x, d_wrapped_in + block_offset, items, transform_op); // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? internal::ThreadReduce(items, reduction_op) : internal::ThreadReduce(items, reduction_op, thread_aggregate); } /** * Consume a full tile of input (vectorized) * @param block_offset The offset the tile to consume * @param valid_items The number of valid items in the tile * @param is_full_tile Whether or not this is a full tile * @param can_vectorize Whether or not we can vectorize loads */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile( AccumT& thread_aggregate, OffsetT block_offset, int /*valid_items*/, Int2Type /*is_full_tile*/, Int2Type /*can_vectorize*/) { // Alias items as an array of VectorT and load it in striped fashion enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; // Fabricate a vectorized input iterator InputT* d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); CacheModifiedInputIterator d_vec_in( reinterpret_cast(d_in_unqualified)); // Load items as vector items InputT input_items[ITEMS_PER_THREAD]; VectorT* vec_items = reinterpret_cast(input_items); #pragma unroll for (int i = 0; i < WORDS; ++i) { vec_items[i] = d_vec_in[BLOCK_THREADS * i]; } // Convert from input type to output type AccumT items[ITEMS_PER_THREAD]; #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { items[i] = transform_op(input_items[i]); } // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? 
internal::ThreadReduce(items, reduction_op) : internal::ThreadReduce(items, reduction_op, thread_aggregate); } /** * Consume a partial tile of input * @param block_offset The offset the tile to consume * @param valid_items The number of valid items in the tile * @param is_full_tile Whether or not this is a full tile * @param can_vectorize Whether or not we can vectorize loads */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile( AccumT& thread_aggregate, OffsetT block_offset, int valid_items, Int2Type /*is_full_tile*/, Int2Type /*can_vectorize*/) { // Partial tile int thread_offset = threadIdx.x; // Read first item if ((IS_FIRST_TILE) && (thread_offset < valid_items)) { thread_aggregate = transform_op(d_wrapped_in[block_offset + thread_offset]); thread_offset += BLOCK_THREADS; } // Continue reading items (block-striped) while (thread_offset < valid_items) { InputT item(d_wrapped_in[block_offset + thread_offset]); thread_aggregate = reduction_op(thread_aggregate, transform_op(item)); thread_offset += BLOCK_THREADS; } } //--------------------------------------------------------------- // Consume a contiguous segment of tiles //--------------------------------------------------------------------- /** * @brief Reduce a contiguous segment of input tiles * @param even_share GridEvenShare descriptor * @param can_vectorize Whether or not we can vectorize loads */ template _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ConsumeRange(GridEvenShare& even_share, Int2Type can_vectorize) { AccumT thread_aggregate{}; if (even_share.block_end - even_share.block_offset < TILE_ITEMS) { // First tile isn't full (not all threads have valid items) int valid_items = even_share.block_end - even_share.block_offset; ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); } // Extracting this into a function saves 8% of generated kernel size by allowing to reuse // the block reduction below. This also workaround hang in nvcc. ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize); // Compute block-wide reduction (all threads have valid items) return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); } /** * @brief Reduce a contiguous segment of input tiles * @param[in] block_offset Threadblock begin offset (inclusive) * @param[in] block_end Threadblock end offset (exclusive) */ _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ConsumeRange(OffsetT block_offset, OffsetT block_end) { GridEvenShare even_share; even_share.template BlockInit(block_offset, block_end); return (IsAligned(d_in + block_offset, Int2Type())) ? ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); } /** * Reduce a contiguous segment of input tiles * @param[in] even_share GridEvenShare descriptor */ _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ConsumeTiles(GridEvenShare& even_share) { // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block even_share.template BlockInit(); return (IsAligned(d_in, Int2Type())) ? 
ConsumeRange(even_share, Int2Type < true && ATTEMPT_VECTORIZATION > ()) : ConsumeRange(even_share, Int2Type < false && ATTEMPT_VECTORIZATION > ()); } private: /** * @brief Reduce a contiguous segment of input tiles with more than `TILE_ITEMS` elements * @param even_share GridEvenShare descriptor * @param can_vectorize Whether or not we can vectorize loads */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeFullTileRange( AccumT& thread_aggregate, GridEvenShare& even_share, Int2Type can_vectorize) { // At least one full block ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); if (even_share.block_end - even_share.block_offset < even_share.block_stride) { // Exit early to handle offset overflow return; } even_share.block_offset += even_share.block_stride; // Consume subsequent full tiles of input, at least one full tile was processed, so // `even_share.block_end >= TILE_ITEMS` while (even_share.block_offset <= even_share.block_end - TILE_ITEMS) { ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); if (even_share.block_end - even_share.block_offset < even_share.block_stride) { // Exit early to handle offset overflow return; } even_share.block_offset += even_share.block_stride; } // Consume a partially-full tile if (even_share.block_offset < even_share.block_end) { int valid_items = even_share.block_end - even_share.block_offset; ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_reduce_by_key.cuh000066400000000000000000000553431463375617100215610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file cub::AgentReduceByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduce-value-by-key. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * @brief Parameterizable tuning policy type for AgentReduceByKey * * @tparam _BLOCK_THREADS * Threads per thread block * * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * * @tparam DelayConstructorT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template > struct AgentReduceByKeyPolicy { ///< Threads per thread block static constexpr int BLOCK_THREADS = _BLOCK_THREADS; ///< Items per thread (per tile of input) static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; ///< The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< The BlockScan algorithm to use static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; struct detail { using delay_constructor_t = DelayConstructorT; }; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentReduceByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduce-value-by-key * * @tparam AgentReduceByKeyPolicyT * Parameterized AgentReduceByKeyPolicy tuning policy type * * @tparam KeysInputIteratorT * Random-access input iterator type for keys * * @tparam UniqueOutputIteratorT * Random-access output iterator type for keys * * @tparam ValuesInputIteratorT * Random-access input iterator type for values * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam NumRunsOutputIteratorT * Output iterator type for recording number of items selected * * @tparam EqualityOpT * KeyT equality operator type * * @tparam ReductionOpT * ValueT reduction operator type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentReduceByKey { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input keys type using KeyInputT = cub::detail::value_t; // The output keys type using KeyOutputT = cub::detail::non_void_value_t; // The input values type using ValueInputT = cub::detail::value_t; // Tuple type for scanning (pairs accumulated segment-value with // segment-index) using OffsetValuePairT = KeyValuePair; // Tuple type for pairing keys and values using 
KeyValuePairT = KeyValuePair; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; // Guarded inequality functor template struct GuardedInequalityWrapper { /// Wrapped equality operator _EqualityOpT op; /// Items remaining int num_remaining; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op) , num_remaining(num_remaining) {} /// Boolean inequality operator, returns (a != b) template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(const T& a, const T& b, int idx) const { if (idx < num_remaining) { return !op(a, b); // In bounds } // Return true if first out-of-bounds item, false otherwise return (idx == num_remaining); } }; // Constants static constexpr int BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD; static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr int TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1); // Whether or not the scan operation has a zero-valued identity value (true // if we're performing addition on a primitive type) static constexpr int HAS_IDENTITY_ZERO = (std::is_same::value) && (Traits::PRIMITIVE); // Cache-modified Input iterator wrapper type (for applying cache modifier) // for keys Wrap the native input pointer with // CacheModifiedValuesInputIterator or directly use the supplied input // iterator type using WrappedKeysInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, KeysInputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) // for values Wrap the native input pointer with // CacheModifiedValuesInputIterator or directly use the supplied input // iterator type using WrappedValuesInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, ValuesInputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) // for fixup values Wrap the native input pointer with // CacheModifiedValuesInputIterator or directly use the supplied input // iterator type using WrappedFixupInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, AggregatesOutputIteratorT>; // Reduce-value-by-segment scan operator using ReduceBySegmentOpT = ReduceBySegmentOp; // Parameterized BlockLoad type for keys using BlockLoadKeysT = BlockLoad; // Parameterized BlockLoad type for values using BlockLoadValuesT = BlockLoad; // Parameterized BlockDiscontinuity type for keys using BlockDiscontinuityKeys = BlockDiscontinuity; // Parameterized BlockScan type using BlockScanT = BlockScan; // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentReduceByKeyPolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Key and value exchange types typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1]; typedef AccumT ValueExchangeT[TILE_ITEMS + 1]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { // Smem needed for tile scanning typename BlockScanT::TempStorage scan; // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for discontinuity detection typename BlockDiscontinuityKeys::TempStorage discontinuity; } scan_storage; // Smem needed for loading keys typename BlockLoadKeysT::TempStorage load_keys; // Smem needed for loading values typename BlockLoadValuesT::TempStorage load_values; // 
Smem needed for compacting key value pairs(allows non POD items in this // union) Uninitialized raw_exchange; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- /// Reference to temp_storage _TempStorage& temp_storage; /// Input keys WrappedKeysInputIteratorT d_keys_in; /// Unique output keys UniqueOutputIteratorT d_unique_out; /// Input values WrappedValuesInputIteratorT d_values_in; /// Output value aggregates AggregatesOutputIteratorT d_aggregates_out; /// Output pointer for total number of segments identified NumRunsOutputIteratorT d_num_runs_out; /// KeyT equality operator EqualityOpT equality_op; /// Reduction operator ReductionOpT reduction_op; /// Reduce-by-segment scan operator ReduceBySegmentOpT scan_op; //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @param temp_storage * Reference to temp_storage * * @param d_keys_in * Input keys * * @param d_unique_out * Unique output keys * * @param d_values_in * Input values * * @param d_aggregates_out * Output value aggregates * * @param d_num_runs_out * Output pointer for total number of segments identified * * @param equality_op * KeyT equality operator * * @param reduction_op * ValueT reduction operator */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentReduceByKey( TempStorage& temp_storage, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op) : temp_storage(temp_storage.Alias()) , d_keys_in(d_keys_in) , d_unique_out(d_unique_out) , d_values_in(d_values_in) , d_aggregates_out(d_aggregates_out) , d_num_runs_out(d_num_runs_out) , equality_op(equality_op) , reduction_op(reduction_op) , scan_op(reduction_op) {} //--------------------------------------------------------------------- // Scatter utility methods //--------------------------------------------------------------------- /** * Directly scatter flagged items to output offsets */ _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterDirect( KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], OffsetT (&segment_indices)[ITEMS_PER_THREAD]) { // Scatter flagged keys and values #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (segment_flags[ITEM]) { d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; } } } /** * 2-phase scatter flagged items to output offsets * * The exclusive scan causes each head flag to be paired with the previous * value aggregate: the scatter offsets must be decremented for value * aggregates */ _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterTwoPhase( KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], OffsetT (&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) { CTA_SYNC(); // Compact and scatter pairs #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (segment_flags[ITEM]) { temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; } } CTA_SYNC(); for (int item = threadIdx.x; item < 
num_tile_segments; item += BLOCK_THREADS) { KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; d_unique_out[num_tile_segments_prefix + item] = pair.key; d_aggregates_out[num_tile_segments_prefix + item] = pair.value; } } /** * Scatter flagged items */ _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], OffsetT (&segment_indices)[ITEMS_PER_THREAD], OffsetT num_tile_segments, OffsetT num_tile_segments_prefix) { // Do a one-phase scatter if (a) two-phase is disabled or (b) the average // number of selected items per thread is less than one if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) { ScatterTwoPhase(scatter_items, segment_flags, segment_indices, num_tile_segments, num_tile_segments_prefix); } else { ScatterDirect(scatter_items, segment_flags, segment_indices); } } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * @brief Process a tile of input (dynamic chained scan) * * @tparam IS_LAST_TILE * Whether the current tile is the last tile * * @param num_remaining * Number of global input items remaining (including this tile) * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state) { // Tile keys KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys shuffled up KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile values AccumT values[ITEMS_PER_THREAD]; // Segment head flags OffsetT head_flags[ITEMS_PER_THREAD]; // Segment indices OffsetT segment_indices[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering KeyValuePairT scatter_items[ITEMS_PER_THREAD]; // Load keys if (IS_LAST_TILE) { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); } else { BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } // Load tile predecessor key in first thread KeyOutputT tile_predecessor; if (threadIdx.x == 0) { // if (tile_idx == 0) // first tile gets repeat of first item (thus first item will not // be flagged as a head) // else // Subsequent tiles get last key from previous tile tile_predecessor = (tile_idx == 0) ? 
keys[0] : d_keys_in[tile_offset - 1]; } CTA_SYNC(); // Load values if (IS_LAST_TILE) { BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); } else { BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); } CTA_SYNC(); // Initialize head-flags and shuffle up the previous keys if (IS_LAST_TILE) { // Use custom flag operator to additionally flag the first out-of-bounds // item GuardedInequalityWrapper flag_op(equality_op, num_remaining); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) .FlagHeads(head_flags, keys, prev_keys, flag_op, tile_predecessor); } else { InequalityWrapper flag_op(equality_op); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) .FlagHeads(head_flags, keys, prev_keys, flag_op, tile_predecessor); } // Reset head-flag on the very first item to make sure we don't start a new run for data where // (key[0] == key[0]) is false (e.g., when key[0] is NaN) if (threadIdx.x == 0 && tile_idx == 0) { head_flags[0] = 0; } // Zip values and head flags #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { scan_items[ITEM].value = values[ITEM]; scan_items[ITEM].key = head_flags[ITEM]; } // Perform exclusive tile scan // Inclusive block-wide scan aggregate OffsetValuePairT block_aggregate; // Number of segments prior to this tile OffsetT num_segments_prefix; // The tile prefix folded with block_aggregate OffsetValuePairT total_aggregate; if (tile_idx == 0) { // Scan first tile BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); num_segments_prefix = 0; total_aggregate = block_aggregate; // Update tile status if there are successor tiles if ((!IS_LAST_TILE) && (threadIdx.x == 0)) { tile_state.SetInclusive(0, block_aggregate); } } else { // Scan non-first tile TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); block_aggregate = prefix_op.GetBlockAggregate(); num_segments_prefix = prefix_op.GetExclusivePrefix().key; total_aggregate = prefix_op.GetInclusivePrefix(); } // Rezip scatter items and segment indices #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { scatter_items[ITEM].key = prev_keys[ITEM]; scatter_items[ITEM].value = scan_items[ITEM].value; segment_indices[ITEM] = scan_items[ITEM].key; } // At this point, each flagged segment head has: // - The key for the previous segment // - The reduced value from the previous segment // - The segment index for the reduced value // Scatter flagged keys and values OffsetT num_tile_segments = block_aggregate.key; Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); // Last thread in last tile will output final count (and last pair, if // necessary) if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) { OffsetT num_segments = num_segments_prefix + num_tile_segments; // If the last tile is a whole tile, output the final_value if (num_remaining == TILE_ITEMS) { d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; d_aggregates_out[num_segments] = total_aggregate.value; num_segments++; } // Output the total number of items selected *d_num_runs_out = num_segments; } } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_items * Total number of input items * * @param tile_state * Global tile state descriptor * * @param start_tile * The 
starting tile for the current grid */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT num_items, ScanTileStateT& tile_state, int start_tile) { // Blocks are launched in increasing order, so just assign one tile per // block // Current tile index int tile_idx = start_tile + blockIdx.x; // Global offset for the current tile OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Remaining items (including this tile) OffsetT num_remaining = num_items - tile_offset; if (num_remaining > TILE_ITEMS) { // Not last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } else if (num_remaining > 0) { // Last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_rle.cuh000066400000000000000000001042321463375617100175220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide * run-length-encode. 
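 *
 * Only non-trivial runs (length > 1) are emitted. For example (illustrative),
 * the input sequence
 *
 *   items [ 0 0 0 1 2 3 3 ]
 *
 * yields two runs: offset 0 with length 3 (the run of 0s) and offset 5 with
 * length 2 (the run of 3s); the singleton items 1 and 2 are trivial runs and
 * are not reported.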
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentRle * * @tparam _BLOCK_THREADS * Threads per thread block * * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _STORE_WARP_TIME_SLICING * Whether or not only one warp's worth of shared memory should be allocated and time-sliced among * block-warps during any store-related data transpositions * (versus each warp having its own storage) * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * * @tparam DelayConstructorT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template > struct AgentRlePolicy { enum { /// Threads per thread block BLOCK_THREADS = _BLOCK_THREADS, /// Items per thread (per tile of input) ITEMS_PER_THREAD = _ITEMS_PER_THREAD, /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced /// among block-warps during any store-related data transpositions (versus each warp having its /// own storage) STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, }; /// The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; /// The BlockScan algorithm to use static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; struct detail { using delay_constructor_t = DelayConstructorT; }; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide * run-length-encode * * @tparam AgentRlePolicyT * Parameterized AgentRlePolicyT tuning policy type * * @tparam InputIteratorT * Random-access input iterator type for data * * @tparam OffsetsOutputIteratorT * Random-access output iterator type for offset values * * @tparam LengthsOutputIteratorT * Random-access output iterator type for length values * * @tparam EqualityOpT * T equality operator type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentRle { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The input value type using T = cub::detail::value_t; /// The lengths output value type using LengthT = cub::detail::non_void_value_t; /// Tuple type for scanning (pairs run-length and run-index) using LengthOffsetPair = KeyValuePair; /// Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; // Constants enum { WARP_THREADS = CUB_WARP_THREADS(0), BLOCK_THREADS = 
AgentRlePolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, /// Whether or not to sync after loading data SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced /// among block-warps during any store-related data transpositions (versus each warp having /// its own storage) STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, }; /** * Special operator that signals all out-of-bounds items are not equal to everything else, * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked * trivial. */ template struct OobInequalityOp { OffsetT num_remaining; EqualityOpT equality_op; _CCCL_DEVICE _CCCL_FORCEINLINE OobInequalityOp(OffsetT num_remaining, EqualityOpT equality_op) : num_remaining(num_remaining) , equality_op(equality_op) {} template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(T first, T second, Index idx) { if (!LAST_TILE || (idx < num_remaining)) { return !equality_op(first, second); } else { return true; } } }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for data // Wrap the native input pointer with CacheModifiedVLengthnputIterator // Directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, InputIteratorT>; // Parameterized BlockLoad type for data using BlockLoadT = BlockLoad; // Parameterized BlockDiscontinuity type for data using BlockDiscontinuityT = BlockDiscontinuity; // Parameterized WarpScan type using WarpScanPairs = WarpScan; // Reduce-length-by-run scan operator using ReduceBySegmentOpT = ReduceBySegmentOp; // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentRlePolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Warp exchange types using WarpExchangePairs = WarpExchange; using WarpExchangePairsStorage = cub::detail::conditional_t; using WarpExchangeOffsets = WarpExchange; using WarpExchangeLengths = WarpExchange; typedef LengthOffsetPair WarpAggregates[WARPS]; // Shared memory type for this thread block struct _TempStorage { // Aliasable storage layout union Aliasable { struct ScanStorage { // Smem needed for discontinuity detection typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for warp-synchronous scans typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for sharing warp-wide aggregates Uninitialized warp_aggregates; // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for input loading typename BlockLoadT::TempStorage load; // Aliasable layout needed for two-phase scatter union ScatterAliasable { unsigned long long align; WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; } scatter_aliasable; } aliasable; OffsetT tile_idx; // Shared tile index LengthOffsetPair tile_inclusive; // Inclusive tile prefix LengthOffsetPair tile_exclusive; // Exclusive tile prefix }; // Alias wrapper 
allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets LengthsOutputIteratorT d_lengths_out; ///< Output run lengths EqualityOpT equality_op; ///< T equality operator ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @param[in] temp_storage * Reference to temp_storage * * @param[in] d_in * Pointer to input sequence of data items * * @param[out] d_offsets_out * Pointer to output sequence of run offsets * * @param[out] d_lengths_out * Pointer to output sequence of run lengths * * @param[in] equality_op * Equality operator * * @param[in] num_items * Total number of input items */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentRle( TempStorage& temp_storage, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, EqualityOpT equality_op, OffsetT num_items) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_offsets_out(d_offsets_out) , d_lengths_out(d_lengths_out) , equality_op(equality_op) , scan_op(cub::Sum()) , num_items(num_items) {} //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- template _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeSelections( OffsetT tile_offset, OffsetT num_remaining, T (&items)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) { bool head_flags[ITEMS_PER_THREAD]; bool tail_flags[ITEMS_PER_THREAD]; OobInequalityOp inequality_op(num_remaining, equality_op); if (FIRST_TILE && LAST_TILE) { // First-and-last-tile always head-flags the first item and tail-flags the last item BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity) .FlagHeadsAndTails(head_flags, tail_flags, items, inequality_op); } else if (FIRST_TILE) { // First-tile always head-flags the first item // Get the first item from the next tile T tile_successor_item; if (threadIdx.x == BLOCK_THREADS - 1) { tile_successor_item = d_in[tile_offset + TILE_ITEMS]; } BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity) .FlagHeadsAndTails(head_flags, tail_flags, tile_successor_item, items, inequality_op); } else if (LAST_TILE) { // Last-tile always flags the last item // Get the last item from the previous tile T tile_predecessor_item; if (threadIdx.x == 0) { tile_predecessor_item = d_in[tile_offset - 1]; } BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity) .FlagHeadsAndTails(head_flags, tile_predecessor_item, tail_flags, items, inequality_op); } else { // Get the first item from the next tile T tile_successor_item; if (threadIdx.x == BLOCK_THREADS - 1) { tile_successor_item = d_in[tile_offset + TILE_ITEMS]; } // Get the last item from the previous tile T tile_predecessor_item; if (threadIdx.x == 0) { tile_predecessor_item = d_in[tile_offset - 1]; } BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity) .FlagHeadsAndTails(head_flags, 
tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); } // Zip counts and runs #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // input output // items [ 0 0 0 1 2 3 3 ] // heads [ 1 0 0 1 1 1 0 ] // tails [ 0 0 1 1 1 0 1 ] // key [ 1 0 0 0 0 1 0 ] head && !tail - heads of non-trivial (length > 1) runs // value [ 1 1 1 0 0 1 1 ] !head || !tail - elements of non-trivial runs lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); } } //--------------------------------------------------------------------- // Scan utility methods //--------------------------------------------------------------------- /** * Scan of allocations */ _CCCL_DEVICE _CCCL_FORCEINLINE void WarpScanAllocations( LengthOffsetPair& tile_aggregate, LengthOffsetPair& warp_aggregate, LengthOffsetPair& warp_exclusive_in_tile, LengthOffsetPair& thread_exclusive_in_warp, LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) { // Perform warpscans unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); int lane_id = LaneId(); LengthOffsetPair identity; identity.key = 0; identity.value = 0; LengthOffsetPair thread_inclusive; // `thread_exclusive_in_warp.key`: // number of non-trivial runs starts in previous threads // `thread_exclusive_in_warp.val`: // number of items in the last non-trivial run in previous threads // `thread_aggregate.key`: // number of non-trivial runs starts in this thread // `thread_aggregate.val`: // number of items in the last non-trivial run in this thread LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); WarpScanPairs(temp_storage.aliasable.scan_storage.warp_scan[warp_id]) .Scan(thread_aggregate, thread_inclusive, thread_exclusive_in_warp, identity, scan_op); // `thread_inclusive.key`: // number of non-trivial runs starts in this and previous warp threads // `thread_inclusive.val`: // number of items in the last non-trivial run in this or previous warp threads // Last lane in each warp shares its warp-aggregate if (lane_id == WARP_THREADS - 1) { // `temp_storage.aliasable.scan_storage.warp_aggregates[warp_id].key`: // number of non-trivial runs starts in this warp // `temp_storage.aliasable.scan_storage.warp_aggregates[warp_id].val`: // number of items in the last non-trivial run in this warp temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id] = thread_inclusive; } CTA_SYNC(); // Accumulate total selected and the warp-wide prefix // `warp_exclusive_in_tile.key`: // number of non-trivial runs starts in previous warps // `warp_exclusive_in_tile.val`: // number of items in the last non-trivial run in previous warps warp_exclusive_in_tile = identity; warp_aggregate = temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[warp_id]; // `tile_aggregate.key`: // number of non-trivial runs starts in this CTA // `tile_aggregate.val`: // number of items in the last non-trivial run in this CTA tile_aggregate = temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[0]; #pragma unroll for (int WARP = 1; WARP < WARPS; ++WARP) { if (warp_id == WARP) { warp_exclusive_in_tile = tile_aggregate; } tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.scan_storage.warp_aggregates.Alias()[WARP]); } // Ensure all threads have read warp aggregates before temp_storage is repurposed in the // subsequent scatter stage CTA_SYNC(); } 
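  // Illustrative note (added for exposition; not part of the original header):
  // a worked trace of the reduce-length-by-run pair scan performed above, using
  // the example shown in InitializeSelections. Each item contributes a
  // (key, value) pair where
  //   key   = 1 for the head of a non-trivial run, 0 otherwise
  //   value = 1 for every element of a non-trivial run, 0 otherwise
  // Scanning those pairs with the reduce-by-segment operator (the running value
  // is replaced, rather than accumulated, whenever the right-hand pair starts a
  // new run) gives, inclusively:
  //
  //   items            [  0     0     0     1     2     3     3   ]
  //   (key, value)     [ (1,1) (0,1) (0,1) (0,0) (0,0) (1,1) (0,1) ]
  //   inclusive scan   [ (1,1) (1,2) (1,3) (1,3) (1,3) (2,1) (2,2) ]
  //
  // i.e. after the last item, two non-trivial runs have started and the most
  // recent one ("3 3") currently holds two items -- exactly what
  // tile_aggregate.key / tile_aggregate.value report for the tile. This trace
  // is only a sketch of the intended semantics; the operator actually applied
  // is the agent's ReduceBySegmentOpT (scan_op).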
//--------------------------------------------------------------------- // Utility methods for scattering selections //--------------------------------------------------------------------- /** * Two-phase scatter, specialized for warp time-slicing */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterTwoPhase( OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); int lane_id = LaneId(); // Locally compact items within the warp (first warp) if (warp_id == 0) { WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]) .ScatterToStriped(lengths_and_offsets, thread_num_runs_exclusive_in_warp); } // Locally compact items within the warp (remaining warps) #pragma unroll for (int SLICE = 1; SLICE < WARPS; ++SLICE) { CTA_SYNC(); if (warp_id == SLICE) { WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]) .ScatterToStriped(lengths_and_offsets, thread_num_runs_exclusive_in_warp); } } // Global scatter #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // warp_num_runs_aggregate - number of non-trivial runs starts in current warp if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) { OffsetT item_offset = tile_num_runs_exclusive_in_global + warp_num_runs_exclusive_in_tile + (ITEM * WARP_THREADS) + lane_id; // Scatter offset d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; // Scatter length if not the first (global) length if ((ITEM != 0) || (item_offset > 0)) { d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; } } } } /** * Two-phase scatter */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterTwoPhase( OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); int lane_id = LaneId(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; LengthT run_lengths[ITEMS_PER_THREAD]; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { run_offsets[ITEM] = lengths_and_offsets[ITEM].key; run_lengths[ITEM] = lengths_and_offsets[ITEM].value; } WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]) .ScatterToStriped(run_offsets, thread_num_runs_exclusive_in_warp); WARP_SYNC(0xffffffff); WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]) .ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp); // Global scatter #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) { OffsetT item_offset = tile_num_runs_exclusive_in_global + warp_num_runs_exclusive_in_tile + (ITEM * WARP_THREADS) + lane_id; // Scatter offset d_offsets_out[item_offset] = run_offsets[ITEM]; // Scatter length if not the first (global) length if ((ITEM != 0) || (item_offset > 0)) { d_lengths_out[item_offset - 1] = run_lengths[ITEM]; } } } } /** * Direct scatter */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterDirect( OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) { OffsetT item_offset = tile_num_runs_exclusive_in_global + warp_num_runs_exclusive_in_tile + thread_num_runs_exclusive_in_warp[ITEM]; // Scatter offset d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; // Scatter length if not the first (global) length if (item_offset > 0) { d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; } } } } /** * Scatter */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( OffsetT tile_num_runs_aggregate, OffsetT tile_num_runs_exclusive_in_global, OffsetT warp_num_runs_aggregate, OffsetT warp_num_runs_exclusive_in_tile, OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) { if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) { // Direct scatter if the warp has any items if (warp_num_runs_aggregate) { ScatterDirect( tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets); } } else { // Scatter two phase ScatterTwoPhase( tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets, Int2Type()); } } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * @brief Process a tile of input (dynamic chained scan) * * @param num_items * Total number of global input items * * @param num_remaining * Number of global input items remaining (including this tile) * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param &tile_status * Global list of tile status */ template _CCCL_DEVICE _CCCL_FORCEINLINE LengthOffsetPair ConsumeTile(OffsetT num_items, OffsetT num_remaining, int tile_idx, OffsetT tile_offset, 
ScanTileStateT& tile_status) { if (tile_idx == 0) { // First tile // Load items T items[ITEMS_PER_THREAD]; if (LAST_TILE) { BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); } else { BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); } if (SYNC_AFTER_LOAD) { CTA_SYNC(); } // Set flags LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; InitializeSelections(tile_offset, num_remaining, items, lengths_and_num_runs); // Exclusive scan of lengths and runs LengthOffsetPair tile_aggregate; LengthOffsetPair warp_aggregate; LengthOffsetPair warp_exclusive_in_tile; LengthOffsetPair thread_exclusive_in_warp; WarpScanAllocations( tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp, lengths_and_num_runs); // Update tile status if this is not the last tile if (!LAST_TILE && (threadIdx.x == 0)) { tile_status.SetInclusive(0, tile_aggregate); } // Update thread_exclusive_in_warp to fold in warp run-length if (thread_exclusive_in_warp.key == 0) { // If there are no non-trivial runs starts in the previous warp threads, then // `thread_exclusive_in_warp.val` denotes the number of items in the last // non-trivial run of the previous CTA threads, so the better name for it is // `thread_exclusive_in_tile`. thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; } LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; // Downsweep scan through lengths_and_num_runs internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); // Zip #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? lengths_and_num_runs2[ITEM].key : // keep WARP_THREADS * ITEMS_PER_THREAD; // discard } OffsetT tile_num_runs_aggregate = tile_aggregate.key; OffsetT tile_num_runs_exclusive_in_global = 0; OffsetT warp_num_runs_aggregate = warp_aggregate.key; OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; // Scatter Scatter( tile_num_runs_aggregate, tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets); // Return running total (inclusive of this tile) return tile_aggregate; } else { // Not first tile // Load items T items[ITEMS_PER_THREAD]; if (LAST_TILE) { BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); } else { BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); } if (SYNC_AFTER_LOAD) { CTA_SYNC(); } // Set flags LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; InitializeSelections(tile_offset, num_remaining, items, lengths_and_num_runs); // Exclusive scan of lengths and runs LengthOffsetPair tile_aggregate; LengthOffsetPair warp_aggregate; LengthOffsetPair warp_exclusive_in_tile; LengthOffsetPair thread_exclusive_in_warp; WarpScanAllocations( tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp, lengths_and_num_runs); // First warp computes tile prefix in lane 0 TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.scan_storage.prefix, Sum(), tile_idx); unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); if (warp_id == 0) { prefix_op(tile_aggregate); if (threadIdx.x == 0) { temp_storage.tile_exclusive = prefix_op.exclusive_prefix; } } CTA_SYNC(); LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; // Update thread_exclusive_in_warp to fold in warp and tile run-lengths LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); if (thread_exclusive_in_warp.key == 0) { // If there are no non-trivial runs starts in the previous warp threads, then // `thread_exclusive_in_warp.val` denotes the number of items in the last // non-trivial run of the previous grid threads, so the better name for it is // `thread_exclusive_in_grid`. thread_exclusive_in_warp.value += thread_exclusive.value; } // Downsweep scan through lengths_and_num_runs // `lengths_and_num_runs2.key`: // number of non-trivial runs starts in previous grid threads // `lengths_and_num_runs2.val`: // number of items in the last non-trivial run in previous grid threads LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; // `lengths_and_offsets.key`: // offset to the item in the input sequence // `lengths_and_offsets.val`: // number of items in the last non-trivial run in previous grid threads LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); // Zip #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
lengths_and_num_runs2[ITEM].key : // keep WARP_THREADS * ITEMS_PER_THREAD; // discard } OffsetT tile_num_runs_aggregate = tile_aggregate.key; OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; OffsetT warp_num_runs_aggregate = warp_aggregate.key; OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; // Scatter Scatter( tile_num_runs_aggregate, tile_num_runs_exclusive_in_global, warp_num_runs_aggregate, warp_num_runs_exclusive_in_tile, thread_num_runs_exclusive_in_warp, lengths_and_offsets); // Return running total (inclusive of this tile) return prefix_op.inclusive_prefix; } } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_tiles * Total number of input tiles * * @param tile_status * Global list of tile status * * @param d_num_runs_out * Output pointer for total number of runs identified * * @tparam NumRunsIteratorT * Output iterator type for recording number of items selected */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(int num_tiles, ScanTileStateT& tile_status, NumRunsIteratorT d_num_runs_out) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) if (tile_idx < num_tiles - 1) { // Not the last tile (full) ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); } else if (num_remaining > 0) { // The last tile (possibly partially-full) LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); if (threadIdx.x == 0) { // Output the total number of items selected *d_num_runs_out = running_total.key; // The inclusive prefix contains accumulated length reduction for the last run if (running_total.key > 0) { d_lengths_out[running_total.key - 1] = running_total.value; } } } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_scan.cuh000066400000000000000000000430111463375617100176610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::AgentScan implements a stateful abstraction of CUDA thread blocks * for participating in device-wide prefix scan . */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * @brief Parameterizable tuning policy type for AgentScan * * @tparam NOMINAL_BLOCK_THREADS_4B * Threads per thread block * * @tparam NOMINAL_ITEMS_PER_THREAD_4B * Items per thread (per tile of input) * * @tparam ComputeT * Dominant compute type * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _STORE_ALGORITHM * The BlockStore algorithm to use * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * * @tparam DelayConstructorT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template , typename DelayConstructorT = detail::default_delay_constructor_t> struct AgentScanPolicy : ScalingType { static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; struct detail { using delay_constructor_t = DelayConstructorT; }; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentScan implements a stateful abstraction of CUDA thread blocks for * participating in device-wide prefix scan. 
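 *
 * A minimal usage sketch (added for exposition; `AgentScanT`, the kernel, and
 * the iterator names are hypothetical stand-ins for a concrete instantiation
 * normally produced by the dispatch layer). A scan kernel typically places the
 * agent's TempStorage in shared memory, constructs the agent, and consumes the
 * tiles assigned to its thread block:
 *
 * @code
 * // AgentScanT: some concrete cub::AgentScan<...> instantiation (assumed)
 * __shared__ typename AgentScanT::TempStorage temp_storage;
 *
 * AgentScanT agent(temp_storage, d_in, d_out, scan_op, init_value);
 * agent.ConsumeRange(num_items, tile_state, start_tile); // dynamic chained scan
 * @endcode
 *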
* @tparam AgentScanPolicyT * Parameterized AgentScanPolicyT tuning policy type * * @tparam InputIteratorT * Random-access input iterator type * * @tparam OutputIteratorT * Random-access output iterator type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * */ template struct AgentScan { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input value type using InputT = cub::detail::value_t; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; // Input iterator wrapper type (for applying cache modifier) // Wrap the native input pointer with CacheModifiedInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, InputIteratorT>; // Constants enum { // Inclusive scan if no init_value type is provided IS_INCLUSIVE = std::is_same::value, BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, }; // Parameterized BlockLoad type typedef BlockLoad BlockLoadT; // Parameterized BlockStore type typedef BlockStore BlockStoreT; // Parameterized BlockScan type typedef BlockScan BlockScanT; // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentScanPolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Stateful BlockScan prefix callback type for managing a running total while // scanning consecutive tiles typedef BlockScanRunningPrefixOp RunningPrefixCallbackOp; // Shared memory type for this thread block union _TempStorage { // Smem needed for tile loading typename BlockLoadT::TempStorage load; // Smem needed for tile storing typename BlockStoreT::TempStorage store; struct ScanStorage { // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for tile scanning typename BlockScanT::TempStorage scan; } scan_storage; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Input data OutputIteratorT d_out; ///< Output data ScanOpT scan_op; ///< Binary scan operator InitValueT init_value; ///< The init_value element for ScanOpT //--------------------------------------------------------------------- // Block scan utility methods //--------------------------------------------------------------------- /** * Exclusive scan specialization (first tile) */ _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( AccumT (&items)[ITEMS_PER_THREAD], AccumT init_value, ScanOpT scan_op, AccumT& block_aggregate, Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); block_aggregate = scan_op(init_value, block_aggregate); } /** * Inclusive scan specialization (first tile) */ _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( AccumT (&items)[ITEMS_PER_THREAD], InitValueT /*init_value*/, ScanOpT scan_op, AccumT& block_aggregate, 
Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); } /** * Exclusive scan specialization (subsequent tiles) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( AccumT (&items)[ITEMS_PER_THREAD], ScanOpT scan_op, PrefixCallback& prefix_op, Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); } /** * Inclusive scan specialization (subsequent tiles) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( AccumT (&items)[ITEMS_PER_THREAD], ScanOpT scan_op, PrefixCallback& prefix_op, Int2Type /*is_inclusive*/) { BlockScanT(temp_storage.scan_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); } //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @param temp_storage * Reference to temp_storage * * @param d_in * Input data * * @param d_out * Output data * * @param scan_op * Binary scan operator * * @param init_value * Initial value to seed the exclusive scan */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentScan( TempStorage& temp_storage, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_out(d_out) , scan_op(scan_op) , init_value(init_value) {} //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process a tile of input (dynamic chained scan) * @tparam IS_LAST_TILE * Whether the current tile is the last tile * * @param num_remaining * Number of global input items remaining (including this tile) * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state) { // Load items AccumT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last element with the first element because collectives are // not suffix guarded. 
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining, *(d_in + tile_offset)); } else { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } CTA_SYNC(); // Perform tile scan if (tile_idx == 0) { // Scan first tile AccumT block_aggregate; ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); if ((!IS_LAST_TILE) && (threadIdx.x == 0)) { tile_state.SetInclusive(0, block_aggregate); } } else { // Scan non-first tile TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); ScanTile(items, scan_op, prefix_op, Int2Type()); } CTA_SYNC(); // Store items if (IS_LAST_TILE) { BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); } else { BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); } } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_items * Total number of input items * * @param tile_state * Global tile state descriptor * * @param start_tile * The starting tile for the current grid */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT num_items, ScanTileStateT& tile_state, int start_tile) { // Blocks are launched in increasing order, so just assign one tile per // block // Current tile index int tile_idx = start_tile + blockIdx.x; // Global offset for the current tile OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Remaining items (including this tile) OffsetT num_remaining = num_items - tile_offset; if (num_remaining > TILE_ITEMS) { // Not last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } else if (num_remaining > 0) { // Last tile ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); } } //--------------------------------------------------------------------------- // Scan an sequence of consecutive tiles (independent of other thread blocks) //--------------------------------------------------------------------------- /** * @brief Process a tile of input * * @param tile_offset * Tile offset * * @param prefix_op * Running prefix operator * * @param valid_items * Number of valid items in the tile */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(OffsetT tile_offset, RunningPrefixCallbackOp& prefix_op, int valid_items = TILE_ITEMS) { // Load items AccumT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last element with the first element because collectives are // not suffix guarded. 
BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items, *(d_in + tile_offset)); } else { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); } CTA_SYNC(); // Block scan if (IS_FIRST_TILE) { AccumT block_aggregate; ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); prefix_op.running_total = block_aggregate; } else { ScanTile(items, scan_op, prefix_op, Int2Type()); } CTA_SYNC(); // Store items if (IS_LAST_TILE) { BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); } else { BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); } } /** * @brief Scan a consecutive share of input tiles * * @param[in] range_offset * Threadblock begin offset (inclusive) * * @param[in] range_end * Threadblock end offset (exclusive) */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT range_offset, OffsetT range_end) { BlockScanRunningPrefixOp prefix_op(scan_op); if (range_offset + TILE_ITEMS <= range_end) { // Consume first tile of input (full) ConsumeTile(range_offset, prefix_op); range_offset += TILE_ITEMS; // Consume subsequent full tiles of input while (range_offset + TILE_ITEMS <= range_end) { ConsumeTile(range_offset, prefix_op); range_offset += TILE_ITEMS; } // Consume a partially-full tile if (range_offset < range_end) { int valid_items = range_end - range_offset; ConsumeTile(range_offset, prefix_op, valid_items); } } else { // Consume the first tile of input (partially-full) int valid_items = range_end - range_offset; ConsumeTile(range_offset, prefix_op, valid_items); } } /** * @brief Scan a consecutive share of input tiles, seeded with the * specified prefix value * @param[in] range_offset * Threadblock begin offset (inclusive) * * @param[in] range_end * Threadblock end offset (exclusive) * * @param[in] prefix * The prefix to apply to the scan segment */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT range_offset, OffsetT range_end, AccumT prefix) { BlockScanRunningPrefixOp prefix_op(prefix, scan_op); // Consume full tiles of input while (range_offset + TILE_ITEMS <= range_end) { ConsumeTile(range_offset, prefix_op); range_offset += TILE_ITEMS; } // Consume a partially-full tile if (range_offset < range_end) { int valid_items = range_end - range_offset; ConsumeTile(range_offset, prefix_op, valid_items); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_scan_by_key.cuh000066400000000000000000000410521463375617100212260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file AgentScanByKey implements a stateful abstraction of CUDA thread blocks * for participating in device-wide prefix scan by key. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentScanByKey * * @tparam DelayConstructorT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template > struct AgentScanByKeyPolicy { static constexpr int BLOCK_THREADS = _BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; static constexpr BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; struct detail { using delay_constructor_t = DelayConstructorT; }; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentScanByKey implements a stateful abstraction of CUDA thread * blocks for participating in device-wide prefix scan by key. 
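 *
 * A minimal usage sketch (added for exposition; `AgentScanByKeyT`, the kernel,
 * and the iterator names are hypothetical stand-ins for a concrete
 * instantiation normally produced by the dispatch layer):
 *
 * @code
 * // AgentScanByKeyT: some concrete cub::AgentScanByKey<...> instantiation (assumed)
 * __shared__ typename AgentScanByKeyT::TempStorage temp_storage;
 *
 * AgentScanByKeyT agent(
 *   temp_storage, d_keys_in, d_keys_prev_in, d_values_in, d_values_out,
 *   equality_op, scan_op, init_value);
 * agent.ConsumeRange(num_items, tile_state, start_tile); // one tile per thread block
 * @endcode
 *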
* * @tparam AgentScanByKeyPolicyT * Parameterized AgentScanPolicyT tuning policy type * * @tparam KeysInputIteratorT * Random-access input iterator type * * @tparam ValuesInputIteratorT * Random-access input iterator type * * @tparam ValuesOutputIteratorT * Random-access output iterator type * * @tparam EqualityOp * Equality functor type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * */ template struct AgentScanByKey { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- using KeyT = cub::detail::value_t; using InputT = cub::detail::value_t; using SizeValuePairT = KeyValuePair; using KeyValuePairT = KeyValuePair; using ReduceBySegmentOpT = ReduceBySegmentOp; using ScanTileStateT = ReduceByKeyScanTileState; // Constants // Inclusive scan if no init_value type is provided static constexpr int IS_INCLUSIVE = std::is_same::value; static constexpr int BLOCK_THREADS = AgentScanByKeyPolicyT::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = AgentScanByKeyPolicyT::ITEMS_PER_THREAD; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; using WrappedKeysInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, KeysInputIteratorT>; using WrappedValuesInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, ValuesInputIteratorT>; using BlockLoadKeysT = BlockLoad; using BlockLoadValuesT = BlockLoad; using BlockStoreValuesT = BlockStore; using BlockDiscontinuityKeysT = BlockDiscontinuity; using DelayConstructorT = typename AgentScanByKeyPolicyT::detail::delay_constructor_t; using TilePrefixCallbackT = TilePrefixCallbackOp; using BlockScanT = BlockScan; union TempStorage_ { struct ScanStorage { typename BlockScanT::TempStorage scan; typename TilePrefixCallbackT::TempStorage prefix; typename BlockDiscontinuityKeysT::TempStorage discontinuity; } scan_storage; typename BlockLoadKeysT::TempStorage load_keys; typename BlockLoadValuesT::TempStorage load_values; typename BlockStoreValuesT::TempStorage store_values; }; struct TempStorage : cub::Uninitialized {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- TempStorage_& storage; WrappedKeysInputIteratorT d_keys_in; KeyT* d_keys_prev_in; WrappedValuesInputIteratorT d_values_in; ValuesOutputIteratorT d_values_out; InequalityWrapper inequality_op; ScanOpT scan_op; ReduceBySegmentOpT pair_scan_op; InitValueT init_value; //--------------------------------------------------------------------- // Block scan utility methods (first tile) //--------------------------------------------------------------------- // Exclusive scan specialization _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT& tile_aggregate, Int2Type /* is_inclusive */) { BlockScanT(storage.scan_storage.scan).ExclusiveScan(scan_items, scan_items, pair_scan_op, tile_aggregate); } // Inclusive scan specialization _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT& tile_aggregate, Int2Type /* is_inclusive */) { BlockScanT(storage.scan_storage.scan).InclusiveScan(scan_items, scan_items, pair_scan_op, tile_aggregate); } 
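  // Illustrative note (added for exposition; not part of the original header):
  // the ScanTile overloads above operate on zipped (segment_flag, value) pairs,
  // i.e. scan_items[i].key holds the head flag and scan_items[i].value holds
  // the value. With a Sum scan operator and
  //
  //   keys        [  a     a     b     b   ]
  //   values      [  1     2     3     4   ]
  //   head flags  [  1     0     1     0   ]
  //   scan_items  [ (1,1) (0,2) (1,3) (0,4) ]
  //
  // an inclusive pair scan with the reduce-by-segment operator (the running
  // value is replaced, rather than combined, whenever the right-hand pair
  // carries a set flag) produces
  //
  //   inclusive   [ (1,1) (1,3) (2,3) (2,7) ]
  //
  // i.e. prefix sums that restart at every new key, while the flag component
  // counts the segments started so far. This trace is only a sketch of the
  // intended semantics; the operator actually applied is pair_scan_op
  // (ReduceBySegmentOpT).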
//--------------------------------------------------------------------- // Block scan utility methods (subsequent tiles) //--------------------------------------------------------------------- // Exclusive scan specialization (with prefix from predecessors) _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT& tile_aggregate, TilePrefixCallbackT& prefix_op, Int2Type /* is_incclusive */) { BlockScanT(storage.scan_storage.scan).ExclusiveScan(scan_items, scan_items, pair_scan_op, prefix_op); tile_aggregate = prefix_op.GetBlockAggregate(); } // Inclusive scan specialization (with prefix from predecessors) _CCCL_DEVICE _CCCL_FORCEINLINE void ScanTile( SizeValuePairT (&scan_items)[ITEMS_PER_THREAD], SizeValuePairT& tile_aggregate, TilePrefixCallbackT& prefix_op, Int2Type /* is_inclusive */) { BlockScanT(storage.scan_storage.scan).InclusiveScan(scan_items, scan_items, pair_scan_op, prefix_op); tile_aggregate = prefix_op.GetBlockAggregate(); } //--------------------------------------------------------------------- // Zip utility methods //--------------------------------------------------------------------- template _CCCL_DEVICE _CCCL_FORCEINLINE void ZipValuesAndFlags( OffsetT num_remaining, AccumT (&values)[ITEMS_PER_THREAD], OffsetT (&segment_flags)[ITEMS_PER_THREAD], SizeValuePairT (&scan_items)[ITEMS_PER_THREAD]) { // Zip values and segment_flags #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set segment_flags for first out-of-bounds item, zero for others if (IS_LAST_TILE && OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining) { segment_flags[ITEM] = 1; } scan_items[ITEM].value = values[ITEM]; scan_items[ITEM].key = segment_flags[ITEM]; } } _CCCL_DEVICE _CCCL_FORCEINLINE void UnzipValues(AccumT (&values)[ITEMS_PER_THREAD], SizeValuePairT (&scan_items)[ITEMS_PER_THREAD]) { // Zip values and segment_flags #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { values[ITEM] = scan_items[ITEM].value; } } template ::value, typename std::enable_if::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void AddInitToScan(AccumT (&items)[ITEMS_PER_THREAD], OffsetT (&flags)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { items[ITEM] = flags[ITEM] ? 
init_value : scan_op(init_value, items[ITEM]); } } template ::value, typename std::enable_if::type = 0> _CCCL_DEVICE _CCCL_FORCEINLINE void AddInitToScan(AccumT (& /*items*/)[ITEMS_PER_THREAD], OffsetT (& /*flags*/)[ITEMS_PER_THREAD]) {} //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- // Process a tile of input (dynamic chained scan) // template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(OffsetT /*num_items*/, OffsetT num_remaining, int tile_idx, OffsetT tile_base, ScanTileStateT& tile_state) { // Load items KeyT keys[ITEMS_PER_THREAD]; AccumT values[ITEMS_PER_THREAD]; OffsetT segment_flags[ITEMS_PER_THREAD]; SizeValuePairT scan_items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last element with the first element // because collectives are not suffix guarded BlockLoadKeysT(storage.load_keys).Load(d_keys_in + tile_base, keys, num_remaining, *(d_keys_in + tile_base)); } else { BlockLoadKeysT(storage.load_keys).Load(d_keys_in + tile_base, keys); } CTA_SYNC(); if (IS_LAST_TILE) { // Fill last element with the first element // because collectives are not suffix guarded BlockLoadValuesT(storage.load_values) .Load(d_values_in + tile_base, values, num_remaining, *(d_values_in + tile_base)); } else { BlockLoadValuesT(storage.load_values).Load(d_values_in + tile_base, values); } CTA_SYNC(); // first tile if (tile_idx == 0) { BlockDiscontinuityKeysT(storage.scan_storage.discontinuity).FlagHeads(segment_flags, keys, inequality_op); // Zip values and segment_flags ZipValuesAndFlags(num_remaining, values, segment_flags, scan_items); // Exclusive scan of values and segment_flags SizeValuePairT tile_aggregate; ScanTile(scan_items, tile_aggregate, Int2Type()); if (threadIdx.x == 0) { if (!IS_LAST_TILE) { tile_state.SetInclusive(0, tile_aggregate); } scan_items[0].key = 0; } } else { KeyT tile_pred_key = (threadIdx.x == 0) ? 
d_keys_prev_in[tile_idx] : KeyT(); BlockDiscontinuityKeysT(storage.scan_storage.discontinuity) .FlagHeads(segment_flags, keys, inequality_op, tile_pred_key); // Zip values and segment_flags ZipValuesAndFlags(num_remaining, values, segment_flags, scan_items); SizeValuePairT tile_aggregate; TilePrefixCallbackT prefix_op(tile_state, storage.scan_storage.prefix, pair_scan_op, tile_idx); ScanTile(scan_items, tile_aggregate, prefix_op, Int2Type()); } CTA_SYNC(); UnzipValues(values, scan_items); AddInitToScan(values, segment_flags); // Store items if (IS_LAST_TILE) { BlockStoreValuesT(storage.store_values).Store(d_values_out + tile_base, values, num_remaining); } else { BlockStoreValuesT(storage.store_values).Store(d_values_out + tile_base, values); } } //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Dequeue and scan tiles of items as part of a dynamic chained scan // with Init functor _CCCL_DEVICE _CCCL_FORCEINLINE AgentScanByKey( TempStorage& storage, KeysInputIteratorT d_keys_in, KeyT* d_keys_prev_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value) : storage(storage.Alias()) , d_keys_in(d_keys_in) , d_keys_prev_in(d_keys_prev_in) , d_values_in(d_values_in) , d_values_out(d_values_out) , inequality_op(equality_op) , scan_op(scan_op) , pair_scan_op(scan_op) , init_value(init_value) {} /** * Scan tiles of items as part of a dynamic chained scan * * @param num_items * Total number of input items * * @param tile_state * Global tile state descriptor * * start_tile * The starting tile for the current grid */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT num_items, ScanTileStateT& tile_state, int start_tile) { int tile_idx = blockIdx.x; OffsetT tile_base = OffsetT(ITEMS_PER_TILE) * tile_idx; OffsetT num_remaining = num_items - tile_base; if (num_remaining > ITEMS_PER_TILE) { // Not the last tile (full) ConsumeTile(num_items, num_remaining, tile_idx, tile_base, tile_state); } else if (num_remaining > 0) { // The last tile (possibly partially-full) ConsumeTile(num_items, num_remaining, tile_idx, tile_base, tile_state); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_segment_fixup.cuh000066400000000000000000000371021463375617100216160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide * reduce-value-by-key. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * @brief Parameterizable tuning policy type for AgentSegmentFixup * * @tparam _BLOCK_THREADS * Threads per thread block * * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use */ template struct AgentSegmentFixupPolicy { enum { /// Threads per thread block BLOCK_THREADS = _BLOCK_THREADS, /// Items per thread (per tile of input) ITEMS_PER_THREAD = _ITEMS_PER_THREAD, }; /// The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; /// The BlockScan algorithm to use static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for * participating in device-wide reduce-value-by-key * * @tparam AgentSegmentFixupPolicyT * Parameterized AgentSegmentFixupPolicy tuning policy type * * @tparam PairsInputIteratorT * Random-access input iterator type for keys * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam EqualityOpT * KeyT equality operator type * * @tparam ReductionOpT * ValueT reduction operator type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentSegmentFixup { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // Data type of key-value input iterator using KeyValuePairT = cub::detail::value_t; // Value type using ValueT = typename KeyValuePairT::Value; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; // Constants enum { BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, 
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, // Whether or not do fixup using RLE + global atomics USE_ATOMIC_FIXUP = (std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value), // Whether or not the scan operation has a zero-valued identity value // (true if we're performing addition on a primitive type) HAS_IDENTITY_ZERO = (std::is_same::value) && (Traits::PRIMITIVE), }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedPairsInputIteratorT = cub::detail::conditional_t< std::is_pointer::value, CacheModifiedInputIterator, PairsInputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedFixupInputIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, AggregatesOutputIteratorT>; // Reduce-value-by-segment scan operator using ReduceBySegmentOpT = ReduceByKeyOp; // Parameterized BlockLoad type for pairs using BlockLoadPairs = BlockLoad; // Parameterized BlockScan type using BlockScanT = BlockScan; // Callback type for obtaining tile prefix during block scan using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { // Smem needed for tile scanning typename BlockScanT::TempStorage scan; // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for loading keys typename BlockLoadPairs::TempStorage load_pairs; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedPairsInputIteratorT d_pairs_in; ///< Input keys AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values InequalityWrapper inequality_op; ///< KeyT inequality operator ReductionOpT reduction_op; ///< Reduction operator ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @param temp_storage * Reference to temp_storage * * @param d_pairs_in * Input keys * * @param d_aggregates_out * Output value aggregates * * @param equality_op * KeyT equality operator * * @param reduction_op * ValueT reduction operator */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentSegmentFixup( TempStorage& temp_storage, PairsInputIteratorT d_pairs_in, AggregatesOutputIteratorT d_aggregates_out, EqualityOpT equality_op, ReductionOpT reduction_op) : temp_storage(temp_storage.Alias()) , d_pairs_in(d_pairs_in) , d_aggregates_out(d_aggregates_out) , d_fixup_in(d_aggregates_out) , inequality_op(equality_op) , reduction_op(reduction_op) , scan_op(reduction_op) {} //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * @brief Process input tile. 
Specialized for atomic-fixup * * @param num_remaining * Number of global input items remaining (including this tile) * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor * * @param use_atomic_fixup * Marker whether to use atomicAdd (instead of reduce-by-key) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile( OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state, Int2Type use_atomic_fixup) { KeyValuePairT pairs[ITEMS_PER_THREAD]; // Load pairs KeyValuePairT oob_pair; oob_pair.key = -1; if (IS_LAST_TILE) { BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); } else { BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); } // RLE #pragma unroll for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) { ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; if (pairs[ITEM].key != pairs[ITEM - 1].key) { atomicAdd(d_scatter, pairs[ITEM - 1].value); } else { pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); } } // Flush last item if valid ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) { atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); } } /** * @brief Process input tile. Specialized for reduce-by-key fixup * * @param num_remaining * Number of global input items remaining (including this tile) * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor * * @param use_atomic_fixup * Marker whether to use atomicAdd (instead of reduce-by-key) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile( OffsetT num_remaining, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state, Int2Type use_atomic_fixup) { KeyValuePairT pairs[ITEMS_PER_THREAD]; KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; // Load pairs KeyValuePairT oob_pair; oob_pair.key = -1; if (IS_LAST_TILE) { BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); } else { BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); } CTA_SYNC(); KeyValuePairT tile_aggregate; if (tile_idx == 0) { // Exclusive scan of values and segment_flags BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); // Update tile status if this is not the last tile if (threadIdx.x == 0) { // Set first segment id to not trigger a flush (invalid from exclusive scan) scatter_pairs[0].key = pairs[0].key; if (!IS_LAST_TILE) { tile_state.SetInclusive(0, tile_aggregate); } } } else { // Exclusive scan of values and segment_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); tile_aggregate = prefix_op.GetBlockAggregate(); } // Scatter updated values #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (scatter_pairs[ITEM].key != pairs[ITEM].key) { // Update the value at the key location ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; value = reduction_op(value, scatter_pairs[ITEM].value); d_aggregates_out[scatter_pairs[ITEM].key] = value; } } // Finalize the last item if (IS_LAST_TILE) { // Last thread will output final count and last item, if necessary if (threadIdx.x == BLOCK_THREADS - 1) { // If the last tile 
is a whole tile, the inclusive prefix contains accumulated value reduction for the last // segment if (num_remaining == TILE_ITEMS) { // Update the value at the key location OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); } } } } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_items * Total number of input items * * @param num_tiles * Total number of input tiles * * @param tile_state * Global tile state descriptor */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT num_items, int num_tiles, ScanTileStateT& tile_state) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) if (num_remaining > TILE_ITEMS) { // Not the last tile (full) ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); } else if (num_remaining > 0) { // The last tile (possibly partially-full) ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_segmented_radix_sort.cuh000066400000000000000000000224061463375617100231530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * This agent will be implementing the `DeviceSegmentedRadixSort` when the * https://github.com/NVIDIA/cub/issues/383 is addressed. * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam SegmentedPolicyT * Chained tuning policy * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentSegmentedRadixSort { OffsetT num_items; static constexpr int ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD; static constexpr int BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS; static constexpr int RADIX_BITS = SegmentedPolicyT::RADIX_BITS; static constexpr int RADIX_DIGITS = 1 << RADIX_BITS; static constexpr int KEYS_ONLY = std::is_same::value; using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; // Huge segment handlers using BlockUpsweepT = AgentRadixSortUpsweep; using DigitScanT = BlockScan; using BlockDownsweepT = AgentRadixSortDownsweep; /// Number of bin-starting offsets tracked per thread static constexpr int BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD; // Small segment handlers using BlockRadixSortT = BlockRadixSort; using BlockKeyLoadT = BlockLoad; using BlockValueLoadT = BlockLoad; union _TempStorage { // Huge segment handlers typename BlockUpsweepT::TempStorage upsweep; typename BlockDownsweepT::TempStorage downsweep; struct UnboundBlockSort { OffsetT reverse_counts_in[RADIX_DIGITS]; OffsetT reverse_counts_out[RADIX_DIGITS]; typename DigitScanT::TempStorage scan; } unbound_sort; // Small segment handlers typename BlockKeyLoadT::TempStorage keys_load; typename BlockValueLoadT::TempStorage values_load; typename BlockRadixSortT::TempStorage sort; }; using TempStorage = Uninitialized<_TempStorage>; _TempStorage& temp_storage; DecomposerT decomposer; _CCCL_DEVICE _CCCL_FORCEINLINE AgentSegmentedRadixSort(OffsetT num_items, TempStorage& temp_storage, DecomposerT decomposer = {}) : num_items(num_items) , temp_storage(temp_storage.Alias()) , decomposer(decomposer) {} _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessSinglePass( int begin_bit, int end_bit, const KeyT* d_keys_in, const ValueT* d_values_in, KeyT* d_keys_out, ValueT* d_values_out) { KeyT thread_keys[ITEMS_PER_THREAD]; ValueT thread_values[ITEMS_PER_THREAD]; // For FP64 the difference is: // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b bit_ordered_type default_key_bits = IS_DESCENDING ? 
traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer); KeyT oob_default = reinterpret_cast(default_key_bits); if (!KEYS_ONLY) { BlockValueLoadT(temp_storage.values_load).Load(d_values_in, thread_values, num_items); CTA_SYNC(); } { BlockKeyLoadT(temp_storage.keys_load).Load(d_keys_in, thread_keys, num_items, oob_default); CTA_SYNC(); } BlockRadixSortT(temp_storage.sort) .SortBlockedToStriped( thread_keys, thread_values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); cub::StoreDirectStriped(threadIdx.x, d_keys_out, thread_keys, num_items); if (!KEYS_ONLY) { cub::StoreDirectStriped(threadIdx.x, d_values_out, thread_values, num_items); } } _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessIterative( int current_bit, int pass_bits, const KeyT* d_keys_in, const ValueT* d_values_in, KeyT* d_keys_out, ValueT* d_values_out) { // Upsweep BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(OffsetT{}, num_items); CTA_SYNC(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); CTA_SYNC(); if (IS_DESCENDING) { // Reverse bin counts #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { temp_storage.unbound_sort.reverse_counts_in[bin_idx] = bin_count[track]; } } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_count[track] = temp_storage.unbound_sort.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; } } } // Scan // The global scatter base offset for each digit value in this pass // (valid in the first RADIX_DIGITS threads) OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; DigitScanT(temp_storage.unbound_sort.scan).ExclusiveSum(bin_count, bin_offset); if (IS_DESCENDING) { // Reverse bin offsets #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { temp_storage.unbound_sort.reverse_counts_out[threadIdx.x] = bin_offset[track]; } } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_offset[track] = temp_storage.unbound_sort.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; } } } CTA_SYNC(); // Downsweep BlockDownsweepT downsweep( temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits, decomposer); downsweep.ProcessRegion(OffsetT{}, num_items); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_select_if.cuh000066400000000000000000000754231463375617100207060ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
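// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): the segmented
// radix sort agent above performs per-segment sorting of the kind exposed by
// cub::DeviceSegmentedRadixSort (which, per the note in that file, the agent
// is slated to back). A typical host-side keys-only call follows the usual
// CUB two-phase pattern; buffer names below are hypothetical and error
// checking is omitted.
#include <cub/device/device_segmented_radix_sort.cuh>

inline void segmented_radix_sort_example(
  const int* d_keys_in, int* d_keys_out, int num_items, int num_segments, const int* d_offsets)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  // First call sizes the temporary storage; second call performs the sort.
  cub::DeviceSegmentedRadixSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_offsets, d_offsets + 1);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSegmentedRadixSort::SortKeys(
    d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_offsets, d_offsets + 1);
  cudaFree(d_temp_storage);
}
// ---------------------------------------------------------------------------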
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentSelectIf * * @tparam _BLOCK_THREADS * Threads per thread block * * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * * @tparam DelayConstructorT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. 
*/ template > struct AgentSelectIfPolicy { enum { /// Threads per thread block BLOCK_THREADS = _BLOCK_THREADS, /// Items per thread (per tile of input) ITEMS_PER_THREAD = _ITEMS_PER_THREAD, }; /// The BlockLoad algorithm to use static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; /// Cache load modifier for reading input elements static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; /// The BlockScan algorithm to use static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; struct detail { using delay_constructor_t = DelayConstructorT; }; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ namespace detail { template struct partition_distinct_output_t { using selected_iterator_t = SelectedOutputItT; using rejected_iterator_t = RejectedOutputItT; selected_iterator_t selected_it; rejected_iterator_t rejected_it; }; } // namespace detail /** * @brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in * device-wide selection * * Performs functor-based selection if SelectOpT functor type != NullType * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * * @tparam AgentSelectIfPolicyT * Parameterized AgentSelectIfPolicy tuning policy type * * @tparam InputIteratorT * Random-access input iterator type for selection items * * @tparam FlagsInputIteratorT * Random-access input iterator type for selections (NullType* if a selection functor or * discontinuity flagging is to be used for selection) * * @tparam OutputIteratorWrapperT * Either a random-access iterator or an instance of the `partition_distinct_output_t` template. * * @tparam SelectOpT * Selection operator type (NullType if selections or discontinuity flagging is to be used for * selection) * * @tparam EqualityOpT * Equality operator type (NullType if selection functor or selections is to be used for * selection) * * @tparam OffsetT * Signed integer type for global offsets * * @tparam KEEP_REJECTS * Whether or not we push rejected items to the back of the output */ template struct AgentSelectIf { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input value type using InputT = cub::detail::value_t; // The flag value type using FlagT = cub::detail::value_t; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; // Constants enum { USE_SELECT_OP, USE_SELECT_FLAGS, USE_DISCONTINUITY, USE_STENCIL_WITH_OP }; static constexpr ::cuda::std::int32_t BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS; static constexpr ::cuda::std::int32_t ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD; static constexpr ::cuda::std::int32_t TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; static constexpr bool TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1); static constexpr bool has_select_op = (!::cuda::std::is_same::value); static constexpr bool has_flags_it = (!::cuda::std::is_same::value); static constexpr bool use_stencil_with_op = has_select_op && has_flags_it; static constexpr auto SELECT_METHOD = use_stencil_with_op ? USE_STENCIL_WITH_OP : has_select_op ? USE_SELECT_OP : has_flags_it ? 
USE_SELECT_FLAGS : USE_DISCONTINUITY; // Cache-modified Input iterator wrapper type (for applying cache modifier) for items // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedInputIteratorT = cub::detail::conditional_t<::cuda::std::is_pointer::value, CacheModifiedInputIterator, InputIteratorT>; // Cache-modified Input iterator wrapper type (for applying cache modifier) for values // Wrap the native input pointer with CacheModifiedValuesInputIterator // or directly use the supplied input iterator type using WrappedFlagsInputIteratorT = cub::detail::conditional_t<::cuda::std::is_pointer::value, CacheModifiedInputIterator, FlagsInputIteratorT>; // Parameterized BlockLoad type for input data using BlockLoadT = BlockLoad; // Parameterized BlockLoad type for flags using BlockLoadFlags = BlockLoad; // Parameterized BlockDiscontinuity type for items using BlockDiscontinuityT = BlockDiscontinuity; // Parameterized BlockScan type using BlockScanT = BlockScan; // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename AgentSelectIfPolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = TilePrefixCallbackOp; // Item exchange type typedef InputT ItemExchangeT[TILE_ITEMS]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { // Smem needed for tile scanning typename BlockScanT::TempStorage scan; // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for discontinuity detection typename BlockDiscontinuityT::TempStorage discontinuity; } scan_storage; // Smem needed for loading items typename BlockLoadT::TempStorage load_items; // Smem needed for loading values typename BlockLoadFlags::TempStorage load_flags; // Smem needed for compacting items (allows non POD items in this union) Uninitialized raw_exchange; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Input items OutputIteratorWrapperT d_selected_out; ///< Unique output items WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) InequalityWrapper inequality_op; ///< T inequality operator SelectOpT select_op; ///< Selection operator OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- /** * @param temp_storage * Reference to temp_storage * * @param d_in * Input data * * @param d_flags_in * Input selection flags (if applicable) * * @param d_selected_out * Output data * * @param select_op * Selection operator * * @param equality_op * Equality operator * * @param num_items * Total number of input items */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentSelectIf( TempStorage& temp_storage, InputIteratorT d_in, FlagsInputIteratorT d_flags_in, OutputIteratorWrapperT d_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_selected_out(d_selected_out) , d_flags_in(d_flags_in) , inequality_op(equality_op) , select_op(select_op) , num_items(num_items) {} 
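// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): this agent is
// the per-tile implementation behind cub::DeviceSelect::If,
// cub::DeviceSelect::Flagged, cub::DeviceSelect::Unique and
// cub::DevicePartition. A minimal host-side use of the functor-based path is
// shown below; buffer names are hypothetical and error checking is omitted.
#include <cub/device/device_select.cuh>

struct IsPositiveExample
{
  __host__ __device__ bool operator()(int x) const
  {
    return x > 0;
  }
};

inline void select_positive_example(const int* d_in, int* d_out, int* d_num_selected_out, int num_items)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  // First call sizes the temporary storage; second call performs the selection.
  cub::DeviceSelect::If(
    d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, IsPositiveExample{});
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSelect::If(
    d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, IsPositiveExample{});
  cudaFree(d_temp_storage);
}
// ---------------------------------------------------------------------------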
//--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- /** * Initialize selections (specialized for selection operator) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeSelections( OffsetT /*tile_offset*/, OffsetT num_tile_items, InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Out-of-bounds items are selection_flags selection_flags[ITEM] = 1; if (!IS_LAST_TILE || (static_cast(threadIdx.x * ITEMS_PER_THREAD + ITEM) < num_tile_items)) { selection_flags[ITEM] = static_cast(select_op(items[ITEM])); } } } /** * Initialize selections (specialized for selection_op applied to d_flags_in) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeSelections( OffsetT tile_offset, OffsetT num_tile_items, InputT (& /*items*/)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { CTA_SYNC(); FlagT flags[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Initialize the out-of-bounds flags #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { selection_flags[ITEM] = true; } // Guarded loads BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items); } else { BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); } #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set selection_flags for out-of-bounds items if ((!IS_LAST_TILE) || (static_cast(threadIdx.x * ITEMS_PER_THREAD + ITEM) < num_tile_items)) { selection_flags[ITEM] = static_cast(select_op(flags[ITEM])); } } } /** * Initialize selections (specialized for valid flags) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeSelections( OffsetT tile_offset, OffsetT num_tile_items, InputT (& /*items*/)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { CTA_SYNC(); FlagT flags[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Out-of-bounds items are selection_flags BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); } else { BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); } // Convert flag type to selection_flags type #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { selection_flags[ITEM] = static_cast(flags[ITEM]); } } /** * Initialize selections (specialized for discontinuity detection) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeSelections( OffsetT tile_offset, OffsetT num_tile_items, InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], Int2Type /*select_method*/) { if (IS_FIRST_TILE) { CTA_SYNC(); // Set head selection_flags. 
First tile sets the first flag for the first item BlockDiscontinuityT(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); } else { InputT tile_predecessor; if (threadIdx.x == 0) { tile_predecessor = d_in[tile_offset - 1]; } CTA_SYNC(); BlockDiscontinuityT(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, items, inequality_op, tile_predecessor); } // Set selection flags for out-of-bounds items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set selection_flags for out-of-bounds items if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) { selection_flags[ITEM] = 1; } } } //--------------------------------------------------------------------- // Scatter utility methods //--------------------------------------------------------------------- /** * Scatter flagged items to output offsets (specialized for direct scattering). */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterSelectedDirect( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], OffsetT num_selections) { // Scatter flagged items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (selection_flags[ITEM]) { if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) { d_selected_out[selection_indices[ITEM]] = items[ITEM]; } } } } /** * @brief Scatter flagged items to output offsets (specialized for two-phase scattering) * * @param num_tile_items * Number of valid items in this tile * * @param num_tile_selections * Number of selections in this tile * * @param num_selections_prefix * Total number of selections prior to this tile * * @param num_rejected_prefix * Total number of rejections prior to this tile * * @param is_keep_rejects * Marker type indicating whether to keep rejected items in the second partition */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterSelectedTwoPhase( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int num_tile_selections, OffsetT num_selections_prefix) { CTA_SYNC(); // Compact and scatter items #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; if (selection_flags[ITEM]) { temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; } } CTA_SYNC(); for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; } } /** * @brief Scatter flagged items. 
Specialized for selection algorithm that simply discards rejected items * * @param num_tile_items * Number of valid items in this tile * * @param num_tile_selections * Number of selections in this tile * * @param num_selections_prefix * Total number of selections prior to this tile * * @param num_rejected_prefix * Total number of rejections prior to this tile * * @param num_selections * Total number of selections including this tile */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int num_tile_items, int num_tile_selections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, OffsetT num_selections, Int2Type /*is_keep_rejects*/) { // Do a two-phase scatter if two-phase is enabled and the average number of selection_flags items per thread is // greater than one if (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)) { ScatterSelectedTwoPhase( items, selection_flags, selection_indices, num_tile_selections, num_selections_prefix); } else { ScatterSelectedDirect(items, selection_flags, selection_indices, num_selections); } } /** * @brief Scatter flagged items. Specialized for partitioning algorithm that writes rejected items to a second * partition. * * @param num_tile_items * Number of valid items in this tile * * @param num_tile_selections * Number of selections in this tile * * @param num_selections_prefix * Total number of selections prior to this tile * * @param num_rejected_prefix * Total number of rejections prior to this tile * * @param is_keep_rejects * Marker type indicating whether to keep rejected items in the second partition */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int num_tile_items, int num_tile_selections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, OffsetT num_selections, Int2Type /*is_keep_rejects*/) { CTA_SYNC(); int tile_num_rejections = num_tile_items - num_tile_selections; // Scatter items to shared memory (rejections first) #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; int local_rejection_idx = item_idx - local_selection_idx; int local_scatter_offset = (selection_flags[ITEM]) ? tile_num_rejections + local_selection_idx : local_rejection_idx; temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; } // Ensure all threads finished scattering to shared memory CTA_SYNC(); // Gather items from shared memory and scatter to global ScatterPartitionsToGlobal( num_tile_items, tile_num_rejections, num_selections_prefix, num_rejected_prefix, d_selected_out); } /** * @brief Second phase of scattering partitioned items to global memory. Specialized for partitioning to two * distinct partitions. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, detail::partition_distinct_output_t partitioned_out_it_wrapper) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; int rejection_idx = item_idx; int selection_idx = item_idx - tile_num_rejections; OffsetT scatter_offset = (item_idx < tile_num_rejections) ? 
num_rejected_prefix + rejection_idx : num_selections_prefix + selection_idx; InputT item = temp_storage.raw_exchange.Alias()[item_idx]; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { if (item_idx >= tile_num_rejections) { partitioned_out_it_wrapper.selected_it[scatter_offset] = item; } else { partitioned_out_it_wrapper.rejected_it[scatter_offset] = item; } } } } /** * @brief Second phase of scattering partitioned items to global memory. Specialized for partitioning to a single * iterator, where selected items are written in order from the beginning of the itereator and rejected items are * writtem from the iterators end backwards. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterPartitionsToGlobal( int num_tile_items, int tile_num_rejections, OffsetT num_selections_prefix, OffsetT num_rejected_prefix, PartitionedOutputItT partitioned_out_it) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; int rejection_idx = item_idx; int selection_idx = item_idx - tile_num_rejections; OffsetT scatter_offset = (item_idx < tile_num_rejections) ? num_items - num_rejected_prefix - rejection_idx - 1 : num_selections_prefix + selection_idx; InputT item = temp_storage.raw_exchange.Alias()[item_idx]; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { partitioned_out_it[scatter_offset] = item; } } } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * @brief Process first tile of input (dynamic chained scan). * * @param num_tile_items * Number of input items comprising this tile * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor * * @return The running count of selections (including this tile) */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT ConsumeFirstTile(int num_tile_items, OffsetT tile_offset, ScanTileStateT& tile_state) { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); } else { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); } // Initialize selection_flags InitializeSelections( tile_offset, num_tile_items, items, selection_flags, Int2Type()); CTA_SYNC(); // Exclusive scan of selection_flags OffsetT num_tile_selections; BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); if (threadIdx.x == 0) { // Update tile status if this is not the last tile if (!IS_LAST_TILE) { tile_state.SetInclusive(0, num_tile_selections); } } // Discount any out-of-bounds selections if (IS_LAST_TILE) { num_tile_selections -= (TILE_ITEMS - num_tile_items); } // Scatter flagged items Scatter( items, selection_flags, selection_indices, num_tile_items, num_tile_selections, 0, 0, num_tile_selections, cub::Int2Type{}); return num_tile_selections; } /** * @brief Process subsequent tile of input (dynamic chained scan). 
* * @param num_tile_items * Number of input items comprising this tile * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor * * @return The running count of selections (including this tile) */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT ConsumeSubsequentTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state) { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); } else { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); } // Initialize selection_flags InitializeSelections( tile_offset, num_tile_items, items, selection_flags, Int2Type()); CTA_SYNC(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx); BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); OffsetT num_selections = prefix_op.GetInclusivePrefix(); OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); OffsetT num_rejected_prefix = tile_offset - num_selections_prefix; // Discount any out-of-bounds selections if (IS_LAST_TILE) { int num_discount = TILE_ITEMS - num_tile_items; num_selections -= num_discount; num_tile_selections -= num_discount; } // Scatter flagged items Scatter( items, selection_flags, selection_indices, num_tile_items, num_tile_selections, num_selections_prefix, num_rejected_prefix, num_selections, cub::Int2Type{}); return num_selections; } /** * @brief Process a tile of input * * @param num_tile_items * Number of input items comprising this tile * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state) { OffsetT num_selections; if (tile_idx == 0) { num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); } else { num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); } return num_selections; } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_tiles * Total number of input tiles * * @param tile_state * Global tile state descriptor * * @param d_num_selected_out * Output total number selection_flags * * @tparam NumSelectedIteratorT * Output iterator type for recording number of items selection_flags */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(int num_tiles, ScanTileStateT& tile_state, NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index OffsetT tile_offset = static_cast(tile_idx) * static_cast(TILE_ITEMS); if (tile_idx < num_tiles - 1) { // Not the last tile (full) ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); } else { // The last tile (possibly partially-full) OffsetT num_remaining = num_items - tile_offset; OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); if (threadIdx.x == 0) { // Output the total number of items selection_flags *d_num_selected_out = num_selections; } } } }; 
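// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): when the agent
// above runs in KEEP_REJECTS mode with a single output iterator (the
// cub::DevicePartition::If path), selected items are written in order at the
// front of the output while rejected items are written in reverse order at
// the back, as described in ScatterPartitionsToGlobal. The host-side call
// below exercises that path; names are hypothetical and error checking is
// omitted.
#include <cub/device/device_partition.cuh>

struct LessThanTenExample
{
  __host__ __device__ bool operator()(int x) const
  {
    return x < 10;
  }
};

inline void partition_example(const int* d_in, int* d_out, int* d_num_selected_out, int num_items)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DevicePartition::If(
    d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, LessThanTenExample{});
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DevicePartition::If(
    d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, LessThanTenExample{});
  // On completion, d_out[0 .. *d_num_selected_out) holds the selected items in
  // their original order; the remaining tail holds the rejected items in
  // reverse of their original order.
  cudaFree(d_temp_storage);
}
// ---------------------------------------------------------------------------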
CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_spmv_orig.cuh000066400000000000000000000602041463375617100207450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy ******************************************************************************/ /** * @param Parameterizable tuning policy type for AgentSpmv * * @tparam _BLOCK_THREADS * Threads per thread block * * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * * @tparam _ROW_OFFSETS_SEARCH_LOAD_MODIFIER * Cache load modifier for reading CSR row-offsets during search * * @tparam _ROW_OFFSETS_LOAD_MODIFIER * Cache load modifier for reading CSR row-offsets * * @tparam _COLUMN_INDICES_LOAD_MODIFIER * Cache load modifier for reading CSR column-indices * * @tparam _VALUES_LOAD_MODIFIER * Cache load modifier for reading CSR values * * @tparam _VECTOR_VALUES_LOAD_MODIFIER * Cache load modifier for reading vector values * * @tparam _DIRECT_LOAD_NONZEROS * Whether to load nonzeros directly from global during sequential merging (vs. 
pre-staged through * shared memory) * * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use */ template struct AgentSpmvPolicy { enum { /// Threads per thread block BLOCK_THREADS = _BLOCK_THREADS, /// Items per thread (per tile of input) ITEMS_PER_THREAD = _ITEMS_PER_THREAD, /// Whether to load nonzeros directly from global during sequential merging (pre-staged through /// shared memory) DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, }; /// Cache load modifier for reading CSR row-offsets static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; /// Cache load modifier for reading CSR row-offsets static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; /// Cache load modifier for reading CSR column-indices static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; /// Cache load modifier for reading CSR values static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; /// Cache load modifier for reading vector values static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; /// The BlockScan algorithm to use static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @tparam ValueT * Matrix and vector value type * * @tparam OffsetT * Signed integer type for sequence offsets */ template struct SpmvParams { /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix /// A. const ValueT* d_values; /// Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices /// and \p d_values const OffsetT* d_row_end_offsets; /// Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements /// of matrix A. (Indices are zero-valued.) const OffsetT* d_column_indices; /// Pointer to the array of \p num_cols values corresponding to the dense input vector x const ValueT* d_vector_x; /// Pointer to the array of \p num_rows values corresponding to the dense output vector y ValueT* d_vector_y; /// Number of rows of matrix A. int num_rows; /// Number of columns of matrix A. int num_cols; /// Number of nonzero elements of matrix A. int num_nonzeros; /// Alpha multiplicand ValueT alpha; /// Beta addend-multiplicand ValueT beta; }; /** * @brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. 
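// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): SpmvParams
// above bundles the CSR matrix and dense vectors consumed by this agent; the
// corresponding public entry point is cub::DeviceSpmv::CsrMV, which computes
// y = A * x. A minimal host-side call is sketched below; buffer names are
// hypothetical and error checking is omitted.
#include <cub/device/device_spmv.cuh>

inline void spmv_example(
  const float* d_values,        // num_nonzeros CSR values
  const int* d_row_offsets,     // num_rows + 1 CSR row offsets
  const int* d_column_indices,  // num_nonzeros CSR column indices
  const float* d_vector_x,
  float* d_vector_y,
  int num_rows,
  int num_cols,
  int num_nonzeros)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  // First call sizes the temporary storage; second call computes y = A * x.
  cub::DeviceSpmv::CsrMV(
    d_temp_storage, temp_storage_bytes, d_values, d_row_offsets, d_column_indices,
    d_vector_x, d_vector_y, num_rows, num_cols, num_nonzeros);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSpmv::CsrMV(
    d_temp_storage, temp_storage_bytes, d_values, d_row_offsets, d_column_indices,
    d_vector_x, d_vector_y, num_rows, num_cols, num_nonzeros);
  cudaFree(d_temp_storage);
}
// ---------------------------------------------------------------------------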
* * @tparam AgentSpmvPolicyT * Parameterized AgentSpmvPolicy tuning policy type * * @tparam ValueT * Matrix and vector value type * * @tparam OffsetT * Signed integer type for sequence offsets * * @tparam HAS_ALPHA * Whether the input parameter \p alpha is 1 * * @tparam HAS_BETA * Whether the input parameter \p beta is 0 * * @tparam LEGACY_PTX_ARCH * PTX compute capability (unused) */ template struct AgentSpmv { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// Constants enum { BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, }; /// 2D merge path coordinate type typedef typename CubVector::Type CoordinateT; /// Input iterator wrapper types (for applying cache modifiers) typedef CacheModifiedInputIterator RowOffsetsSearchIteratorT; typedef CacheModifiedInputIterator RowOffsetsIteratorT; typedef CacheModifiedInputIterator ColumnIndicesIteratorT; typedef CacheModifiedInputIterator ValueIteratorT; typedef CacheModifiedInputIterator VectorValueIteratorT; // Tuple type for scanning (pairs accumulated segment-value with segment-index) typedef KeyValuePair KeyValuePairT; // Reduce-value-by-segment scan operator typedef ReduceByKeyOp ReduceBySegmentOpT; // BlockReduce specialization typedef BlockReduce BlockReduceT; // BlockScan specialization typedef BlockScan BlockScanT; // BlockScan specialization typedef BlockScan BlockPrefixSumT; // BlockExchange specialization typedef BlockExchange BlockExchangeT; /// Merge item type (either a non-zero value or a row-end offset) union MergeItem { // Value type to pair with index type OffsetT // (NullType if loading values directly during merge) using MergeValueT = cub::detail::conditional_t; OffsetT row_end_offset; MergeValueT nonzero; }; /// Shared memory type required by this thread block struct _TempStorage { CoordinateT tile_coords[2]; union Aliasable { // Smem needed for tile of merge items MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; // Smem needed for block exchange typename BlockExchangeT::TempStorage exchange; // Smem needed for block-wide reduction typename BlockReduceT::TempStorage reduce; // Smem needed for tile scanning typename BlockScanT::TempStorage scan; // Smem needed for tile prefix sum typename BlockPrefixSumT::TempStorage prefix_sum; } aliasable; }; /// Temporary storage type (unionable) struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- /// Reference to temp_storage _TempStorage& temp_storage; SpmvParams& spmv_params; /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements /// of matrix A. ValueIteratorT wd_values; /// Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p /// d_column_indices and \p d_values RowOffsetsIteratorT wd_row_end_offsets; /// Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero /// elements of matrix A. (Indices are zero-valued.) 
ColumnIndicesIteratorT wd_column_indices; /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector /// x VectorValueIteratorT wd_vector_x; /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector /// x VectorValueIteratorT wd_vector_y; //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** * @param temp_storage * Reference to temp_storage * * @param spmv_params * SpMV input parameter bundle */ _CCCL_DEVICE _CCCL_FORCEINLINE AgentSpmv(TempStorage& temp_storage, SpmvParams& spmv_params) : temp_storage(temp_storage.Alias()) , spmv_params(spmv_params) , wd_values(spmv_params.d_values) , wd_row_end_offsets(spmv_params.d_row_end_offsets) , wd_column_indices(spmv_params.d_column_indices) , wd_vector_x(spmv_params.d_vector_x) , wd_vector_y(spmv_params.d_vector_y) {} /** * @brief Consume a merge tile, specialized for direct-load of nonzeros * * @param is_direct_load * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch */ _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; // Gather the row end-offsets for the merge tile into shared memory for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { const OffsetT offset = (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } CTA_SYNC(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); CoordinateT thread_start_coord; MergePathSearch( OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal s_tile_row_end_offsets, // List A tile_nonzero_indices, // List B tile_num_rows, tile_num_nonzeros, thread_start_coord); CTA_SYNC(); // Perf-sync // Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; KeyValuePairT scan_segment[ITEMS_PER_THREAD]; ValueT running_total = 0.0; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); OffsetT column_idx = wd_column_indices[nonzero_idx]; ValueT value = wd_values[nonzero_idx]; ValueT vector_value = wd_vector_x[column_idx]; ValueT nonzero = value * vector_value; OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) { // Move down (accumulate) running_total += nonzero; scan_segment[ITEM].value = running_total; scan_segment[ITEM].key = tile_num_rows; ++thread_current_coord.y; } else { // Move right (reset) scan_segment[ITEM].value = running_total; scan_segment[ITEM].key = thread_current_coord.x; running_total = 0.0; ++thread_current_coord.x; } } CTA_SYNC(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; ReduceBySegmentOpT scan_op; KeyValuePairT scan_item; scan_item.value = running_total; scan_item.key = thread_current_coord.x; BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, 
tile_carry); if (tile_num_rows > 0) { if (threadIdx.x == 0) { scan_item.key = -1; } // Direct scatter #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (scan_segment[ITEM].key < tile_num_rows) { if (scan_item.key == scan_segment[ITEM].key) { scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; } if (HAS_ALPHA) { scan_segment[ITEM].value *= spmv_params.alpha; } if (HAS_BETA) { // Update the output vector element ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; scan_segment[ITEM].value += addend; } // Set the output vector element spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; } } } // Return the tile's running carry-out return tile_carry; } /** * @brief Consume a merge tile, specialized for indirect load of nonzeros * * @param is_direct_load * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch */ _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairT ConsumeTile(int tile_idx, CoordinateT tile_start_coord, CoordinateT tile_end_coord, Int2Type is_direct_load) { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; #if (CUB_PTX_ARCH >= 520) OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; // Gather the nonzeros for the merge tile into shared memory # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; ValueT* s = s_tile_nonzeros + nonzero_idx; if (nonzero_idx < tile_num_nonzeros) { OffsetT column_idx = *ci; ValueT value = *a; ValueT vector_value = wd_vector_x[column_idx]; ValueT nonzero = value * vector_value; *s = nonzero; } } #else OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; // Gather the nonzeros for the merge tile into shared memory if (tile_num_nonzeros > 0) { # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; ValueT vector_value = wd_vector_x[column_idx]; ValueT nonzero = value * vector_value; s_tile_nonzeros[nonzero_idx] = nonzero; } } #endif // Gather the row end-offsets for the merge tile into shared memory #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS) { const OffsetT offset = (cub::min)(static_cast(tile_start_coord.x + item), static_cast(spmv_params.num_rows - 1)); s_tile_row_end_offsets[item] = wd_row_end_offsets[offset]; } CTA_SYNC(); // Search for the thread's starting coordinate within the merge tile CountingInputIterator tile_nonzero_indices(tile_start_coord.y); CoordinateT thread_start_coord; MergePathSearch( OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal s_tile_row_end_offsets, // List A tile_nonzero_indices, // List B tile_num_rows, tile_num_nonzeros, thread_start_coord); CTA_SYNC(); // Perf-sync // 
Compute the thread's merge path segment CoordinateT thread_current_coord = thread_start_coord; KeyValuePairT scan_segment[ITEMS_PER_THREAD]; ValueT running_total = 0.0; OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) { // Move down (accumulate) scan_segment[ITEM].value = nonzero; running_total += nonzero; ++thread_current_coord.y; nonzero = s_tile_nonzeros[thread_current_coord.y]; } else { // Move right (reset) scan_segment[ITEM].value = 0.0; running_total = 0.0; ++thread_current_coord.x; row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; } scan_segment[ITEM].key = thread_current_coord.x; } CTA_SYNC(); // Block-wide reduce-value-by-segment KeyValuePairT tile_carry; ReduceBySegmentOpT scan_op; KeyValuePairT scan_item; scan_item.value = running_total; scan_item.key = thread_current_coord.x; BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); if (threadIdx.x == 0) { scan_item.key = thread_start_coord.x; scan_item.value = 0.0; } if (tile_num_rows > 0) { CTA_SYNC(); // Scan downsweep and scatter ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; if (scan_item.key != scan_segment[0].key) { s_partials[scan_item.key] = scan_item.value; } else { scan_segment[0].value += scan_item.value; } #pragma unroll for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key) { s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; } else { scan_segment[ITEM].value += scan_segment[ITEM - 1].value; } } CTA_SYNC(); #pragma unroll 1 for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) { spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; } } // Return the tile's running carry-out return tile_carry; } /** * @brief Consume input tile * * @param[in] d_tile_coordinates * Pointer to the temporary array of tile starting coordinates * * @param[out] d_tile_carry_pairs * Pointer to the temporary array carry-out dot product row-ids, one per block * * @param[in] num_merge_tiles * Number of merge tiles */ _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(CoordinateT* d_tile_coordinates, KeyValuePairT* d_tile_carry_pairs, int num_merge_tiles) { int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index if (tile_idx >= num_merge_tiles) { return; } // Read our starting coordinates if (threadIdx.x < 2) { if (d_tile_coordinates == NULL) { // Search our starting coordinates OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; CoordinateT tile_coord; CountingInputIterator nonzero_indices(0); // Search the merge path MergePathSearch( diagonal, RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), nonzero_indices, spmv_params.num_rows, spmv_params.num_nonzeros, tile_coord); temp_storage.tile_coords[threadIdx.x] = tile_coord; } else { temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; } } CTA_SYNC(); CoordinateT tile_start_coord = temp_storage.tile_coords[0]; CoordinateT tile_end_coord = temp_storage.tile_coords[1]; // Consume multi-segment tile KeyValuePairT tile_carry = ConsumeTile(tile_idx, tile_start_coord, tile_end_coord, Int2Type()); // Output the tile's carry-out if (threadIdx.x == 0) { if (HAS_ALPHA) { tile_carry.value *= spmv_params.alpha; } tile_carry.key += 
tile_start_coord.x; if (tile_carry.key >= spmv_params.num_rows) { // FIXME: This works around an invalid memory access in the // fixup kernel. The underlying issue needs to be debugged and // properly fixed, but this hack prevents writes to // out-of-bounds addresses. It doesn't appear to have an effect // on the validity of the results, since this only affects the // carry-over from last tile in the input. tile_carry.key = spmv_params.num_rows - 1; tile_carry.value = ValueT{}; }; d_tile_carry_pairs[tile_idx] = tile_carry; } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_sub_warp_merge_sort.cuh000066400000000000000000000261231463375617100230120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template struct AgentSubWarpMergeSortPolicy { static constexpr int WARP_THREADS = WARP_THREADS_ARG; static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG; static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD; static constexpr cub::WarpLoadAlgorithm LOAD_ALGORITHM = LOAD_ALGORITHM_ARG; static constexpr cub::CacheLoadModifier LOAD_MODIFIER = LOAD_MODIFIER_ARG; static constexpr cub::WarpStoreAlgorithm STORE_ALGORITHM = STORE_ALGORITHM_ARG; }; template struct AgentSmallAndMediumSegmentedSortPolicy { static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG; using SmallPolicyT = SmallPolicy; using MediumPolicyT = MediumPolicy; static constexpr int SEGMENTS_PER_MEDIUM_BLOCK = BLOCK_THREADS / MediumPolicyT::WARP_THREADS; static constexpr int SEGMENTS_PER_SMALL_BLOCK = BLOCK_THREADS / SmallPolicyT::WARP_THREADS; }; /** * @brief AgentSubWarpSort implements a sub-warp merge sort. * * This agent can work with any power of two number of threads, not exceeding * 32. The number of threads is defined in the `PolicyT::WARP_THREADS`. Virtual * warp of `PolicyT::WARP_THREADS` will efficiently load data using * `PolicyT::LOAD_ALGORITHM`, sort it using `WarpMergeSort`, and store it back * using `PolicyT::STORE_ALGORITHM`. * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam PolicyT * Chained tuning policy * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets */ template class AgentSubWarpSort { using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; struct BinaryOpT { template _CCCL_DEVICE bool operator()(T lhs, T rhs) const noexcept { _CCCL_IF_CONSTEXPR (IS_DESCENDING) { return lhs > rhs; } else { return lhs < rhs; } _LIBCUDACXX_UNREACHABLE(); } #if defined(__CUDA_FP16_TYPES_EXIST__) _CCCL_DEVICE bool operator()(__half lhs, __half rhs) const noexcept { // Need to explicitly cast to float for SM <= 52. _CCCL_IF_CONSTEXPR (IS_DESCENDING) { NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hgt(lhs, rhs);), (return __half2float(lhs) > __half2float(rhs);)); } else { NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hlt(lhs, rhs);), (return __half2float(lhs) < __half2float(rhs);)); } _LIBCUDACXX_UNREACHABLE(); } #endif // __CUDA_FP16_TYPES_EXIST__ }; #if defined(__CUDA_FP16_TYPES_EXIST__) _CCCL_DEVICE static bool equal(__half lhs, __half rhs) { // Need to explicitly cast to float for SM <= 52. NV_IF_TARGET(NV_PROVIDES_SM_53, (return __heq(lhs, rhs);), (return __half2float(lhs) == __half2float(rhs);)); } #endif // __CUDA_FP16_TYPES_EXIST__ template _CCCL_DEVICE static bool equal(T lhs, T rhs) { return lhs == rhs; } _CCCL_DEVICE static bool get_oob_default(Int2Type /* is bool */) { // Traits::MAX_KEY for `bool` is 0xFF which is different from `true` and makes // comparison with oob unreliable. 
return !IS_DESCENDING; } _CCCL_DEVICE static KeyT get_oob_default(Int2Type /* is bool */) { // For FP64 the difference is: // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b // Segmented sort doesn't support custom types at the moment. bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(detail::identity_decomposer_t{}) : traits::max_raw_binary_key(detail::identity_decomposer_t{}); return reinterpret_cast(default_key_bits); } public: static constexpr bool KEYS_ONLY = std::is_same::value; using WarpMergeSortT = WarpMergeSort; using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator::type; using WarpLoadKeysT = cub::WarpLoad; using WarpLoadItemsT = cub::WarpLoad; using WarpStoreKeysT = cub::WarpStore; using WarpStoreItemsT = cub::WarpStore; union _TempStorage { typename WarpLoadKeysT::TempStorage load_keys; typename WarpLoadItemsT::TempStorage load_items; typename WarpMergeSortT::TempStorage sort; typename WarpStoreKeysT::TempStorage store_keys; typename WarpStoreItemsT::TempStorage store_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; _TempStorage& storage; _CCCL_DEVICE _CCCL_FORCEINLINE explicit AgentSubWarpSort(TempStorage& temp_storage) : storage(temp_storage.Alias()) {} _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessSegment( int segment_size, KeysLoadItT keys_input, KeyT* keys_output, ItemsLoadItT values_input, ValueT* values_output) { WarpMergeSortT warp_merge_sort(storage.sort); if (segment_size < 3) { ShortCircuit( warp_merge_sort.get_linear_tid(), segment_size, keys_input, keys_output, values_input, values_output, BinaryOpT{}); } else { KeyT keys[PolicyT::ITEMS_PER_THREAD]; ValueT values[PolicyT::ITEMS_PER_THREAD]; KeyT oob_default = AgentSubWarpSort::get_oob_default(Int2Type::value>{}); WarpLoadKeysT(storage.load_keys).Load(keys_input, keys, segment_size, oob_default); WARP_SYNC(warp_merge_sort.get_member_mask()); if (!KEYS_ONLY) { WarpLoadItemsT(storage.load_items).Load(values_input, values, segment_size); WARP_SYNC(warp_merge_sort.get_member_mask()); } warp_merge_sort.Sort(keys, values, BinaryOpT{}, segment_size, oob_default); WARP_SYNC(warp_merge_sort.get_member_mask()); WarpStoreKeysT(storage.store_keys).Store(keys_output, keys, segment_size); if (!KEYS_ONLY) { WARP_SYNC(warp_merge_sort.get_member_mask()); WarpStoreItemsT(storage.store_items).Store(values_output, values, segment_size); } } } private: /** * This method implements a shortcut for sorting less than three items. * Only the first thread of a virtual warp is used for soring. 
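   * When a segment holds exactly two items, the pair is swapped only if the second
   * key strictly precedes the first under the requested order; equal keys keep
   * their original positions.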
*/ template _CCCL_DEVICE _CCCL_FORCEINLINE void ShortCircuit( unsigned int linear_tid, OffsetT segment_size, KeysLoadItT keys_input, KeyT* keys_output, ItemsLoadItT values_input, ValueT* values_output, CompareOpT binary_op) { if (segment_size == 1) { if (linear_tid == 0) { if (keys_input.ptr != keys_output) { keys_output[0] = keys_input[0]; } if (!KEYS_ONLY) { if (values_input.ptr != values_output) { values_output[0] = values_input[0]; } } } } else if (segment_size == 2) { if (linear_tid == 0) { KeyT lhs = keys_input[0]; KeyT rhs = keys_input[1]; if (equal(lhs, rhs) || binary_op(lhs, rhs)) { keys_output[0] = lhs; keys_output[1] = rhs; if (!KEYS_ONLY) { if (values_output != values_input.ptr) { values_output[0] = values_input[0]; values_output[1] = values_input[1]; } } } else { keys_output[0] = rhs; keys_output[1] = lhs; if (!KEYS_ONLY) { // values_output might be an alias for values_input, so // we have to use registers here const ValueT lhs_val = values_input[0]; const ValueT rhs_val = values_input[1]; values_output[0] = rhs_val; values_output[1] = lhs_val; } } } } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_three_way_partition.cuh000066400000000000000000000470361463375617100230300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ namespace detail { namespace three_way_partition { template struct pair_pack_t { OffsetT x, y; _CCCL_DEVICE pair_pack_t operator+(const pair_pack_t& other) const { return {x + other.x, y + other.y}; } }; template struct accumulator_pack_base_t { using pack_t = pair_pack_t; _CCCL_DEVICE static pack_t pack(OffsetT f, OffsetT s) { return {f, s}; } _CCCL_DEVICE static OffsetT first(pack_t packed) { return packed.x; } _CCCL_DEVICE static OffsetT second(pack_t packed) { return packed.y; } }; template struct accumulator_pack_base_t::type> { using pack_t = std::uint64_t; _CCCL_DEVICE static pack_t pack(OffsetT f, OffsetT s) { return (static_cast(f) << 32) | static_cast(s); } _CCCL_DEVICE static OffsetT first(pack_t packed) { return static_cast(packed >> 32); } _CCCL_DEVICE static OffsetT second(pack_t packed) { return static_cast(packed & 0xFFFFFFFF); } }; template struct accumulator_pack_t : accumulator_pack_base_t { using base = accumulator_pack_base_t; using typename base::pack_t; _CCCL_DEVICE static void subtract(pack_t& packed, OffsetT val) { packed = base::pack(base::first(packed) - val, base::second(packed) - val); } _CCCL_DEVICE static OffsetT sum(pack_t& packed) { return base::first(packed) + base::second(packed); } _CCCL_DEVICE static pack_t zero() { return {}; } }; } // namespace three_way_partition } // namespace detail template > struct AgentThreeWayPartitionPolicy { static constexpr int BLOCK_THREADS = _BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD; static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; struct detail { using delay_constructor_t = DelayConstructorT; }; }; /** * \brief Implements a device-wide three-way partitioning * * Splits input data into three parts based on the selection functors. If the * first functor selects an item, the algorithm places it in the first part. * Otherwise, if the second functor selects an item, the algorithm places it in * the second part. If both functors don't select an item, the algorithm places * it into the unselected part. 
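 *
 * For instance (illustrative only), partitioning the sequence {0, 1, 2, 3, 4, 5, 6}
 * with a first functor selecting items less than 3 and a second functor selecting
 * items less than 5 places {0, 1, 2} in the first part, {3, 4} in the second part,
 * and {5, 6} in the unselected part. A minimal selection functor for such a sketch
 * (hypothetical, not part of the library) could look like:
 *
 * @code
 * struct LessThan
 * {
 *   int bound;
 *   __host__ __device__ bool operator()(int val) const
 *   {
 *     return val < bound; // select items strictly below the bound
 *   }
 * };
 * @endcode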
*/ template struct AgentThreeWayPartition { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input value type using InputT = cub::detail::value_t; using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; // Tile status descriptor interface type using ScanTileStateT = cub::ScanTileState; // Constants static constexpr int BLOCK_THREADS = PolicyT::BLOCK_THREADS; static constexpr int ITEMS_PER_THREAD = PolicyT::ITEMS_PER_THREAD; static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD; using WrappedInputIteratorT = cub::detail::conditional_t::value, cub::CacheModifiedInputIterator, InputIteratorT>; // Parameterized BlockLoad type for input data using BlockLoadT = cub::BlockLoad; // Parameterized BlockScan type using BlockScanT = cub::BlockScan; // Callback type for obtaining tile prefix during block scan using DelayConstructorT = typename PolicyT::detail::delay_constructor_t; using TilePrefixCallbackOpT = cub::TilePrefixCallbackOp; // Item exchange type using ItemExchangeT = InputT[TILE_ITEMS]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { // Smem needed for tile scanning typename BlockScanT::TempStorage scan; // Smem needed for cooperative prefix callback typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for loading items typename BlockLoadT::TempStorage load_items; // Smem needed for compacting items (allows non POD items in this union) cub::Uninitialized raw_exchange; }; // Alias wrapper allowing storage to be unioned struct TempStorage : cub::Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; ///< Reference to temp_storage WrappedInputIteratorT d_in; ///< Input items FirstOutputIteratorT d_first_part_out; SecondOutputIteratorT d_second_part_out; UnselectedOutputIteratorT d_unselected_out; SelectFirstPartOp select_first_part_op; SelectSecondPartOp select_second_part_op; OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Constructor _CCCL_DEVICE _CCCL_FORCEINLINE AgentThreeWayPartition( TempStorage& temp_storage, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items) : temp_storage(temp_storage.Alias()) , d_in(d_in) , d_first_part_out(d_first_part_out) , d_second_part_out(d_second_part_out) , d_unselected_out(d_unselected_out) , select_first_part_op(select_first_part_op) , select_second_part_op(select_second_part_op) , num_items(num_items) {} //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- template _CCCL_DEVICE _CCCL_FORCEINLINE void Initialize( OffsetT num_tile_items, InputT (&items)[ITEMS_PER_THREAD], AccumPackT (&items_selection_flags)[ITEMS_PER_THREAD]) { for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Out-of-bounds items are selection_flags 
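      // (they are packed as selected by both predicates so that the padded positions
      // can be subtracted from both selection counts after the tile has been scanned)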
items_selection_flags[ITEM] = AccumPackHelperT::pack(1, 1); if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) { OffsetT first_item_selected = select_first_part_op(items[ITEM]); items_selection_flags[ITEM] = AccumPackHelperT::pack(first_item_selected, first_item_selected ? 0 : select_second_part_op(items[ITEM])); } } } template _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( InputT (&items)[ITEMS_PER_THREAD], AccumPackT (&items_selection_flags)[ITEMS_PER_THREAD], AccumPackT (&items_selection_indices)[ITEMS_PER_THREAD], int num_tile_items, AccumPackT num_tile_selected, AccumPackT num_tile_selected_prefix, OffsetT num_rejected_prefix) { CTA_SYNC(); const OffsetT num_first_selections_prefix = AccumPackHelperT::first(num_tile_selected_prefix); const OffsetT num_second_selections_prefix = AccumPackHelperT::second(num_tile_selected_prefix); const int first_item_end = AccumPackHelperT::first(num_tile_selected); const int second_item_end = first_item_end + AccumPackHelperT::second(num_tile_selected); // Scatter items to shared memory (rejections first) for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; const OffsetT first_items_selection_indices = AccumPackHelperT::first(items_selection_indices[ITEM]); const OffsetT second_items_selection_indices = AccumPackHelperT::second(items_selection_indices[ITEM]); if (!IS_LAST_TILE || (item_idx < num_tile_items)) { int local_scatter_offset = 0; if (AccumPackHelperT::first(items_selection_flags[ITEM])) { local_scatter_offset = first_items_selection_indices - num_first_selections_prefix; } else if (AccumPackHelperT::second(items_selection_flags[ITEM])) { local_scatter_offset = first_item_end + second_items_selection_indices - num_second_selections_prefix; } else { // Medium item int local_selection_idx = (first_items_selection_indices - num_first_selections_prefix) + (second_items_selection_indices - num_second_selections_prefix); local_scatter_offset = second_item_end + item_idx - local_selection_idx; } temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; } } CTA_SYNC(); // Gather items from shared memory and scatter to global for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; if (!IS_LAST_TILE || (item_idx < num_tile_items)) { InputT item = temp_storage.raw_exchange.Alias()[item_idx]; if (item_idx < first_item_end) { d_first_part_out[num_first_selections_prefix + item_idx] = item; } else if (item_idx < second_item_end) { d_second_part_out[num_second_selections_prefix + item_idx - first_item_end] = item; } else { int rejection_idx = item_idx - second_item_end; d_unselected_out[num_rejected_prefix + rejection_idx] = item; } } } } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * Process first tile of input (dynamic chained scan). 
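   * The first tile needs no look-back, so its exclusive prefix is simply zero.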
* Returns the running count of selections (including this tile) * * @param num_tile_items Number of input items comprising this tile * @param tile_offset Tile offset * @param first_tile_state Global tile state descriptor * @param second_tile_state Global tile state descriptor */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeFirstTile(int num_tile_items, OffsetT tile_offset, ScanTileStateT& tile_state, AccumPackT& num_items_selected) { InputT items[ITEMS_PER_THREAD]; AccumPackT items_selection_flags[ITEMS_PER_THREAD]; AccumPackT items_selection_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); } else { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); } // Initialize selection_flags Initialize(num_tile_items, items, items_selection_flags); CTA_SYNC(); // Exclusive scan of selection_flags BlockScanT(temp_storage.scan_storage.scan) .ExclusiveSum(items_selection_flags, items_selection_indices, num_items_selected); if (threadIdx.x == 0) { // Update tile status if this is not the last tile if (!IS_LAST_TILE) { tile_state.SetInclusive(0, num_items_selected); } } // Discount any out-of-bounds selections if (IS_LAST_TILE) { AccumPackHelperT::subtract(num_items_selected, TILE_ITEMS - num_tile_items); } // Scatter flagged items Scatter( items, items_selection_flags, items_selection_indices, num_tile_items, num_items_selected, // all the prefixes equal to 0 because it's the first tile AccumPackHelperT::zero(), 0); } /** * Process subsequent tile of input (dynamic chained scan). * Returns the running count of selections (including this tile) * * @param num_tile_items Number of input items comprising this tile * @param tile_idx Tile index * @param tile_offset Tile offset * @param first_tile_state Global tile state descriptor * @param second_tile_state Global tile state descriptor */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeSubsequentTile( int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state, AccumPackT& num_items_selected) { InputT items[ITEMS_PER_THREAD]; AccumPackT items_selected_flags[ITEMS_PER_THREAD]; AccumPackT items_selected_indices[ITEMS_PER_THREAD]; // Load items if (IS_LAST_TILE) { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); } else { BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); } // Initialize selection_flags Initialize(num_tile_items, items, items_selected_flags); CTA_SYNC(); // Exclusive scan of values and selection_flags TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx); BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(items_selected_flags, items_selected_indices, prefix_op); num_items_selected = prefix_op.GetInclusivePrefix(); AccumPackT num_items_in_tile_selected = prefix_op.GetBlockAggregate(); AccumPackT num_items_selected_prefix = prefix_op.GetExclusivePrefix(); CTA_SYNC(); OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - AccumPackHelperT::sum(num_items_selected_prefix); // Discount any out-of-bounds selections. There are exactly // TILE_ITEMS - num_tile_items elements like that because we // marked them as selected in Initialize method. 
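    // (the same discount is applied both to the running total and to this tile's aggregate)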
if (IS_LAST_TILE) { const int num_discount = TILE_ITEMS - num_tile_items; AccumPackHelperT::subtract(num_items_selected, num_discount); AccumPackHelperT::subtract(num_items_in_tile_selected, num_discount); } // Scatter flagged items Scatter( items, items_selected_flags, items_selected_indices, num_tile_items, num_items_in_tile_selected, num_items_selected_prefix, num_rejected_prefix); } /** * Process a tile of input */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state, AccumPackT& accum) { if (tile_idx == 0) { ConsumeFirstTile(num_tile_items, tile_offset, tile_state, accum); } else { ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state, accum); } } /** * Scan tiles of items as part of a dynamic chained scan * * @tparam NumSelectedIteratorT * Output iterator type for recording number of items selection_flags * * @param num_tiles * Total number of input tiles * * @param first_tile_state * Global tile state descriptor * * @param second_tile_state * Global tile state descriptor * * @param d_num_selected_out * Output total number selection_flags */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(int num_tiles, ScanTileStateT& tile_state, NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block // Current tile index const int tile_idx = static_cast((blockIdx.x * gridDim.y) + blockIdx.y); // Global offset for the current tile const OffsetT tile_offset = tile_idx * TILE_ITEMS; AccumPackT accum; if (tile_idx < num_tiles - 1) { // Not the last tile (full) ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state, accum); } else { // The last tile (possibly partially-full) const OffsetT num_remaining = num_items - tile_offset; ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, accum); if (threadIdx.x == 0) { // Output the total number of items selection_flags d_num_selected_out[0] = AccumPackHelperT::first(accum); d_num_selected_out[1] = AccumPackHelperT::second(accum); } } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/agent_unique_by_key.cuh000066400000000000000000000472231463375617100216160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide * unique-by-key. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Tuning policy types ******************************************************************************/ /** * Parameterizable tuning policy type for AgentUniqueByKey * * @tparam DelayConstructorT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template > struct AgentUniqueByKeyPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ITEMS_PER_THREAD = _ITEMS_PER_THREAD, }; static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static constexpr cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; struct detail { using delay_constructor_t = DelayConstructorT; }; }; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** * @brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating * in device-wide unique-by-key * * @tparam AgentUniqueByKeyPolicyT * Parameterized AgentUniqueByKeyPolicy tuning policy type * * @tparam KeyInputIteratorT * Random-access input iterator type for keys * * @tparam ValueInputIteratorT * Random-access input iterator type for values * * @tparam KeyOutputIteratorT * Random-access output iterator type for keys * * @tparam ValueOutputIteratorT * Random-access output iterator type for values * * @tparam EqualityOpT * Equality operator type * * @tparam OffsetT * Signed integer type for global offsets */ template struct AgentUniqueByKey { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- // The input key and value type using KeyT = typename std::iterator_traits::value_type; using ValueT = typename std::iterator_traits::value_type; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; // Constants enum { BLOCK_THREADS = AgentUniqueByKeyPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = AgentUniqueByKeyPolicyT::ITEMS_PER_THREAD, ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD, }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys using WrappedKeyInputIteratorT = typename std::conditional< std::is_pointer::value, 
CacheModifiedInputIterator, // Wrap the native input pointer // with // CacheModifiedValuesInputIterator KeyInputIteratorT>::type; // Directly use the supplied input iterator type // Cache-modified Input iterator wrapper type (for applying cache modifier) for values using WrappedValueInputIteratorT = typename std::conditional< std::is_pointer::value, CacheModifiedInputIterator, // Wrap the native input // pointer with // CacheModifiedValuesInputIterator ValueInputIteratorT>::type; // Directly use the supplied input iterator type // Parameterized BlockLoad type for input data using BlockLoadKeys = BlockLoad; // Parameterized BlockLoad type for flags using BlockLoadValues = BlockLoad; // Parameterized BlockDiscontinuity type for items using BlockDiscontinuityKeys = cub::BlockDiscontinuity; // Parameterized BlockScan type using BlockScanT = cub::BlockScan; // Parameterized BlockDiscontinuity type for items using DelayConstructorT = typename AgentUniqueByKeyPolicyT::detail::delay_constructor_t; using TilePrefixCallback = cub::TilePrefixCallbackOp; // Key exchange type using KeyExchangeT = KeyT[ITEMS_PER_TILE]; // Value exchange type using ValueExchangeT = ValueT[ITEMS_PER_TILE]; // Shared memory type for this thread block union _TempStorage { struct ScanStorage { typename BlockScanT::TempStorage scan; typename TilePrefixCallback::TempStorage prefix; typename BlockDiscontinuityKeys::TempStorage discontinuity; } scan_storage; // Smem needed for loading keys typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading values typename BlockLoadValues::TempStorage load_values; // Smem needed for compacting items (allows non POD items in this union) Uninitialized shared_keys; Uninitialized shared_values; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- _TempStorage& temp_storage; WrappedKeyInputIteratorT d_keys_in; WrappedValueInputIteratorT d_values_in; KeyOutputIteratorT d_keys_out; ValueOutputIteratorT d_values_out; cub::InequalityWrapper inequality_op; OffsetT num_items; //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- // Constructor _CCCL_DEVICE _CCCL_FORCEINLINE AgentUniqueByKey( TempStorage& temp_storage_, WrappedKeyInputIteratorT d_keys_in_, WrappedValueInputIteratorT d_values_in_, KeyOutputIteratorT d_keys_out_, ValueOutputIteratorT d_values_out_, EqualityOpT equality_op_, OffsetT num_items_) : temp_storage(temp_storage_.Alias()) , d_keys_in(d_keys_in_) , d_values_in(d_values_in_) , d_keys_out(d_keys_out_) , d_values_out(d_values_out_) , inequality_op(equality_op_) , num_items(num_items_) {} //--------------------------------------------------------------------- // Utility functions //--------------------------------------------------------------------- struct KeyTagT {}; struct ValueTagT {}; _CCCL_DEVICE _CCCL_FORCEINLINE KeyExchangeT& GetShared(KeyTagT) { return temp_storage.shared_keys.Alias(); } _CCCL_DEVICE _CCCL_FORCEINLINE ValueExchangeT& GetShared(ValueTagT) { return temp_storage.shared_values.Alias(); } //--------------------------------------------------------------------- // Scatter utility methods //--------------------------------------------------------------------- template _CCCL_DEVICE _CCCL_FORCEINLINE void Scatter( Tag tag, OutputIt 
items_out, T (&items)[ITEMS_PER_THREAD], OffsetT (&selection_flags)[ITEMS_PER_THREAD], OffsetT (&selection_indices)[ITEMS_PER_THREAD], int /*num_tile_items*/, int num_tile_selections, OffsetT num_selections_prefix, OffsetT /*num_selections*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; if (selection_flags[ITEM]) { GetShared(tag)[local_scatter_offset] = items[ITEM]; } } CTA_SYNC(); // Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset // types #pragma unroll 1 for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) { items_out[num_selections_prefix + item] = GetShared(tag)[item]; } CTA_SYNC(); } //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** * @brief Process first tile of input (dynamic chained scan). * * @param num_tile_items * Number of input items comprising this tile * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor * * @return The running count of selections (including this tile) */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT ConsumeFirstTile(int num_tile_items, OffsetT tile_offset, ScanTileStateT& tile_state) { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_idx[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadKeys(temp_storage.load_keys) .Load(d_keys_in + tile_offset, keys, num_tile_items, *(d_keys_in + tile_offset)); } else { BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } CTA_SYNC(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadValues(temp_storage.load_values) .Load(d_values_in + tile_offset, values, num_tile_items, *(d_values_in + tile_offset)); } else { BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } CTA_SYNC(); BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity).FlagHeads(selection_flags, keys, inequality_op); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set selection_flags for out-of-bounds items if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) { selection_flags[ITEM] = 1; } } CTA_SYNC(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; OffsetT num_selections_prefix = 0; BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_idx, num_tile_selections); if (threadIdx.x == 0) { // Update tile status if this is not the last tile if (!IS_LAST_TILE) { tile_state.SetInclusive(0, num_tile_selections); } } // Do not count any out-of-bounds selections if (IS_LAST_TILE) { int num_discount = ITEMS_PER_TILE - num_tile_items; num_tile_selections -= num_discount; } num_selections = num_tile_selections; CTA_SYNC(); Scatter(KeyTagT(), d_keys_out, keys, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); CTA_SYNC(); Scatter(ValueTagT(), d_values_out, values, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); return num_selections; } /** * @brief Process subsequent tile of 
input (dynamic chained scan). * * @param num_tile_items * Number of input items comprising this tile * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor * * @return Returns the running count of selections (including this tile) */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT ConsumeSubsequentTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state) { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; OffsetT selection_idx[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadKeys(temp_storage.load_keys) .Load(d_keys_in + tile_offset, keys, num_tile_items, *(d_keys_in + tile_offset)); } else { BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); } CTA_SYNC(); ValueT values[ITEMS_PER_THREAD]; if (IS_LAST_TILE) { // Fill last elements with the first element // because collectives are not suffix guarded BlockLoadValues(temp_storage.load_values) .Load(d_values_in + tile_offset, values, num_tile_items, *(d_values_in + tile_offset)); } else { BlockLoadValues(temp_storage.load_values).Load(d_values_in + tile_offset, values); } CTA_SYNC(); KeyT tile_predecessor = d_keys_in[tile_offset - 1]; BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) .FlagHeads(selection_flags, keys, inequality_op, tile_predecessor); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { // Set selection_flags for out-of-bounds items if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) { selection_flags[ITEM] = 1; } } CTA_SYNC(); OffsetT num_tile_selections = 0; OffsetT num_selections = 0; OffsetT num_selections_prefix = 0; TilePrefixCallback prefix_cb(tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx); BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_idx, prefix_cb); num_selections = prefix_cb.GetInclusivePrefix(); num_tile_selections = prefix_cb.GetBlockAggregate(); num_selections_prefix = prefix_cb.GetExclusivePrefix(); if (IS_LAST_TILE) { int num_discount = ITEMS_PER_TILE - num_tile_items; num_tile_selections -= num_discount; num_selections -= num_discount; } CTA_SYNC(); Scatter(KeyTagT(), d_keys_out, keys, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); CTA_SYNC(); Scatter(ValueTagT(), d_values_out, values, selection_flags, selection_idx, num_tile_items, num_tile_selections, num_selections_prefix, num_selections); return num_selections; } /** * @brief Process a tile of input * * @param num_tile_items * Number of input items comprising this tile * * @param tile_idx * Tile index * * @param tile_offset * Tile offset * * @param tile_state * Global tile state descriptor */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT& tile_state) { OffsetT num_selections; if (tile_idx == 0) { num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); } else { num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); } return num_selections; } /** * @brief Scan tiles of items as part of a dynamic chained scan * * @param num_tiles * Total number of input tiles * * @param tile_state * Global tile state descriptor * * @param d_num_selected_out * Output total number selection_flags * * @tparam NumSelectedIteratorT 
* Output iterator type for recording number of items selection_flags * */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(int num_tiles, ScanTileStateT& tile_state, NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index // Global offset for the current tile OffsetT tile_offset = static_cast(tile_idx) * static_cast(ITEMS_PER_TILE); if (tile_idx < num_tiles - 1) { ConsumeTile(ITEMS_PER_TILE, tile_idx, tile_offset, tile_state); } else { int num_remaining = static_cast(num_items - tile_offset); OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); if (threadIdx.x == 0) { *d_num_selected_out = num_selections; } } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/agent/single_pass_scan_operators.cuh000066400000000000000000001112441463375617100231740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Callback operator types for supplying BlockScan prefixes */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Prefix functor type for maintaining a running prefix while scanning a * region independent of other thread blocks ******************************************************************************/ /** * Stateful callback operator type for supplying BlockScan prefixes. 
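 * (typically used when a single thread block scans a long input tile by tile)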
* Maintains a running prefix that can be applied to consecutive * BlockScan operations. * * @tparam T * BlockScan value type * * @tparam ScanOpT * Wrapped scan operator type */ template struct BlockScanRunningPrefixOp { /// Wrapped scan operator ScanOpT op; /// Running block-wide prefix T running_total; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockScanRunningPrefixOp(ScanOpT op) : op(op) {} /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockScanRunningPrefixOp(T starting_prefix, ScanOpT op) : op(op) , running_total(starting_prefix) {} /** * Prefix callback operator. Returns the block-wide running_total in thread-0. * * @param block_aggregate * The aggregate sum of the BlockScan inputs */ _CCCL_DEVICE _CCCL_FORCEINLINE T operator()(const T& block_aggregate) { T retval = running_total; running_total = op(running_total, block_aggregate); return retval; } }; /****************************************************************************** * Generic tile status interface types for block-cooperative scans ******************************************************************************/ /** * Enumerations of tile status */ enum ScanTileStatus { SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) SCAN_TILE_INVALID = 99, // Not yet processed SCAN_TILE_PARTIAL, // Tile aggregate is available SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available }; namespace detail { template _CCCL_DEVICE _CCCL_FORCEINLINE void delay() { NV_IF_TARGET(NV_PROVIDES_SM_70, (if (Delay > 0) { if (gridDim.x < GridThreshold) { __threadfence_block(); } else { __nanosleep(Delay); } })); } template _CCCL_DEVICE _CCCL_FORCEINLINE void delay(int ns) { NV_IF_TARGET(NV_PROVIDES_SM_70, (if (ns > 0) { if (gridDim.x < GridThreshold) { __threadfence_block(); } else { __nanosleep(ns); } })); } template _CCCL_DEVICE _CCCL_FORCEINLINE void always_delay() { NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(Delay);)); } _CCCL_DEVICE _CCCL_FORCEINLINE void always_delay(int ns) { NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(ns);), ((void) ns;)); } template _CCCL_DEVICE _CCCL_FORCEINLINE void delay_or_prevent_hoisting() { NV_IF_TARGET(NV_PROVIDES_SM_70, (delay();), (__threadfence_block();)); } template _CCCL_DEVICE _CCCL_FORCEINLINE void delay_or_prevent_hoisting(int ns) { NV_IF_TARGET(NV_PROVIDES_SM_70, (delay(ns);), ((void) ns; __threadfence_block();)); } template _CCCL_DEVICE _CCCL_FORCEINLINE void always_delay_or_prevent_hoisting() { NV_IF_TARGET(NV_PROVIDES_SM_70, (always_delay(Delay);), (__threadfence_block();)); } _CCCL_DEVICE _CCCL_FORCEINLINE void always_delay_or_prevent_hoisting(int ns) { NV_IF_TARGET(NV_PROVIDES_SM_70, (always_delay(ns);), ((void) ns; __threadfence_block();)); } template struct no_delay_constructor_t { struct delay_t { _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { NV_IF_TARGET(NV_PROVIDES_SM_70, (), (__threadfence_block();)); } }; _CCCL_DEVICE _CCCL_FORCEINLINE no_delay_constructor_t(unsigned int /* seed */) { delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { return {}; } }; template struct reduce_by_key_delay_constructor_t { struct delay_t { _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { NV_DISPATCH_TARGET( NV_IS_EXACTLY_SM_80, (delay();), NV_PROVIDES_SM_70, (delay<0, GridThreshold>();), NV_IS_DEVICE, (__threadfence_block();)); } }; _CCCL_DEVICE _CCCL_FORCEINLINE reduce_by_key_delay_constructor_t(unsigned int /* seed */) { delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { return {}; } }; template struct fixed_delay_constructor_t { struct delay_t { _CCCL_DEVICE 
_CCCL_FORCEINLINE void operator()() { delay_or_prevent_hoisting(); } }; _CCCL_DEVICE _CCCL_FORCEINLINE fixed_delay_constructor_t(unsigned int /* seed */) { delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { return {}; } }; template struct exponential_backoff_constructor_t { struct delay_t { int delay; _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { always_delay_or_prevent_hoisting(delay); delay <<= 1; } }; _CCCL_DEVICE _CCCL_FORCEINLINE exponential_backoff_constructor_t(unsigned int /* seed */) { always_delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { return {InitialDelay}; } }; template struct exponential_backoff_jitter_constructor_t { struct delay_t { static constexpr unsigned int a = 16807; static constexpr unsigned int c = 0; static constexpr unsigned int m = 1u << 31; unsigned int max_delay; unsigned int& seed; _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max) { return (seed = (a * seed + c) % m) % (max + 1 - min) + min; } _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { always_delay_or_prevent_hoisting(next(0, max_delay)); max_delay <<= 1; } }; unsigned int seed; _CCCL_DEVICE _CCCL_FORCEINLINE exponential_backoff_jitter_constructor_t(unsigned int seed) : seed(seed) { always_delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { return {InitialDelay, seed}; } }; template struct exponential_backoff_jitter_window_constructor_t { struct delay_t { static constexpr unsigned int a = 16807; static constexpr unsigned int c = 0; static constexpr unsigned int m = 1u << 31; unsigned int max_delay; unsigned int& seed; _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max) { return (seed = (a * seed + c) % m) % (max + 1 - min) + min; } _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { unsigned int next_max_delay = max_delay << 1; always_delay_or_prevent_hoisting(next(max_delay, next_max_delay)); max_delay = next_max_delay; } }; unsigned int seed; _CCCL_DEVICE _CCCL_FORCEINLINE exponential_backoff_jitter_window_constructor_t(unsigned int seed) : seed(seed) { always_delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { return {InitialDelay, seed}; } }; template struct exponential_backon_jitter_window_constructor_t { struct delay_t { static constexpr unsigned int a = 16807; static constexpr unsigned int c = 0; static constexpr unsigned int m = 1u << 31; unsigned int max_delay; unsigned int& seed; _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max) { return (seed = (a * seed + c) % m) % (max + 1 - min) + min; } _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { int prev_delay = max_delay >> 1; always_delay_or_prevent_hoisting(next(prev_delay, max_delay)); max_delay = prev_delay; } }; unsigned int seed; unsigned int max_delay = InitialDelay; _CCCL_DEVICE _CCCL_FORCEINLINE exponential_backon_jitter_window_constructor_t(unsigned int seed) : seed(seed) { always_delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { max_delay >>= 1; return {max_delay, seed}; } }; template struct exponential_backon_jitter_constructor_t { struct delay_t { static constexpr unsigned int a = 16807; static constexpr unsigned int c = 0; static constexpr unsigned int m = 1u << 31; unsigned int max_delay; unsigned int& seed; _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max) { return (seed = (a * seed + c) % m) % (max + 1 - min) + min; } _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { always_delay_or_prevent_hoisting(next(0, 
max_delay)); max_delay >>= 1; } }; unsigned int seed; unsigned int max_delay = InitialDelay; _CCCL_DEVICE _CCCL_FORCEINLINE exponential_backon_jitter_constructor_t(unsigned int seed) : seed(seed) { always_delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { max_delay >>= 1; return {max_delay, seed}; } }; template struct exponential_backon_constructor_t { struct delay_t { unsigned int delay; _CCCL_DEVICE _CCCL_FORCEINLINE void operator()() { always_delay_or_prevent_hoisting(delay); delay >>= 1; } }; unsigned int max_delay = InitialDelay; _CCCL_DEVICE _CCCL_FORCEINLINE exponential_backon_constructor_t(unsigned int /* seed */) { always_delay(); } _CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()() { max_delay >>= 1; return {max_delay}; } }; using default_no_delay_constructor_t = no_delay_constructor_t<450>; using default_no_delay_t = default_no_delay_constructor_t::delay_t; template using default_delay_constructor_t = cub::detail::conditional_t::PRIMITIVE, fixed_delay_constructor_t<350, 450>, default_no_delay_constructor_t>; template using default_delay_t = typename default_delay_constructor_t::delay_t; template using default_reduce_by_key_delay_constructor_t = detail::conditional_t<(Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16), reduce_by_key_delay_constructor_t<350, 450>, default_delay_constructor_t>>; } // namespace detail /** * Tile status interface. */ template ::PRIMITIVE> struct ScanTileState; /** * Tile status interface specialized for scan status and value types * that can be combined into one machine word that can be * read/written coherently in a single access. */ template struct ScanTileState { // Status word type using StatusWord = cub::detail::conditional_t< sizeof(T) == 8, unsigned long long, cub::detail::conditional_t>>; // Unit word type using TxnWord = cub::detail:: conditional_t>; // Device word type struct TileDescriptor { StatusWord status; T value; }; // Constants enum { TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, }; // Device storage TxnWord* d_tile_descriptors; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_descriptors(NULL) {} /** * @brief Initializer * * @param[in] num_tiles * Number of tiles * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to \p temp_storage_bytes and no work is * done. 
* * @param[in] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t Init(int /*num_tiles*/, void* d_temp_storage, size_t /*temp_storage_bytes*/) { d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } /** * @brief Compute device memory needed for tile status * * @param[in] num_tiles * Number of tiles * * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) { // bytes needed for tile status descriptors temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); return cudaSuccess; } /** * Initialize (from device) */ _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles) { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; TxnWord val = TxnWord(); TileDescriptor* descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { // Not-yet-set descriptor->status = StatusWord(SCAN_TILE_INVALID); d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; } if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) { // Padding descriptor->status = StatusWord(SCAN_TILE_OOB); d_tile_descriptors[threadIdx.x] = val; } } /** * Update the specified tile's inclusive value and corresponding status */ _CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int tile_idx, T tile_inclusive) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_INCLUSIVE; tile_descriptor.value = tile_inclusive; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Update the specified tile's partial value and corresponding status */ _CCCL_DEVICE _CCCL_FORCEINLINE void SetPartial(int tile_idx, T tile_partial) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_PARTIAL; tile_descriptor.value = tile_partial; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Wait for the corresponding tile to become non-invalid */ template > _CCCL_DEVICE _CCCL_FORCEINLINE void WaitForValid(int tile_idx, StatusWord& status, T& value, DelayT delay_or_prevent_hoisting = {}) { TileDescriptor tile_descriptor; { TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)) { delay_or_prevent_hoisting(); TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); } status = tile_descriptor.status; value = tile_descriptor.value; } /** * Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or * (b) there is no memory fence between reading a non-invalid status and the call to LoadValid. */ _CCCL_DEVICE _CCCL_FORCEINLINE T LoadValid(int tile_idx) { TxnWord alias = d_tile_descriptors[TILE_STATUS_PADDING + tile_idx]; TileDescriptor tile_descriptor = reinterpret_cast(alias); return tile_descriptor.value; } }; /** * Tile status interface specialized for scan status and value types that * cannot be combined into one machine word. 
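 * Status words, partial values, and inclusive values live in three separate device
 * arrays; a release store publishes the value before the status word becomes
 * visible, and readers poll the status with a fence until it is no longer invalid.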
*/ template struct ScanTileState { // Status word type using StatusWord = unsigned int; // Constants enum { TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, }; // Device storage StatusWord* d_tile_status; T* d_tile_partial; T* d_tile_inclusive; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState() : d_tile_status(NULL) , d_tile_partial(NULL) , d_tile_inclusive(NULL) {} /** * @brief Initializer * * @param[in] num_tiles * Number of tiles * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to \p temp_storage_bytes and no work is * done. * * @param[in] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ /// Initializer _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes) { cudaError_t error = cudaSuccess; do { void* allocations[3] = {}; size_t allocation_sizes[3]; // bytes needed for tile status descriptors allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for partials allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // Compute allocation pointers into the single storage blob error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } // Alias the offsets d_tile_status = reinterpret_cast(allocations[0]); d_tile_partial = reinterpret_cast(allocations[1]); d_tile_inclusive = reinterpret_cast(allocations[2]); } while (0); return error; } /** * @brief Compute device memory needed for tile status * * @param[in] num_tiles * Number of tiles * * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) { // Specify storage allocation requirements size_t allocation_sizes[3]; // bytes needed for tile status descriptors allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for partials allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // Set the necessary size of the blob void* allocations[3] = {}; return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); } /** * Initialize (from device) */ _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles) { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; if (tile_idx < num_tiles) { // Not-yet-set d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); } if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) { // Padding d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); } } /** * Update the specified tile's inclusive value and corresponding status */ _CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int tile_idx, T tile_inclusive) { // Update tile inclusive value ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); } /** * Update the specified tile's partial value and corresponding status */ _CCCL_DEVICE _CCCL_FORCEINLINE void SetPartial(int tile_idx, T tile_partial) { // Update tile partial value 
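    // (the partial value must be written before the status word is released below)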
ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); } /** * Wait for the corresponding tile to become non-invalid */ template _CCCL_DEVICE _CCCL_FORCEINLINE void WaitForValid(int tile_idx, StatusWord& status, T& value, DelayT delay = {}) { do { delay(); status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx); __threadfence(); } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff)); if (status == StatusWord(SCAN_TILE_PARTIAL)) { value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); } else { value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); } } /** * Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or * (b) there is no memory fence between reading a non-invalid status and the call to LoadValid. */ _CCCL_DEVICE _CCCL_FORCEINLINE T LoadValid(int tile_idx) { return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx]; } }; /****************************************************************************** * ReduceByKey tile status interface types for block-cooperative scans ******************************************************************************/ /** * Tile status interface for reduction by key. * */ template ::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> struct ReduceByKeyScanTileState; /** * Tile status interface for reduction by key, specialized for scan status and value types that * cannot be combined into one machine word. */ template struct ReduceByKeyScanTileState : ScanTileState> { typedef ScanTileState> SuperClass; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceByKeyScanTileState() : SuperClass() {} }; /** * Tile status interface for reduction by key, specialized for scan status and value types that * can be combined into one machine word that can be read/written coherently in a single access. */ template struct ReduceByKeyScanTileState { using KeyValuePairT = KeyValuePair; // Constants enum { PAIR_SIZE = static_cast(sizeof(ValueT) + sizeof(KeyT)), TXN_WORD_SIZE = 1 << Log2::VALUE, STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, }; // Status word type using StatusWord = cub::detail::conditional_t< STATUS_WORD_SIZE == 8, unsigned long long, cub::detail::conditional_t>>; // Status word type using TxnWord = cub::detail::conditional_t>; // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) struct TileDescriptorBigStatus { KeyT key; ValueT value; StatusWord status; }; // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) struct TileDescriptorLittleStatus { ValueT value; StatusWord status; KeyT key; }; // Device word type using TileDescriptor = cub::detail::conditional_t; // Device storage TxnWord* d_tile_descriptors; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceByKeyScanTileState() : d_tile_descriptors(NULL) {} /** * @brief Initializer * * @param[in] num_tiles * Number of tiles * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required allocation size * is written to \p temp_storage_bytes and no work is done. 
* * @param[in] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t Init(int /*num_tiles*/, void* d_temp_storage, size_t /*temp_storage_bytes*/) { d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } /** * @brief Compute device memory needed for tile status * * @param[in] num_tiles * Number of tiles * * @param[out] temp_storage_bytes * Size in bytes of \t d_temp_storage allocation */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) { // bytes needed for tile status descriptors temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); return cudaSuccess; } /** * Initialize (from device) */ _CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles) { int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; TxnWord val = TxnWord(); TileDescriptor* descriptor = reinterpret_cast(&val); if (tile_idx < num_tiles) { // Not-yet-set descriptor->status = StatusWord(SCAN_TILE_INVALID); d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; } if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) { // Padding descriptor->status = StatusWord(SCAN_TILE_OOB); d_tile_descriptors[threadIdx.x] = val; } } /** * Update the specified tile's inclusive value and corresponding status */ _CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_INCLUSIVE; tile_descriptor.value = tile_inclusive.value; tile_descriptor.key = tile_inclusive.key; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Update the specified tile's partial value and corresponding status */ _CCCL_DEVICE _CCCL_FORCEINLINE void SetPartial(int tile_idx, KeyValuePairT tile_partial) { TileDescriptor tile_descriptor; tile_descriptor.status = SCAN_TILE_PARTIAL; tile_descriptor.value = tile_partial.value; tile_descriptor.key = tile_partial.key; TxnWord alias; *reinterpret_cast(&alias) = tile_descriptor; detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); } /** * Wait for the corresponding tile to become non-invalid */ template ::delay_t> _CCCL_DEVICE _CCCL_FORCEINLINE void WaitForValid(int tile_idx, StatusWord& status, KeyValuePairT& value, DelayT delay_or_prevent_hoisting = {}) { // TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + // tile_idx); TileDescriptor tile_descriptor = reinterpret_cast(alias); // // while (tile_descriptor.status == SCAN_TILE_INVALID) // { // __threadfence_block(); // prevent hoisting loads from loop // // alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); // tile_descriptor = reinterpret_cast(alias); // } // // status = tile_descriptor.status; // value.value = tile_descriptor.value; // value.key = tile_descriptor.key; TileDescriptor tile_descriptor; do { delay_or_prevent_hoisting(); TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); status = tile_descriptor.status; value.value = tile_descriptor.value; value.key = tile_descriptor.key; } }; /****************************************************************************** * Prefix call-back operator for coupling local block scan within a * block-cooperative scan 
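 * (single-pass "decoupled look-back": each tile first publishes its block
 * aggregate, then inspects the status of preceding tiles to assemble its
 * exclusive prefix)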
******************************************************************************/ /** * Stateful block-scan prefix functor. Provides the the running prefix for * the current tile by using the call-back warp to wait on on * aggregates/prefixes from predecessor tiles to become available. * * @tparam DelayConstructorT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template > struct TilePrefixCallbackOp { // Parameterized warp reduce typedef WarpReduce WarpReduceT; // Temporary storage type struct _TempStorage { typename WarpReduceT::TempStorage warp_reduce; T exclusive_prefix; T inclusive_prefix; T block_aggregate; }; // Alias wrapper allowing temporary storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Type of status word typedef typename ScanTileStateT::StatusWord StatusWord; // Fields _TempStorage& temp_storage; ///< Reference to a warp-reduction instance ScanTileStateT& tile_status; ///< Interface to tile status ScanOpT scan_op; ///< Binary scan operator int tile_idx; ///< The current tile index T exclusive_prefix; ///< Exclusive prefix for the tile T inclusive_prefix; ///< Inclusive prefix for the tile // Constructs prefix functor for a given tile index. // Precondition: thread blocks processing all of the predecessor tiles were scheduled. _CCCL_DEVICE _CCCL_FORCEINLINE TilePrefixCallbackOp(ScanTileStateT& tile_status, TempStorage& temp_storage, ScanOpT scan_op, int tile_idx) : temp_storage(temp_storage.Alias()) , tile_status(tile_status) , scan_op(scan_op) , tile_idx(tile_idx) {} // Computes the tile index and constructs prefix functor with it. // Precondition: thread block per tile assignment. _CCCL_DEVICE _CCCL_FORCEINLINE TilePrefixCallbackOp(ScanTileStateT& tile_status, TempStorage& temp_storage, ScanOpT scan_op) : TilePrefixCallbackOp(tile_status, temp_storage, scan_op, blockIdx.x) {} /** * @brief Block until all predecessors within the warp-wide window have non-invalid status * * @param predecessor_idx * Preceding tile index to inspect * * @param[out] predecessor_status * Preceding tile status * * @param[out] window_aggregate * Relevant partial reduction from this window of preceding tiles */ template > _CCCL_DEVICE _CCCL_FORCEINLINE void ProcessWindow(int predecessor_idx, StatusWord& predecessor_status, T& window_aggregate, DelayT delay = {}) { T value; tile_status.WaitForValid(predecessor_idx, predecessor_status, value, delay); // Perform a segmented reduction to get the prefix for the current window. // Use the swizzled scan operator because we are now scanning *down* towards thread0. 
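// Each lane of the calling warp holds the value published by one predecessor
// tile in this window, with lane 0 holding the closest one. Tiles whose
// inclusive prefix is already known act as segment tails, so the reduction
// only accumulates values up to (and including) the first such tile.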
int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(value, tail_flag, SwizzleScanOp(scan_op)); } // BlockScan prefix callback functor (called by the first warp) _CCCL_DEVICE _CCCL_FORCEINLINE T operator()(T block_aggregate) { // Update our status with our tile-aggregate if (threadIdx.x == 0) { detail::uninitialized_copy(&temp_storage.block_aggregate, block_aggregate); tile_status.SetPartial(tile_idx, block_aggregate); } int predecessor_idx = tile_idx - threadIdx.x - 1; StatusWord predecessor_status; T window_aggregate; // Wait for the warp-wide window of predecessor tiles to become valid DelayConstructorT construct_delay(tile_idx); ProcessWindow(predecessor_idx, predecessor_status, window_aggregate, construct_delay()); // The exclusive tile prefix starts out as the current window aggregate exclusive_prefix = window_aggregate; // Keep sliding the window back until we come across a tile whose inclusive prefix is known while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) { predecessor_idx -= CUB_PTX_WARP_THREADS; // Update exclusive tile prefix with the window prefix ProcessWindow(predecessor_idx, predecessor_status, window_aggregate, construct_delay()); exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); } // Compute the inclusive tile prefix and update the status for this tile if (threadIdx.x == 0) { inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); tile_status.SetInclusive(tile_idx, inclusive_prefix); detail::uninitialized_copy(&temp_storage.exclusive_prefix, exclusive_prefix); detail::uninitialized_copy(&temp_storage.inclusive_prefix, inclusive_prefix); } // Return exclusive_prefix return exclusive_prefix; } // Get the exclusive prefix stored in temporary storage _CCCL_DEVICE _CCCL_FORCEINLINE T GetExclusivePrefix() { return temp_storage.exclusive_prefix; } // Get the inclusive prefix stored in temporary storage _CCCL_DEVICE _CCCL_FORCEINLINE T GetInclusivePrefix() { return temp_storage.inclusive_prefix; } // Get the block aggregate stored in temporary storage _CCCL_DEVICE _CCCL_FORCEINLINE T GetBlockAggregate() { return temp_storage.block_aggregate; } _CCCL_DEVICE _CCCL_FORCEINLINE int GetTileIdx() const { return tile_idx; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/000077500000000000000000000000001463375617100150535ustar00rootroot00000000000000cccl-2.5.0/cub/cub/block/block_adjacent_difference.cuh000066400000000000000000001356541463375617100226670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file The cub::BlockAdjacentDifference class provides collective methods for computing //! the differences of adjacent elements partitioned across a CUDA thread block. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN //! @rst //! BlockAdjacentDifference provides :ref:`collective ` methods for computing the //! differences of adjacent elements partitioned across a CUDA thread block. //! //! Overview //! ++++++++++++++++ //! //! BlockAdjacentDifference calculates the differences of adjacent elements in the elements partitioned across a CUDA //! thread block. Because the binary operation could be noncommutative, there are two sets of methods. //! Methods named SubtractLeft subtract left element ``i - 1`` of input sequence from current element ``i``. //! Methods named SubtractRight subtract the right element ``i + 1`` from the current one ``i``: //! //! .. code-block:: c++ //! //! int values[4]; // [1, 2, 3, 4] //! //... //! int subtract_left_result[4]; <-- [ 1, 1, 1, 1 ] //! int subtract_right_result[4]; <-- [ -1, -1, -1, 4 ] //! //! - For SubtractLeft, if the left element is out of bounds, the input value is assigned to ``output[0]`` //! without modification. //! - For SubtractRight, if the right element is out of bounds, the input value is assigned to the current output value //! without modification. //! - The block/example_block_reduce_dyn_smem.cu example under the examples/block folder illustrates usage of //! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. //! This example can be easily adapted to the storage required by BlockAdjacentDifference. //! //! A Simple Example //! ++++++++++++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to //! compute the left difference between adjacent elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block of //! // 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! 
// Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute adjacent_difference //! int result[4]; //! //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft( //! thread_data, //! result, //! CustomDifference()); //! //! Suppose the set of input `thread_data` across the block of threads is //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``. //! The corresponding output ``result`` in those threads will be //! ``{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``. //! //! @endrst template class BlockAdjacentDifference { private: /// The thread block size in threads static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; /// Shared memory storage layout type (last element from each thread's input) struct _TempStorage { T first_items[BLOCK_THREADS]; T last_items[BLOCK_THREADS]; }; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Specialization for when FlagOp has third index param template ::HAS_PARAM> struct ApplyOp { // Apply flag operator static _CCCL_DEVICE _CCCL_FORCEINLINE T FlagT(FlagOp flag_op, const T& a, const T& b, int idx) { return flag_op(b, a, idx); } }; /// Specialization for when FlagOp does not have a third index param template struct ApplyOp { // Apply flag operator static _CCCL_DEVICE _CCCL_FORCEINLINE T FlagT(FlagOp flag_op, const T& a, const T& b, int /*idx*/) { return flag_op(b, a); } }; /// Templated unrolling of item comparison (inductive case) struct Iterate { /** * Head flags * * @param[out] flags Calling thread's discontinuity head_flags * @param[in] input Calling thread's input items * @param[out] preds Calling thread's predecessor items * @param[in] flag_op Binary boolean flag predicate */ template static _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads( int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op) { #pragma unroll for (int i = 1; i < ITEMS_PER_THREAD; ++i) { preds[i] = input[i - 1]; flags[i] = ApplyOp::FlagT(flag_op, preds[i], input[i], (linear_tid * ITEMS_PER_THREAD) + i); } } /** * Tail flags * * @param[out] flags Calling thread's discontinuity head_flags * @param[in] input Calling thread's input items * @param[in] flag_op Binary boolean flag predicate */ template static _CCCL_DEVICE _CCCL_FORCEINLINE void FlagTails(int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i) { flags[i] = ApplyOp::FlagT(flag_op, input[i], input[i + 1], (linear_tid * ITEMS_PER_THREAD) + i + 1); } } }; /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// @smemstorage{BlockAdjacentDifference} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage _CCCL_DEVICE _CCCL_FORCEINLINE BlockAdjacentDifference() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @brief Collective constructor using the specified memory allocation as temporary storage //! 
@param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE BlockAdjacentDifference(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Read left operations //! @{ //! @rst //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between //! adjacent elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block //! // of 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute adjacent_difference //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft( //! thread_data, //! thread_data, //! CustomDifference()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``. //! The corresponding output ``result`` in those threads will be //! ``{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``. //! @endrst //! //! @param[out] output //! Calling thread's adjacent difference result //! //! @param[in] input //! Calling thread's input items (may be aliased to `output`) //! //! @param[in] difference_op //! Binary difference operator template _CCCL_DEVICE _CCCL_FORCEINLINE void SubtractLeft(T (&input)[ITEMS_PER_THREAD], OutputType (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } if (linear_tid == 0) { output[0] = input[0]; } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } //! @rst //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between //! adjacent elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block of //! // 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! 
// The last item in the previous tile: //! int tile_predecessor_item = ...; //! //! // Collectively compute adjacent_difference //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft( //! thread_data, //! thread_data, //! CustomDifference(), //! tile_predecessor_item); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``. //! and that `tile_predecessor_item` is `3`. The corresponding output //! ``result`` in those threads will be //! ``{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }``. //! @endrst //! //! @param[out] output //! Calling thread's adjacent difference result //! //! @param[in] input //! Calling thread's input items (may be aliased to `output`) //! //! @param[in] difference_op //! Binary difference operator //! //! @param[in] tile_predecessor_item //! @rst //! *thread*\ :sub:`0` only item which is going to be subtracted from the first tile item //! (*input*\ :sub:`0` from *thread*\ :sub:`0`). //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void SubtractLeft( T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } // Set flag for first thread-item if (linear_tid == 0) { output[0] = difference_op(input[0], tile_predecessor_item); } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } //! @rst //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between //! adjacent elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block of //! // 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! int valid_items = 9; //! //! // Collectively compute adjacent_difference //! BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile( //! thread_data, //! thread_data, //! CustomDifference(), //! valid_items); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``. //! The corresponding output ``result`` in those threads will be //! ``{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }``. //! @endrst //! //! @param[out] output //! Calling thread's adjacent difference result //! //! @param[in] input //! Calling thread's input items (may be aliased to `output`) //! //! @param[in] difference_op //! Binary difference operator //! //! @param[in] valid_items //! 
Number of valid items in thread block template _CCCL_DEVICE _CCCL_FORCEINLINE void SubtractLeftPartialTile( T (&input)[ITEMS_PER_THREAD], OutputType (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } } else { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { const int idx = linear_tid * ITEMS_PER_THREAD + item; if (idx < valid_items) { output[item] = difference_op(input[item], input[item - 1]); } else { output[item] = input[item]; } } } if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD) { output[0] = input[0]; } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } //! @rst //! Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. //! //! - @rowmajor //! - @smemreuse //! //! //! Snippet //! +++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the left difference between //! adjacent elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block of //! // 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! int valid_items = 9; //! int tile_predecessor_item = 4; //! //! // Collectively compute adjacent_difference //! BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile( //! thread_data, //! thread_data, //! CustomDifference(), //! valid_items, //! tile_predecessor_item); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``. //! The corresponding output ``result`` in those threads will be //! ``{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }``. //! @endrst //! //! @param[out] output //! Calling thread's adjacent difference result //! //! @param[in] input //! Calling thread's input items (may be aliased to `output`) //! //! @param[in] difference_op //! Binary difference operator //! //! @param[in] valid_items //! Number of valid items in thread block //! //! @param[in] tile_predecessor_item //! @rst //! *thread*\ :sub:`0` only item which is going to be subtracted from the first tile item //! (*input*\ :sub:`0` from *thread*\ :sub:`0`). //! 
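If the calling thread's entire range of items lies at or beyond ``valid_items``, //!
this item is not used and ``input[0]`` is copied to ``output[0]`` unchanged. //!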
@endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void SubtractLeftPartialTile( T (&input)[ITEMS_PER_THREAD], OutputType (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items, T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { output[item] = difference_op(input[item], input[item - 1]); } } else { #pragma unroll for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) { const int idx = linear_tid * ITEMS_PER_THREAD + item; if (idx < valid_items) { output[item] = difference_op(input[item], input[item - 1]); } else { output[item] = input[item]; } } } if (valid_items <= linear_tid * ITEMS_PER_THREAD) { output[0] = input[0]; } else if (linear_tid == 0) { output[0] = difference_op(input[0], tile_predecessor_item); } else { output[0] = difference_op(input[0], temp_storage.last_items[linear_tid - 1]); } } //! @} end member group //! @name Read right operations //! @{ //! //! @rst //! //! Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between //! adjacent elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block of //! // 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute adjacent_difference //! BlockAdjacentDifferenceT(temp_storage).SubtractRight( //! thread_data, //! thread_data, //! CustomDifference()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``. //! The corresponding output ``result`` in those threads will be //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }``. //! @endrst //! //! @param[out] output //! Calling thread's adjacent difference result //! //! @param[in] input //! Calling thread's input items (may be aliased to `output`) //! //! @param[in] difference_op //! Binary difference operator template _CCCL_DEVICE _CCCL_FORCEINLINE void SubtractRight(T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) { output[item] = difference_op(input[item], input[item + 1]); } if (linear_tid == BLOCK_THREADS - 1) { output[ITEMS_PER_THREAD - 1] = input[ITEMS_PER_THREAD - 1]; } else { output[ITEMS_PER_THREAD - 1] = difference_op(input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1]); } } //! @rst //! Subtracts the right element of each adjacent pair of elements partitioned across a CUDA thread block. //! //! - @rowmajor //! - @smemreuse //! //! 
Snippet //! +++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between //! adjacent elements. //! //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block of //! // 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // The first item in the next tile: //! int tile_successor_item = ...; //! //! // Collectively compute adjacent_difference //! BlockAdjacentDifferenceT(temp_storage).SubtractRight( //! thread_data, //! thread_data, //! CustomDifference(), //! tile_successor_item); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``, //! and that ``tile_successor_item`` is ``3``. The corresponding output ``result`` //! in those threads will be //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }``. //! @endrst //! //! @param[out] output //! Calling thread's adjacent difference result //! //! @param[in] input //! Calling thread's input items (may be aliased to `output`) //! //! @param[in] difference_op //! Binary difference operator //! //! @param[in] tile_successor_item //! @rst //! *thread*\ :sub:`BLOCK_THREADS` only item which is going to be subtracted from the last tile item //! (*input*\ :sub:`ITEMS_PER_THREAD` from *thread*\ :sub:`BLOCK_THREADS`). //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void SubtractRight( T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item // Last thread : temp_storage.first_items[linear_tid + 1]; #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) { output[item] = difference_op(input[item], input[item + 1]); } output[ITEMS_PER_THREAD - 1] = difference_op(input[ITEMS_PER_THREAD - 1], successor_item); } //! @rst //! Subtracts the right element of each adjacent pair in range of elements partitioned across a CUDA thread block. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates how to use BlockAdjacentDifference to compute the right difference between //! adjacent elements. //! //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockAdjacentDifference for a 1D block of //! // 128 threads of type int //! using BlockAdjacentDifferenceT = //! cub::BlockAdjacentDifference; //! //! // Allocate shared memory for BlockAdjacentDifference //! __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; //! //! 
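int valid_items = 507; //!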
// Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute adjacent_difference //! BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile( //! thread_data, //! thread_data, //! CustomDifference(), //! valid_items); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }``. //! and that ``valid_items`` is ``507``. The corresponding output ``result`` in //! those threads will be //! ``{ ...-1, [2,1,0,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }``. //! @endrst //! //! @param[out] output //! Calling thread's adjacent difference result //! //! @param[in] input //! Calling thread's input items (may be aliased to `output`) //! //! @param[in] difference_op //! Binary difference operator //! //! @param[in] valid_items //! Number of valid items in thread block template _CCCL_DEVICE _CCCL_FORCEINLINE void SubtractRightPartialTile( T (&input)[ITEMS_PER_THREAD], OutputT (&output)[ITEMS_PER_THREAD], DifferenceOpT difference_op, int valid_items) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); if ((linear_tid + 1) * ITEMS_PER_THREAD < valid_items) { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD - 1; item++) { output[item] = difference_op(input[item], input[item + 1]); } output[ITEMS_PER_THREAD - 1] = difference_op(input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1]); } else { #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = linear_tid * ITEMS_PER_THREAD + item; // Right element of input[valid_items - 1] is out of bounds. // According to the API it's copied into output array // without modification. if (idx < valid_items - 1) { output[item] = difference_op(input[item], input[item + 1]); } else { output[item] = input[item]; } } } } //! @} end member group //! @name Head flag operations (deprecated) //! @{ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. * * @param[out] output * Calling thread's discontinuity head_flags * * @param[in] input * Calling thread's input items * * @param[out] preds * Calling thread's predecessor items * * @param[in] flag_op * Binary boolean flag predicate */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads( FlagT (&output)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if (linear_tid == 0) { // Set flag for first thread-item (preds[0] is undefined) output[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; output[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set output for remaining items Iterate::FlagHeads(linear_tid, output, input, preds, flag_op); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. 
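 *
 * A rough migration sketch (illustrative only; the functor name is a
 * placeholder, not part of the library): with a comparison functor such as
 *
 *   struct NotEqualTo
 *   {
 *     template <typename DataType>
 *     __device__ int operator()(DataType& lhs, DataType& rhs)
 *     {
 *       return lhs != rhs;
 *     }
 *   };
 *
 * SubtractLeft(input, output, NotEqualTo{}) produces the same head flags for
 * every item except the tile's first item, which SubtractLeft copies through
 * unchanged whereas FlagHeads always flags it.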
* * @param[out] output * Calling thread's discontinuity result * * @param[in] input * Calling thread's input items * * @param[out] preds * Calling thread's predecessor items * * @param[in] flag_op * Binary boolean flag predicate * * @param[in] tile_predecessor_item * [thread0 only] Item with which to compare the first tile item * (input0 from thread0). */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads( FlagT (&output)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; output[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set output for remaining items Iterate::FlagHeads(linear_tid, output, input, preds, flag_op); } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. * * @param[out] output * Calling thread's discontinuity result * * @param[in] input * Calling thread's input items * * @param[in] flag_op * Binary boolean flag predicate */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. * * @param[out] output * Calling thread's discontinuity result * * @param[in] input * Calling thread's input items * * @param[in] flag_op * Binary boolean flag predicate * * @param[in] tile_predecessor_item * [thread0 only] Item with which to compare the first tile item * (input0 from thread0). */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item) { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op, tile_predecessor_item); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. * * @param output * [out] Calling thread's discontinuity result * * @param input * [in] Calling thread's input items * * @param flag_op * [in] Binary boolean flag predicate */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagTails(FlagT (&output)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item output[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set output for remaining items Iterate::FlagTails(linear_tid, output, input, flag_op); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. 
* * @param[out] output * Calling thread's discontinuity result * * @param[in] input * Calling thread's input items * * @param[in] flag_op * Binary boolean flag predicate * * @param[in] tile_successor_item * [threadBLOCK_THREADS-1 only] Item with which to compare * the last tile item (inputITEMS_PER_THREAD-1 from * threadBLOCK_THREADS-1). */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagTails(FlagT (&output)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; output[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set output for remaining items Iterate::FlagTails(linear_tid, output, input, flag_op); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. * * @param[out] head_flags * Calling thread's discontinuity head_flags * * @param[out] tail_flags * Calling thread's discontinuity tail_flags * * @param[in] input * Calling thread's input items * * @param[in] flag_op * Binary boolean flag predicate */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = temp_storage.last_items[linear_tid - 1]; if (linear_tid == 0) { head_flags[0] = 1; } else { head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. * * @param[out] head_flags * Calling thread's discontinuity head_flags * * @param[out] tail_flags * Calling thread's discontinuity tail_flags * * @param[in] tile_successor_item * [threadBLOCK_THREADS-1 only] Item with which to compare * the last tile item (inputITEMS_PER_THREAD-1 from * threadBLOCK_THREADS-1). 
* * @param[in] input * Calling thread's input items * * @param[in] flag_op * Binary boolean flag predicate */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], FlagT (&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item if (linear_tid == 0) { head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. * * @param[out] head_flags * Calling thread's discontinuity head_flags * * @param[in] tile_predecessor_item * [thread0 only] Item with which to compare the first tile item * (input0 from thread0). * * @param[out] tail_flags * Calling thread's discontinuity tail_flags * * @param[in] input * Calling thread's input items * * @param[in] flag_op * Binary boolean flag predicate */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } /** * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or * cub::BlockAdjacentDifference::SubtractRight instead. * * @param head_flags * [out] Calling thread's discontinuity head_flags * * @param tile_predecessor_item * [in] [thread0 only] Item with which to compare the first tile * item (input0 from thread0). 
* * @param tail_flags * [out] Calling thread's discontinuity tail_flags * * @param tile_successor_item * [in] [threadBLOCK_THREADS-1 only] Item with which to * compare the last tile item (inputITEMS_PER_THREAD-1 from * threadBLOCK_THREADS-1). * * @param input * [in] Calling thread's input items * * @param flag_op * [in] Binary boolean flag predicate */ template CUB_DEPRECATED _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT (&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_discontinuity.cuh000066400000000000000000001360261463375617100216430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for * flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN //! @rst //! The BlockDiscontinuity class provides :ref:`collective ` methods for //! flagging discontinuities within an ordered set of items partitioned across a CUDA thread //! block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - A set of "head flags" (or "tail flags") is often used to indicate corresponding items //! that differ from their predecessors (or successors). For example, head flags are convenient //! for demarcating disjoint data segments as part of a segmented scan or reduction. //! - @blocked //! //! Performance Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - @granularity //! - Incurs zero bank conflicts for most types //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockDiscontinuity} //! //! The code snippet below illustrates the head flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute head flags for discontinuities in the segment //! int head_flags[4]; //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``. //! The corresponding output ``head_flags`` in those threads will be //! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``. //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``examples/block/example_block_reduce_dyn_smem.cu`` example illustrates usage of //! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. //! This example can be easily adapted to the storage required by BlockDiscontinuity. //! @endrst //! //! @tparam T //! The data type to be flagged. //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! 
**[optional]** Unused template class BlockDiscontinuity { private: enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Shared memory storage layout type (last element from each thread's input) struct _TempStorage { T first_items[BLOCK_THREADS]; T last_items[BLOCK_THREADS]; }; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Specialization for when FlagOp has third index param template ::HAS_PARAM> struct ApplyOp { // Apply flag operator static _CCCL_DEVICE _CCCL_FORCEINLINE bool FlagT(FlagOp flag_op, const T& a, const T& b, int idx) { return flag_op(a, b, idx); } }; /// Specialization for when FlagOp does not have a third index param template struct ApplyOp { // Apply flag operator static _CCCL_DEVICE _CCCL_FORCEINLINE bool FlagT(FlagOp flag_op, const T& a, const T& b, int /*idx*/) { return flag_op(a, b); } }; /// Templated unrolling of item comparison (inductive case) struct Iterate { /** * @brief Head flags * * @param[out] flags * Calling thread's discontinuity head_flags * * @param[in] input * Calling thread's input items * * @param[out] preds * Calling thread's predecessor items * * @param[in] flag_op * Binary boolean flag predicate */ template static _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads( int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op) { #pragma unroll for (int i = 1; i < ITEMS_PER_THREAD; ++i) { preds[i] = input[i - 1]; flags[i] = ApplyOp::FlagT(flag_op, preds[i], input[i], (linear_tid * ITEMS_PER_THREAD) + i); } } /** * @brief Tail flags * * @param[out] flags * Calling thread's discontinuity head_flags * * @param[in] input * Calling thread's input items * * @param[in] flag_op * Binary boolean flag predicate */ template static _CCCL_DEVICE _CCCL_FORCEINLINE void FlagTails(int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i) { flags[i] = ApplyOp::FlagT(flag_op, input[i], input[i + 1], (linear_tid * ITEMS_PER_THREAD) + i + 1); } } }; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// @smemstorage{BlockDiscontinuity} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ /** * @brief Collective constructor using a private static allocation of shared memory as temporary * storage. */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockDiscontinuity() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockDiscontinuity(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Head flag operations //! 
@{ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * @param[out] head_flags * Calling thread's discontinuity head_flags * * @param[in] input * Calling thread's input items * * @param[out] preds * Calling thread's predecessor items * * @param[in] flag_op * Binary boolean flag predicate */ template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads( FlagT (&head_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); if (linear_tid == 0) { // Set flag for first thread-item (preds[0] is undefined) head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } /** * @param[out] head_flags * Calling thread's discontinuity head_flags * * @param[in] input * Calling thread's input items * * @param[out] preds * Calling thread's predecessor items * * @param[in] flag_op * Binary boolean flag predicate * * @param[in] tile_predecessor_item * [thread0 only] Item with which to compare the first tile item * (input0 from thread0). */ template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads( FlagT (&head_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } #endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Sets head flags indicating discontinuities between items partitioned across the thread //! block, for which the first item has no reference and is always flagged. //! //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns //! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in //! the previous thread). //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the head-flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute head flags for discontinuities in the segment //! int head_flags[4]; //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! 
``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``. //! The corresponding output ``head_flags`` in those threads will be //! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` //! if a discontinuity exists between `a` and `b`, otherwise `false`. //! `b_index` is the rank of b in the aggregate tile of data. //! //! @param[out] head_flags //! Calling thread's discontinuity head_flags //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! Binary boolean flag predicate template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op); } //! @rst //! Sets head flags indicating discontinuities between items partitioned across the thread block. //! //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item //! in the previous thread). //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the head-flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Have thread0 obtain the predecessor item for the entire tile //! int tile_predecessor_item; //! if (threadIdx.x == 0) tile_predecessor_item == ... //! //! // Collectively compute head flags for discontinuities in the segment //! int head_flags[4]; //! BlockDiscontinuity(temp_storage).FlagHeads( //! head_flags, thread_data, cub::Inequality(), tile_predecessor_item); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``, //! and that ``tile_predecessor_item`` is ``0``. The corresponding output ``head_flags`` in those //! threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, //! 
and returning `true` if a discontinuity exists between `a` and `b`, //! otherwise `false`. `b_index` is the rank of b in the aggregate tile of data. //! //! @param[out] head_flags //! Calling thread's discontinuity `head_flags` //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! Binary boolean flag predicate //! //! @param[in] tile_predecessor_item //! @rst //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`). //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads( FlagT (&head_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item) { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); } //! @} end member group //! @name Tail flag operations //! @{ //! @rst //! Sets tail flags indicating discontinuities between items partitioned across the thread //! block, for which the last item has no reference and is always flagged. //! //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when //! ``flag_op(input[i], next-item)`` //! returns ``true`` (where `next-item` is either the next item //! in the same thread or the first item in the next thread). //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the tail-flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute tail flags for discontinuities in the segment //! int tail_flags[4]; //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``. //! The corresponding output ``tail_flags`` in those threads will be //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the //! rank of `b` in the aggregate tile of data. //! //! @param[out] tail_flags //! Calling thread's discontinuity tail_flags //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! 
Binary boolean flag predicate template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //! @rst //! Sets tail flags indicating discontinuities between items partitioned across the thread block. //! //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)`` //! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in //! the next thread). //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared against //! ``tile_successor_item``. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the tail-flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Have thread127 obtain the successor item for the entire tile //! int tile_successor_item; //! if (threadIdx.x == 127) tile_successor_item == ... //! //! // Collectively compute tail flags for discontinuities in the segment //! int tail_flags[4]; //! BlockDiscontinuity(temp_storage).FlagTails( //! tail_flags, thread_data, cub::Inequality(), tile_successor_item); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`` //! and that ``tile_successor_item`` is ``125``. The corresponding output ``tail_flags`` in those //! threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the //! rank of `b` in the aggregate tile of data. //! //! @param[out] tail_flags //! Calling thread's discontinuity tail_flags //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! Binary boolean flag predicate //! //! @param[in] tile_successor_item //! @rst //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to //! compare the last tile item (``input[ITEMS_PER_THREAD - 1]`` from //! *thread*\ :sub:`BLOCK_THREADS - 1`). 
//! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; CTA_SYNC(); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //! @} end member group //! @name Head & tail flag operations //! @{ //! @rst //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. //! //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns //! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in //! the previous thread). //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged. //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)`` //! returns ``true`` (where next-item is either the next item in the same thread or the first item in //! the next thread). //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute head and flags for discontinuities in the segment //! int head_flags[4]; //! int tail_flags[4]; //! BlockDiscontinuity(temp_storage).FlagTails( //! head_flags, tail_flags, thread_data, cub::Inequality()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`` //! and that the tile_successor_item is ``125``. The corresponding output ``head_flags`` //! in those threads will be ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``. //! and the corresponding output ``tail_flags`` in those threads will be //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the //! 
rank of `b` in the aggregate tile of data. //! //! @param[out] head_flags //! Calling thread's discontinuity head_flags //! //! @param[out] tail_flags //! Calling thread's discontinuity tail_flags //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! Binary boolean flag predicate template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item if (linear_tid == 0) { head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //! @rst //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. //! //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when //! ``flag_op(previous-item, input[i])`` returns ``true`` (where ``previous-item`` is either the preceding item //! in the same thread or the last item in the previous thread). //! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged. //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)`` returns ``true`` //! (where ``next-item`` is either the next item in the same thread or the first item in the next thread). //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared //! against ``tile_predecessor_item``. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Have thread127 obtain the successor item for the entire tile //! int tile_successor_item; //! if (threadIdx.x == 127) tile_successor_item == ... //! //! // Collectively compute head and flags for discontinuities in the segment //! int head_flags[4]; //! int tail_flags[4]; //! BlockDiscontinuity(temp_storage).FlagTails( //! head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }`` //! 
and that the tile_successor_item is ``125``. The corresponding output ``head_flags`` //! in those threads will be ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``. //! and the corresponding output ``tail_flags`` in those threads will be //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the //! rank of b in the aggregate tile of data. //! //! @param[out] head_flags //! Calling thread's discontinuity head_flags //! //! @param[out] tail_flags //! Calling thread's discontinuity tail_flags //! //! @param[in] tile_successor_item //! @rst //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare //! the last tile item (``input[ITEMS_PER_THREAD - 1]`` from //! *thread*\ :sub:`BLOCK_THREADS - 1`). //! @endrst //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! Binary boolean flag predicate template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], FlagT (&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item if (linear_tid == 0) { head_flags[0] = 1; } else { preds[0] = temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); } // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //! @rst //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. //! //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item //! in the previous thread). //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``. //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when //! ``flag_op(input[i], next-item)`` returns ``true`` (where ``next-item`` is either the next item //! in the same thread or the first item in the next thread). //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item //! ``input[ITEMS_PER_THREAD - 1]`` is always flagged. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that //! 
are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Have thread0 obtain the predecessor item for the entire tile //! int tile_predecessor_item; //! if (threadIdx.x == 0) tile_predecessor_item == ... //! //! // Have thread127 obtain the successor item for the entire tile //! int tile_successor_item; //! if (threadIdx.x == 127) tile_successor_item == ... //! //! // Collectively compute head and flags for discontinuities in the segment //! int head_flags[4]; //! int tail_flags[4]; //! BlockDiscontinuity(temp_storage).FlagTails( //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item, //! thread_data, cub::Inequality()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``, //! that the ``tile_predecessor_item`` is ``0``, and that the ``tile_successor_item`` is ``125``. //! The corresponding output ``head_flags`` in those threads will be //! ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``, and the corresponding output ``tail_flags`` //! in those threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` //! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank //! of b in the aggregate tile of data. //! //! @param[out] head_flags //! Calling thread's discontinuity head_flags //! //! @param[in] tile_predecessor_item //! @rst //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`). //! @endrst //! //! @param[out] tail_flags //! Calling thread's discontinuity tail_flags //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! Binary boolean flag predicate template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
1 : // Last thread ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //! @rst //! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. //! //! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` //! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in //! the previous thread). //! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``. //! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)`` //! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in //! the next thread). //! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared //! against ``tile_successor_item``. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the head- and tail-flagging of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int //! typedef cub::BlockDiscontinuity BlockDiscontinuity; //! //! // Allocate shared memory for BlockDiscontinuity //! __shared__ typename BlockDiscontinuity::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Have thread0 obtain the predecessor item for the entire tile //! int tile_predecessor_item; //! if (threadIdx.x == 0) tile_predecessor_item == ... //! //! // Have thread127 obtain the successor item for the entire tile //! int tile_successor_item; //! if (threadIdx.x == 127) tile_successor_item == ... //! //! // Collectively compute head and flags for discontinuities in the segment //! int head_flags[4]; //! int tail_flags[4]; //! BlockDiscontinuity(temp_storage).FlagTails( //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item, //! thread_data, cub::Inequality()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``, //! that the ``tile_predecessor_item`` is ``0``, and that the //! ``tile_successor_item`` is ``125``. The corresponding output ``head_flags`` //! in those threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``. //! and the corresponding output ``tail_flags`` in those threads will be //! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``. //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam FlagT //! **[inferred]** The flag type (must be an integer type) //! //! @tparam FlagOp //! **[inferred]** Binary predicate functor type having member //! `T operator()(const T &a, const T &b)` or member //! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true` //! 
if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank //! of `b` in the aggregate tile of data. //! //! @param[out] head_flags //! Calling thread's discontinuity head_flags //! //! @param[in] tile_predecessor_item //! @rst //! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`). //! @endrst //! //! @param[out] tail_flags //! Calling thread's discontinuity tail_flags //! //! @param[in] tile_successor_item //! @rst //! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare the last tile item //! (``input[ITEMS_PER_THREAD - 1]`` from *thread*\ :sub:`BLOCK_THREADS - 1`). //! @endrst //! //! @param[in] input //! Calling thread's input items //! //! @param[in] flag_op //! Binary boolean flag predicate template _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails( FlagT (&head_flags)[ITEMS_PER_THREAD], T tile_predecessor_item, FlagT (&tail_flags)[ITEMS_PER_THREAD], T tile_successor_item, T (&input)[ITEMS_PER_THREAD], FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); T preds[ITEMS_PER_THREAD]; // Set flag for first thread-item preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread temp_storage.last_items[linear_tid - 1]; head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); // Set flag for last thread-item T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread temp_storage.first_items[linear_tid + 1]; tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); // Set head_flags for remaining items Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); // Set tail_flags for remaining items Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_exchange.cuh000066400000000000000000001332141463375617100205140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file The cub::BlockExchange class provides :ref:`collective ` methods for //! rearranging data partitioned across a CUDA thread block. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! The BlockExchange class provides :ref:`collective ` methods for rearranging data partitioned //! across a CUDA thread block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - It is commonplace for blocks of threads to rearrange data items between //! threads. For example, the device-accessible memory subsystem prefers access patterns //! where data items are "striped" across threads (where consecutive threads access consecutive items), //! yet most block-wide operations prefer a "blocked" partitioning of items across threads //! (where consecutive items belong to a single thread). //! - BlockExchange supports the following types of data exchanges: //! //! - Transposing between :ref:`blocked ` and :ref:`striped ` //! arrangements //! - Transposing between :ref:`blocked ` and //! :ref:`warp-striped ` arrangements //! - Scattering ranked items to a :ref:`blocked arrangement ` //! - Scattering ranked items to a :ref:`striped arrangement ` //! //! - @rowmajor //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockExchange} //! //! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement //! of 512 integer items partitioned across 128 threads where each thread owns 4 items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockExchange BlockExchange; //! //! // Allocate shared memory for BlockExchange //! __shared__ typename BlockExchange::TempStorage temp_storage; //! //! // Load a tile of data striped across threads //! int thread_data[4]; //! cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); //! //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data); //! //! Suppose the set of striped input ``thread_data`` across the block of threads is //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! //! Performance Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - Proper device-specific padding ensures zero bank conflicts for most types. //! //! 
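//! - As a rough sizing sketch (assuming 32 threads per warp and ``ITEMS_PER_THREAD = 4``, for which no
//!   bank-conflict padding is inserted): a 128-thread block exchanging 4 ``int`` items per thread stages
//!   ``128 * 4 = 512`` elements (2 KB) of shared memory, whereas the same exchange with
//!   ``WARP_TIME_SLICING = true`` stages only a single warp's sub-tile of ``32 * 4 = 128`` elements (512 B),
//!   traded against additional synchronization rounds.
//!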
Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with //! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to //! the storage required by BlockExchange. //! @endrst //! //! @tparam T //! The data type to be exchanged //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD //! The number of items partitioned onto each thread. //! //! @tparam WARP_TIME_SLICING //! **[optional]** When `true`, only use enough shared memory for a single warp's worth of tile data, //! time-slicing the block-wide exchange over multiple synchronized rounds. //! Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! [optional] Unused. template class BlockExchange { private: /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0), SMEM_BANKS = 1 << LOG_SMEM_BANKS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise // we can typically use 128b loads) INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, }; /// Shared memory storage layout type struct __align__(16) _TempStorage { InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; }; public: /// @smemstorage{BlockExchange} struct TempStorage : Uninitialized<_TempStorage> {}; private: /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; unsigned int lane_id; unsigned int warp_id; unsigned int warp_offset; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } //! @brief Transposes data items from **blocked** arrangement to **striped** arrangement. //! Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. 
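// Illustrative index map for the non-time-sliced blocked-to-striped path below
// (a sketch assuming BLOCK_THREADS == 4, ITEMS_PER_THREAD == 2, and no padding):
//
//   write phase: thread t stores item i at offset t * ITEMS_PER_THREAD + i
//                t0 -> {0,1}   t1 -> {2,3}   t2 -> {4,5}   t3 -> {6,7}
//   read phase:  thread t loads item i from offset i * BLOCK_THREADS + t
//                t0 <- {0,4}   t1 <- {1,5}   t2 <- {2,6}   t3 <- {3,7}
//
// The CTA_SYNC() between the two phases is what makes this in-shared-memory transpose safe.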
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } output_items[ITEM] = temp_storage.buff[item_offset]; } } //! @brief Transposes data items from **blocked** arrangement to **striped** //! arrangement. Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // Read a strip of items const int STRIP_OFFSET = ITEM * BLOCK_THREADS; const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) { int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) { if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } temp_items[ITEM] = temp_storage.buff[item_offset]; } } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. //! Specialized for no timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. 
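// Note on the non-time-sliced blocked-to-warp-striped path below: each warp stages only
// its own sub-tile (WARP_TIME_SLICED_ITEMS elements starting at warp_offset), so a
// WARP_SYNC over the full warp mask is sufficient between the write and read phases;
// no block-wide CTA_SYNC is needed. Per-warp offsets (lane l, item i), padding aside:
//   write: warp_offset + l * ITEMS_PER_THREAD + i
//   read:  warp_offset + i * WARP_TIME_SLICED_THREADS + l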
template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } output_items[ITEM] = temp_storage.buff[item_offset]; } } //! @brief Transposes data items from **blocked** arrangement to **warp-striped** arrangement. //! Specialized for warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { if (warp_id == 0) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } output_items[ITEM] = temp_storage.buff[item_offset]; } } #pragma unroll for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) { CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } output_items[ITEM] = temp_storage.buff[item_offset]; } } } } //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. //! Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. 
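// Illustrative note on the INSERT_PADDING skew used throughout these exchanges
// (a sketch assuming 32 shared memory banks, i.e. LOG_SMEM_BANKS == 5): when
// ITEMS_PER_THREAD is a power of two greater than 4, raw offsets that differ by 32
// would land in the same bank, so offsets are skewed as
//   padded_offset = offset + (offset >> LOG_SMEM_BANKS)
// which maps 0 -> 0, 32 -> 33, 64 -> 66, ... . With 8 items per thread, for example,
// thread 0's first item (offset 0) and thread 4's first item (offset 32 -> 33) then
// fall into different banks.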
template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); // No timeslicing #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } output_items[ITEM] = temp_storage.buff[item_offset]; } } //! @brief Transposes data items from **striped** arrangement to **blocked** arrangement. //! Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { // Warp time-slicing InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // Write a strip of items const int STRIP_OFFSET = ITEM * BLOCK_THREADS; const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) { int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) { if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } } CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } temp_items[ITEM] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. //! Specialized for no timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. 
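// Quick reference for the warp-striped layout consumed by the non-time-sliced path below
// (a sketch assuming 32 threads per warp and ITEMS_PER_THREAD == 4): warp w owns the
// 32 * ITEMS_PER_THREAD consecutive tile items beginning at base = w * 128, and lane l of
// that warp holds items {base + l, base + l + 32, base + l + 64, base + l + 96}.
// WarpStripedToBlocked rearranges this so that the same lane ends up holding
// ITEMS_PER_THREAD consecutive items, again using only an intra-warp WARP_SYNC.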
template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(output_items + ITEM, temp_storage.buff[item_offset]); } } //! @brief Transposes data items from **warp-striped** arrangement to **blocked** arrangement. //! Specialized for warp-timeslicing //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) { CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } WARP_SYNC(0xffffffff); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } output_items[ITEM] = temp_storage.buff[item_offset]; } } } } //! @brief Exchanges data items annotated by rank into **blocked** arrangement. //! Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[in] ranks //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } output_items[ITEM] = temp_storage.buff[item_offset]; } } //! @brief Exchanges data items annotated by rank into **blocked** arrangement. //! Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! 
@param[in] ranks //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT ranks[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { CTA_SYNC(); const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM] - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) { if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } CTA_SYNC(); if (warp_id == SLICE) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } temp_items[ITEM] = temp_storage.buff[item_offset]; } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } //! @brief Exchanges data items annotated by rank into **striped** arrangement. //! Specialized for no timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[in] ranks //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } output_items[ITEM] = temp_storage.buff[item_offset]; } } //! @brief Exchanges data items annotated by rank into **striped** arrangement. //! Specialized for warp-timeslicing. //! //! @param[in] input_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[out] output_items //! Items to exchange, converting between **blocked** and **striped** arrangements. //! //! @param[in] ranks //! 
Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; #pragma unroll for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) { const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM] - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) { if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } detail::uninitialized_copy(temp_storage.buff + item_offset, input_items[ITEM]); } } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { // Read a strip of items const int STRIP_OFFSET = ITEM * BLOCK_THREADS; const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) { int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) { if (INSERT_PADDING) { item_offset += item_offset >> LOG_SMEM_BANKS; } temp_items[ITEM] = temp_storage.buff[item_offset]; } } } } // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { output_items[ITEM] = temp_items[ITEM]; } } public: //! @name Collective constructors //! @{ /** * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , lane_id(LaneId()) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockExchange(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , lane_id(LaneId()) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //! @} end member group //! @name Structured exchanges //! @{ //! @rst //! Transposes data items from **striped** arrangement to **blocked** arrangement. //! //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement //! of 512 integer items partitioned across 128 threads where each thread owns 4 items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockExchange BlockExchange; //! //! // Allocate shared memory for BlockExchange //! __shared__ typename BlockExchange::TempStorage temp_storage; //! //! // Load a tile of ordered data into a striped arrangement across block threads //! int thread_data[4]; //! cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); //! //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); //! //! 
Suppose the set of striped input ``thread_data`` across the block of threads is //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` after loading from //! device-accessible memory. The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { StripedToBlocked(input_items, output_items, Int2Type()); } //! @rst //! Transposes data items from **blocked** arrangement to **striped** arrangement. //! //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement //! of 512 integer items partitioned across 128 threads where each thread owns 4 items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockExchange BlockExchange; //! //! // Allocate shared memory for BlockExchange //! __shared__ typename BlockExchange::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively exchange data into a striped arrangement across threads //! BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); //! //! // Store data striped across block threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! //! Suppose the set of blocked input ``thread_data`` across the block of threads is //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }`` in //! preparation for storing to device-accessible memory. //! @endrst //! //! @param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items, Int2Type()); } //! @rst //! Transposes data items from **warp-striped** arrangement to **blocked** arrangement. //! //! - @smemreuse //! //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" //! arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 //! items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockExchange BlockExchange; //! //! // Allocate shared memory for BlockExchange //! __shared__ typename BlockExchange::TempStorage temp_storage; //! //! // Load a tile of ordered data into a warp-striped arrangement across warp threads //! 
int thread_data[4]; //! cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); //! //! // Collectively exchange data into a blocked arrangement across threads //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); //! //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` //! after loading from device-accessible memory. (The first 128 items are striped across //! the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @endrst //! //! @param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(input_items, output_items, Int2Type()); } //! @rst //! Transposes data items from **blocked** arrangement to **warp-striped** arrangement. //! //! - @smemreuse //! //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" //! arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 //! items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockExchange BlockExchange; //! //! // Allocate shared memory for BlockExchange //! __shared__ typename BlockExchange::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively exchange data into a warp-striped arrangement across threads //! BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); //! //! // Store data striped across warp threads into an ordered tile //! cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); //! //! Suppose the set of blocked input ``thread_data`` across the block of threads is //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` //! in preparation for storing to device-accessible memory. (The first 128 items are striped //! across the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) //! @endrst //! //! @param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(input_items, output_items, Int2Type()); } //! @} end member group //! @name Scatter exchanges //! @{ //! @rst //! Exchanges data items annotated by rank into **blocked** arrangement. //! //! - @smemreuse //! @endrst //! //! @tparam OffsetT //! **[inferred]** Signed integer type for local offsets //! //! 
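// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): pairing warp-striped
// I/O with the WarpStripedToBlocked / BlockedToWarpStriped exchanges
// documented above.  Kernel and pointer names are hypothetical; the block
// size must be a multiple of the warp size for warp-striped accesses.
// ---------------------------------------------------------------------------
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

__global__ void WarpStripedExample(int* d_data)
{
  using BlockExchangeT = cub::BlockExchange<int, 128, 4>;
  __shared__ typename BlockExchangeT::TempStorage temp_storage;

  // Warp-striped load: each warp reads a contiguous 128-item segment
  int thread_data[4];
  cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);

  // Convert to a blocked arrangement for per-thread sequential processing
  BlockExchangeT(temp_storage).WarpStripedToBlocked(thread_data, thread_data);

  // ... per-thread work on thread_data ...

  // Sync before reusing temp_storage, convert back, and store warp-striped
  __syncthreads();
  BlockExchangeT(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
  cub::StoreDirectWarpStriped(threadIdx.x, d_data, thread_data);
}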
@param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. //! //! @param[in] ranks //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(input_items, output_items, ranks, Int2Type()); } //! @rst //! Exchanges data items annotated by rank into **striped** arrangement. //! //! - @smemreuse //! //! @endrst //! //! @tparam OffsetT //! **[inferred]** Signed integer type for local offsets //! //! @param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. //! //! @param[in] ranks //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(input_items, output_items, ranks, Int2Type()); } //! @rst //! Exchanges data items annotated by rank into **striped** arrangement. Items with rank -1 are not exchanged. //! //! - @smemreuse //! //! @endrst //! //! @tparam OffsetT //! **[inferred]** Signed integer type for local offsets //! //! @param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. //! //! @param[in] ranks //! Corresponding scatter ranks template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } if (ranks[ITEM] >= 0) { temp_storage.buff[item_offset] = input_items[ITEM]; } } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } output_items[ITEM] = temp_storage.buff[item_offset]; } } //! @rst //! Exchanges valid data items annotated by rank into **striped** arrangement. //! //! - @smemreuse //! //! @endrst //! //! @tparam OffsetT //! **[inferred]** Signed integer type for local offsets //! //! @tparam ValidFlag //! **[inferred]** FlagT type denoting which items are valid //! //! @param[in] input_items //! Items to exchange, converting between **striped** and **blocked** arrangements. //! //! @param[out] output_items //! Items from exchange, converting between **striped** and **blocked** arrangements. //! //! @param[in] ranks //! Corresponding scatter ranks //! //! @param[in] is_valid //! 
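// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): using
// ScatterToStriped with a caller-computed rank array.  Here the ranks simply
// mirror the tile, so they form a permutation of [0, tile size) as the
// interface expects.  Names (ScatterExample, d_data) are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

__global__ void ScatterExample(int* d_data)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;
  using BlockExchangeT = cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD>;

  __shared__ typename BlockExchangeT::TempStorage temp_storage;

  // Blocked input: thread t owns items [t * 4, t * 4 + 4)
  int thread_data[ITEMS_PER_THREAD];
  cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);

  // Rank each item at its mirrored position within the tile
  int ranks[ITEMS_PER_THREAD];
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    const int blocked_pos = threadIdx.x * ITEMS_PER_THREAD + i;
    ranks[i]              = TILE_ITEMS - 1 - blocked_pos;
  }

  // Scatter by rank directly into a striped arrangement, then store
  BlockExchangeT(temp_storage).ScatterToStriped(thread_data, thread_data, ranks);
  cub::StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_data, thread_data);
}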
Corresponding flag denoting item validity template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = ranks[ITEM]; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } if (is_valid[ITEM]) { temp_storage.buff[item_offset] = input_items[ITEM]; } } CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } output_items[ITEM] = temp_storage.buff[item_offset]; } } //! @} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. */ _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) { StripedToBlocked(items, items); } /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. */ _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD]) { BlockedToStriped(items, items); } /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. */ _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(items, items); } /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. */ _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(items, items); } /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. * * @param[in] ranks * Corresponding scatter ranks */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(items, items, ranks); } /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. * * @param[in] ranks * Corresponding scatter ranks */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. * * @param[in] ranks * Corresponding scatter ranks */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStripedGuarded(items, items, ranks); } /** * @param[in-out] items * Items to exchange, converting between **striped** and **blocked** arrangements. 
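// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the single-array
// overloads guarded by DOXYGEN_SHOULD_SKIP_THIS above exchange items in
// place, which is convenient when the input and output types match.  Names
// (InPlaceExchangeExample, d_in) are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>

__global__ void InPlaceExchangeExample(const int* d_in)
{
  using BlockExchangeT = cub::BlockExchange<int, 128, 4>;
  __shared__ typename BlockExchangeT::TempStorage temp_storage;

  int items[4];
  cub::LoadDirectStriped<128>(threadIdx.x, d_in, items);

  // In-place striped -> blocked exchange: same array used for input and output
  BlockExchangeT(temp_storage).StripedToBlocked(items);

  // ... per-thread work on the blocked items ...
}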
* * @param[in] ranks * Corresponding scatter ranks * * @param[in] is_valid * Corresponding flag denoting item validity */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged( InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks, is_valid); } #endif // DOXYGEN_SHOULD_SKIP_THIS }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_histogram.cuh000066400000000000000000000357771463375617100207460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for * constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN //! @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of //! block-wide histograms. enum BlockHistogramAlgorithm { //! @rst //! //! Overview //! ++++++++++++++++++++++++++ //! //! Sorting followed by differentiation. Execution is comprised of two phases: //! //! #. Sort the data using efficient radix sort //! #. Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! Delivers consistent throughput regardless of sample bin distribution. //! //! @endrst BLOCK_HISTO_SORT, //! @rst //! //! Overview //! ++++++++++++++++++++++++++ //! //! Use atomic addition to update byte counts directly //! 
//! Performance Considerations //! ++++++++++++++++++++++++++ //! //! Performance is strongly tied to the hardware implementation of atomic //! addition, and may be significantly degraded for non uniformly-random //! input distributions where many concurrent updates are likely to be //! made to the same bin counter. //! //! @endrst BLOCK_HISTO_ATOMIC, }; //! @rst //! The BlockHistogram class provides :ref:`collective ` methods for //! constructing block-wide histograms from data samples partitioned across a CUDA thread block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - A `histogram `_ counts the number of observations that fall into //! each of the disjoint categories (known as *bins*). //! - The ``T`` type must be implicitly castable to an integer type. //! - BlockHistogram expects each integral ``input[i]`` value to satisfy //! ``0 <= input[i] < BINS``. Values outside of this range result in undefined behavior. //! - BlockHistogram can be optionally specialized to use different algorithms: //! //! #. :cpp:enumerator:`cub::BLOCK_HISTO_SORT`: Sorting followed by differentiation. //! #. :cpp:enumerator:`cub::BLOCK_HISTO_ATOMIC`: Use atomic addition to update byte counts directly. //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockHistogram} //! //! The code snippet below illustrates a 256-bin histogram of 512 integer samples that //! are partitioned across 128 threads where each thread owns 4 samples. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character //! samples each typedef cub::BlockHistogram BlockHistogram; //! //! // Allocate shared memory for BlockHistogram //! __shared__ typename BlockHistogram::TempStorage temp_storage; //! //! // Allocate shared memory for block-wide histogram bin counts //! __shared__ unsigned int smem_histogram[256]; //! //! // Obtain input samples per thread //! unsigned char data[4]; //! ... //! //! // Compute the block-wide histogram //! BlockHistogram(temp_storage).Histogram(data, smem_histogram); //! //! Performance and Usage Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - @granularity //! - All input values must fall between ``[0, BINS)``, or behavior is undefined. //! - The histogram output can be constructed in shared or device-accessible memory //! - See ``cub::BlockHistogramAlgorithm`` for performance details regarding algorithmic alternatives //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with //! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage //! required by BlockHistogram. //! @endrst //! //! @tparam T //! The sample type being histogrammed (must be castable to an integer bin identifier) //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD //! The number of items per thread //! //! @tparam BINS //! The number bins within the histogram //! //! @tparam ALGORITHM //! **[optional]** cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use //! (default: cub::BLOCK_HISTO_SORT) //! //! @tparam BLOCK_DIM_Y //! 
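// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): a complete kernel
// built around the BlockHistogram example above.  It selects the
// BLOCK_HISTO_ATOMIC algorithm explicitly and copies the shared-memory bin
// counts out to global memory.  Names (HistogramExample, d_samples, d_bins)
// are hypothetical; every sample is assumed to satisfy 0 <= sample < 256.
// ---------------------------------------------------------------------------
#include <cub/block/block_histogram.cuh>
#include <cub/block/block_load.cuh>

__global__ void HistogramExample(const unsigned char* d_samples, unsigned int* d_bins)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  constexpr int BINS             = 256;
  using BlockHistogramT =
    cub::BlockHistogram<unsigned char, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, cub::BLOCK_HISTO_ATOMIC>;

  __shared__ typename BlockHistogramT::TempStorage temp_storage;
  __shared__ unsigned int smem_histogram[BINS];

  // Each thread loads its samples in blocked order
  unsigned char samples[ITEMS_PER_THREAD];
  cub::LoadDirectBlocked(threadIdx.x, d_samples, samples);

  // Build the block-wide histogram in shared memory
  BlockHistogramT(temp_storage).Histogram(samples, smem_histogram);
  __syncthreads();

  // Copy bin counts to global memory (two bins per thread for 128 threads)
  for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
  {
    d_bins[bin] = smem_histogram[bin];
  }
}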
**[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. template class BlockHistogram { private: /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Internal specialization. using InternalBlockHistogram = cub::detail::conditional_t, BlockHistogramAtomic>; /// Shared memory storage layout type for BlockHistogram typedef typename InternalBlockHistogram::TempStorage _TempStorage; /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } public: /// @smemstorage{BlockHistogram} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockHistogram() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockHistogram(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Histogram operations //! @{ //! @rst //! Initialize the shared histogram counters to zero. //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a the initialization and update of a //! histogram of 512 integer samples that are partitioned across 128 threads //! where each thread owns 4 samples. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each //! typedef cub::BlockHistogram BlockHistogram; //! //! // Allocate shared memory for BlockHistogram //! __shared__ typename BlockHistogram::TempStorage temp_storage; //! //! // Allocate shared memory for block-wide histogram bin counts //! __shared__ unsigned int smem_histogram[256]; //! //! // Obtain input samples per thread //! unsigned char thread_samples[4]; //! ... //! //! // Initialize the block-wide histogram //! BlockHistogram(temp_storage).InitHistogram(smem_histogram); //! //! // Update the block-wide histogram //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); //! //! @endrst //! //! @tparam CounterT //! **[inferred]** Histogram counter type template _CCCL_DEVICE _CCCL_FORCEINLINE void InitHistogram(CounterT histogram[BINS]) { // Initialize histogram bin counts to zeros int histo_offset = 0; #pragma unroll for (; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) { histogram[histo_offset + linear_tid] = 0; } // Finish up with guarded initialization if necessary if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) { histogram[histo_offset + linear_tid] = 0; } } //! @rst //! Constructs a block-wide histogram in shared/device-accessible memory. //! Each thread contributes an array of input elements. //! //! 
- @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a 256-bin histogram of 512 integer samples that //! are partitioned across 128 threads where each thread owns 4 samples. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 //! // character samples each typedef cub::BlockHistogram //! // BlockHistogram; //! //! // Allocate shared memory for BlockHistogram //! __shared__ typename BlockHistogram::TempStorage temp_storage; //! //! // Allocate shared memory for block-wide histogram bin counts //! __shared__ unsigned int smem_histogram[256]; //! //! // Obtain input samples per thread //! unsigned char thread_samples[4]; //! ... //! //! // Compute the block-wide histogram //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); //! //! @endrst //! //! @tparam CounterT //! **[inferred]** Histogram counter type //! //! @param[in] items //! Calling thread's input values to histogram //! //! @param[out] histogram //! Reference to shared/device-accessible memory histogram template _CCCL_DEVICE _CCCL_FORCEINLINE void Histogram(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS]) { // Initialize histogram bin counts to zeros InitHistogram(histogram); CTA_SYNC(); // Composite the histogram InternalBlockHistogram(temp_storage).Composite(items, histogram); } //! @rst //! Updates an existing block-wide histogram in shared/device-accessible memory. //! Each thread composites an array of input elements. //! //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a the initialization and update of a //! histogram of 512 integer samples that are partitioned across 128 threads //! where each thread owns 4 samples. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 //! // character samples each typedef cub::BlockHistogram //! // BlockHistogram; //! //! // Allocate shared memory for BlockHistogram //! __shared__ typename BlockHistogram::TempStorage temp_storage; //! //! // Allocate shared memory for block-wide histogram bin counts //! __shared__ unsigned int smem_histogram[256]; //! //! // Obtain input samples per thread //! unsigned char thread_samples[4]; //! ... //! //! // Initialize the block-wide histogram //! BlockHistogram(temp_storage).InitHistogram(smem_histogram); //! //! // Update the block-wide histogram //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); //! //! @endrst //! //! @tparam CounterT //! **[inferred]** Histogram counter type //! //! @param[in] items //! Calling thread's input values to histogram //! //! @param[out] histogram //! Reference to shared/device-accessible memory histogram template _CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS]) { InternalBlockHistogram(temp_storage).Composite(items, histogram); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_load.cuh000066400000000000000000001520441463375617100176530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
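// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the
// InitHistogram/Composite pair documented above lets one histogram be
// accumulated across several input tiles before it is written out, which is
// the pattern the separate methods exist for.  Names and the tiles_per_block
// parameter are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_histogram.cuh>
#include <cub/block/block_load.cuh>

__global__ void MultiTileHistogramExample(const unsigned char* d_samples, unsigned int* d_bins, int tiles_per_block)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  constexpr int BINS             = 256;
  constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;
  using BlockHistogramT = cub::BlockHistogram<unsigned char, BLOCK_THREADS, ITEMS_PER_THREAD, BINS>;

  __shared__ typename BlockHistogramT::TempStorage temp_storage;
  __shared__ unsigned int smem_histogram[BINS];

  // Zero the bin counts once
  BlockHistogramT(temp_storage).InitHistogram(smem_histogram);

  const unsigned char* block_base = d_samples + blockIdx.x * tiles_per_block * TILE_ITEMS;
  for (int tile = 0; tile < tiles_per_block; ++tile)
  {
    unsigned char samples[ITEMS_PER_THREAD];
    cub::LoadDirectBlocked(threadIdx.x, block_base + tile * TILE_ITEMS, samples);

    // Sync so the zeroed bins / reused temp storage are visible, then composite
    __syncthreads();
    BlockHistogramT(temp_storage).Composite(samples, smem_histogram);
  }
  __syncthreads();

  // Write this block's histogram to a per-block output slot
  for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
  {
    d_bins[blockIdx.x * BINS + bin] = smem_histogram[bin];
  }
}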
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file Operations for reading linear tiles of data into the CUDA thread block. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @name Blocked arrangement I/O (direct) //! @{ //! @rst //! Load a linear segment of items into a blocked arrangement across the thread block. //! //! @blocked //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { // Load directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; } } //! @rst //! Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. //! //! @blocked //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! 
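// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): what
// LoadDirectBlocked computes -- thread t reads the contiguous range
// [t * ITEMS_PER_THREAD, (t + 1) * ITEMS_PER_THREAD) relative to the block's
// base iterator.  Kernel and pointer names are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_load.cuh>

__global__ void BlockedLoadExample(const float* d_in, float* d_out)
{
  constexpr int ITEMS_PER_THREAD = 4;

  float items[ITEMS_PER_THREAD];
  cub::LoadDirectBlocked(threadIdx.x, d_in, items);

  // Equivalent hand-written indexing, for comparison only:
  //   items[i] = d_in[threadIdx.x * ITEMS_PER_THREAD + i];

  float sum = 0.0f;
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    sum += items[i];
  }
  d_out[blockIdx.x * blockDim.x + threadIdx.x] = sum;
}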
(e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) { items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM]; } } } //! @rst //! Load a linear segment of items into a blocked arrangement across the thread block, guarded //! by range, with a fall-back assignment of out-of-bound elements. //! //! @blocked //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **[inferred]** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked( int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = oob_default; } LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document //! @brief Internal implementation for load vectorization //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_ptr //! Input pointer for loading from //! //! @param[out] items //! Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void InternalLoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) { // Biggest memory access word that T is a whole multiple of typedef typename UnitWord::DeviceWord DeviceWord; enum { TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 : (TOTAL_WORDS % 2 == 0) ? 2 : 1, VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, }; // Vector type typedef typename CubVector::Type Vector; // Vector items Vector vec_items[VECTORS_PER_THREAD]; // Aliased input ptr Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); // Load directly in thread-blocked order # pragma unroll for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) { vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); } // Copy # pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); } } #endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! Load a linear segment of items into a blocked arrangement across the thread block. //! //! @blocked //! //! The input offset (``block_ptr + block_offset``) must be quad-item aligned //! //! 
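// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the range-guarded
// LoadDirectBlocked overload with an out-of-bounds default, handling the
// final partial tile of an input.  Names (GuardedLoadExample, d_in, d_out,
// num_items) are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_load.cuh>

__global__ void GuardedLoadExample(const int* d_in, int* d_out, int num_items)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

  const int tile_offset = blockIdx.x * TILE_ITEMS;
  const int valid_items = num_items - tile_offset; // may be < TILE_ITEMS for the last tile

  // Out-of-range items are filled with 0 so the reduction below stays correct
  int items[ITEMS_PER_THREAD];
  cub::LoadDirectBlocked(threadIdx.x, d_in + tile_offset, items, valid_items, 0);

  int sum = 0;
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    sum += items[i];
  }
  d_out[blockIdx.x * BLOCK_THREADS + threadIdx.x] = sum;
}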
The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_ptr //! Input pointer for loading from //! //! @param[out] items //! Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } //! @} end member group //! @name Striped arrangement I/O (direct) //! @{ //! @rst //! Load a linear segment of items into a striped arrangement across the thread block. //! //! @striped //! //! @endrst //! //! @tparam BLOCK_THREADS //! The thread block size in threads //! //! @tparam T //! **[inferred]** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **[inferred]** The random-access iterator type for input iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; } } namespace detail { template _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped( int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], TransformOpT transform_op) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = transform_op(block_itr[linear_tid + ITEM * BLOCK_THREADS]); } } } // namespace detail //! @rst //! Load a linear segment of items into a striped arrangement across the thread block, guarded by range //! //! @striped //! //! @endrst //! //! @tparam BLOCK_THREADS //! The thread block size in threads //! //! @tparam T //! **inferred** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load //! 
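// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): using
// LoadDirectBlockedVectorized, which requires a raw, suitably aligned pointer
// and an item count that vectorizes (4 ints here, so v4 loads can be
// emitted).  Kernel and pointer names are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_load.cuh>

__global__ void VectorizedLoadExample(int* d_in, int* d_out)
{
  constexpr int ITEMS_PER_THREAD = 4; // multiple of 4 -> vectorized global loads

  int items[ITEMS_PER_THREAD];
  cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, items);

  int sum = 0;
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    sum += items[i];
  }
  d_out[blockIdx.x * blockDim.x + threadIdx.x] = sum;
}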
template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) { items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS]; } } } //! @rst //! Load a linear segment of items into a striped arrangement across the thread block, guarded //! by range, with a fall-back assignment of out-of-bound elements. //! //! @striped //! //! @endrst //! //! @tparam BLOCK_THREADS //! The thread block size in threads //! //! @tparam T //! **inferred** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped( int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = oob_default; } LoadDirectStriped(linear_tid, block_itr, items, valid_items); } //! @} end member group //! @name Warp-striped arrangement I/O (direct) //! @{ //! @rst //! Load a linear segment of items into a warp-striped arrangement across the thread block. //! //! @warpstriped //! //! Usage Considerations //! ++++++++++++++++++++ //! //! The number of threads in the thread block must be a multiple of the architecture's warp size. //! //! @endrst //! //! @tparam T //! **inferred** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **inferred** The random-access iterator type for input iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } //! @rst //! Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range //! //! @warpstriped //! //! Usage Considerations //! ++++++++++++++++++++ //! //! The number of threads in the thread block must be a multiple of the architecture's warp size. //! //! @endrst //! //! @tparam T //! **inferred** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! 
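// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): range-guarded
// striped loading with an out-of-bounds default, the striped counterpart of
// the guarded blocked load shown earlier.  Names are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_load.cuh>

__global__ void GuardedStripedLoadExample(const int* d_in, int* d_out, int num_items)
{
  constexpr int BLOCK_THREADS    = 256;
  constexpr int ITEMS_PER_THREAD = 4;
  constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

  const int tile_offset = blockIdx.x * TILE_ITEMS;
  const int valid_items = num_items - tile_offset;

  // Thread t reads tile positions t, t + 256, t + 512, t + 768;
  // positions at or beyond valid_items are replaced by 0.
  int items[ITEMS_PER_THREAD];
  cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + tile_offset, items, valid_items, 0);

  int sum = 0;
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    sum += items[i];
  }
  d_out[blockIdx.x * BLOCK_THREADS + threadIdx.x] = sum;
}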
**inferred** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; // Load directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) { new (&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } } //! @rst //! Load a linear segment of items into a warp-striped arrangement across the thread block, //! guarded by range, with a fall-back assignment of out-of-bound elements. //! //! @warpstriped //! //! @endrst //! //! Usage Considerations //! ++++++++++++++++++++ //! //! The number of threads in the thread block must be a multiple of the architecture's warp size. //! //! @tparam T //! **inferred** The data type to load. //! //! @tparam ITEMS_PER_THREAD //! **inferred** The number of consecutive items partitioned onto each thread. //! //! @tparam InputIteratorT //! **inferred** The random-access iterator type for input \iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load //! //! @param[in] oob_default //! Default value to assign out-of-bound items template _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped( int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { // Load directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { items[ITEM] = oob_default; } LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); } //! @} end member group //! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a //! linear segment of data from memory into a blocked arrangement across a CUDA thread block. enum BlockLoadAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is read directly from memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) decreases as the //! access stride between threads increases (i.e., the number items per thread). //! @endrst BLOCK_LOAD_DIRECT, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is read directly from memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) doesn't depend on //! the number of items per thread. //! //! 
@endrst BLOCK_LOAD_STRIPED, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is read //! from memory using CUDA's built-in vectorized loads as a coalescing optimization. //! For example, ``ld.global.v4.s32`` instructions will be generated //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high until the the //! access stride between threads (i.e., the number items per thread) exceeds the //! maximum vector load width (typically 4 items or 64B, whichever is lower). //! - The following conditions will prevent vectorization and loading will fall //! back to cub::BLOCK_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd //! - The ``InputIteratorT`` is not a simple pointer type //! - The block input offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) //! //! @endrst BLOCK_LOAD_VECTORIZE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is read efficiently from memory and then //! locally transposed into a :ref:`blocked arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless //! of items loaded per thread. //! - The local reordering incurs slightly longer latencies and throughput than the //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. //! //! @endrst BLOCK_LOAD_TRANSPOSE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`warp-striped arrangement ` of data is read efficiently from memory and then //! locally transposed into a :ref:`blocked arrangement `. //! //! Usage Considerations //! ++++++++++++++++++++++++++ //! //! - BLOCK_THREADS must be a multiple of WARP_THREADS //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread. //! - The local reordering incurs slightly larger latencies than the //! direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. //! - Provisions more shared storage, but incurs smaller latencies than the //! BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. //! //! @endrst BLOCK_LOAD_WARP_TRANSPOSE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement ` //! of data is read directly from memory and then is locally transposed into a //! :ref:`blocked arrangement `. To reduce the shared memory requirement, only one //! warp's worth of shared memory is provisioned and is subsequently time-sliced among warps. //! //! Usage Considerations //! ++++++++++++++++++++++++++ //! //! - BLOCK_THREADS must be a multiple of WARP_THREADS //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless //! of items loaded per thread. //! - Provisions less shared memory temporary storage, but incurs larger //! latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. //! //! @endrst BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, }; //! @rst //! The BlockLoad class provides :ref:`collective ` data movement methods for loading a linear //! 
segment of items from memory into a :ref:`blocked arrangement ` across a //! CUDA thread block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - The BlockLoad class provides a single data movement abstraction that can be specialized //! to implement different cub::BlockLoadAlgorithm strategies. This facilitates different //! performance policies for different architectures, data types, granularity sizes, etc. //! - BlockLoad can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::BLOCK_LOAD_DIRECT`: //! A :ref:`blocked arrangement ` of data is read directly from memory. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_STRIPED`: //! A :ref:`striped arrangement ` of data is read directly from memory. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_VECTORIZE`: //! A :ref:`blocked arrangement ` of data is read directly from memory //! using CUDA's built-in vectorized loads as a coalescing optimization. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_TRANSPOSE`: //! A :ref:`striped arrangement ` of data is read directly from memory and is then //! locally transposed into a :ref:`blocked arrangement `. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE`: //! A :ref:`warp-striped arrangement ` of data is read directly from memory and is then //! locally transposed into a :ref:`blocked arrangement `. //! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED`: //! A :ref:`warp-striped arrangement ` of data is read directly from memory and is then //! locally transposed into a :ref:`blocked arrangement ` one warp at a time. //! //! - @rowmajor //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockLoad} //! //! The code snippet below illustrates the loading of a linear //! segment of 512 integers into a "blocked" arrangement across 128 threads where each //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, //! meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockLoad BlockLoad; //! //! // Allocate shared memory for BlockLoad //! __shared__ typename BlockLoad::TempStorage temp_storage; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. //! The set of ``thread_data`` across the block of threads in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of //! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. //! This example can be easily adapted to the storage required by BlockLoad. //! //! @endrst //! //! @tparam InputT //! The data type to read into (which must be convertible from the input iterator's value type). //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD //! The number of consecutive items partitioned onto each thread. //! //! 
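// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): extends the BlockLoad
// example above with the range-guarded Load() overload and an explicit
// out-of-bounds default, which the in-source snippet does not show.  Names
// (GuardedBlockLoadExample, d_in, d_out, num_items) are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/block/block_load.cuh>

__global__ void GuardedBlockLoadExample(const int* d_in, int* d_out, int num_items)
{
  constexpr int BLOCK_THREADS    = 128;
  constexpr int ITEMS_PER_THREAD = 4;
  constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;
  using BlockLoadT = cub::BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE>;

  __shared__ typename BlockLoadT::TempStorage temp_storage;

  const int tile_offset = blockIdx.x * TILE_ITEMS;
  const int valid_items = num_items - tile_offset;

  // Coalesced warp-striped load, locally transposed to a blocked arrangement;
  // out-of-range items become 0 on the last, partial tile.
  int thread_data[ITEMS_PER_THREAD];
  BlockLoadT(temp_storage).Load(d_in + tile_offset, thread_data, valid_items, 0);

  int sum = 0;
  for (int i = 0; i < ITEMS_PER_THREAD; ++i)
  {
    sum += thread_data[i];
  }
  d_out[blockIdx.x * BLOCK_THREADS + threadIdx.x] = sum;
}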
@tparam ALGORITHM //! **[optional]** cub::BlockLoadAlgorithm tuning policy. default: ``cub::BLOCK_LOAD_DIRECT``. //! //! @tparam WARP_TIME_SLICING //! **[optional]** Whether or not only one warp's worth of shared memory should be //! allocated and time-sliced among block-warps during any load-related data transpositions //! (versus each warp having its own storage). (default: false) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. template class BlockLoad { private: /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Load helper template struct LoadInternal; /** * BLOCK_LOAD_DIRECT specialization of load helper */ template struct LoadInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /** * @brief Load a linear segment of items from memory * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } /** * @brief Load a linear segment of items from memory, guarded by range * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } /** * @brief Load a linear segment of items from memory, guarded by range, with a fall-back * assignment of out-of-bound elements * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load * * @param[in] oob_default * Default value to assign out-of-bound items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; /** * BLOCK_LOAD_STRIPED specialization of load helper */ template struct LoadInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /** * @brief Load a linear segment of items from memory * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); } /** * @brief Load a linear segment of items from memory, guarded by range * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load */ template 
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); } /** * @brief Load a linear segment of items from memory, guarded by range, with a fall-back * assignment of out-of-bound elements * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load * * @param[in] oob_default * Default value to assign out-of-bound items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); } }; /** * BLOCK_LOAD_VECTORIZE specialization of load helper */ template struct LoadInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /** * @brief Load a linear segment of items from memory, specialized for native pointer types * (attempts vectorization) * * @param[in] block_ptr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } /** * @brief Load a linear segment of items from memory, specialized for native pointer types * (attempts vectorization) * * @param[in] block_ptr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } /** * @brief Load a linear segment of items from memory, specialized for native pointer types * (attempts vectorization) * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); } /** * @brief Load a linear segment of items from memory, specialized for opaque input iterators * (skips vectorization) * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(_InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } /** * @brief Load a linear segment of items from memory, guarded by range (skips vectorization) * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } /** * @brief Load a linear segment of items from memory, guarded by range, with a fall-back * assignment of out-of-bound elements (skips vectorization) * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items 
* Data to load * * @param[in] valid_items * Number of valid items to load * * @param[in] oob_default * Default value to assign out-of-bound items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; /** * BLOCK_LOAD_TRANSPOSE specialization of load helper */ template struct LoadInternal { // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} /** * @brief Load a linear segment of items from memory * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).StripedToBlocked(items, items); } /** * @brief Load a linear segment of items from memory, guarded by range * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).StripedToBlocked(items, items); } /** * @brief Load a linear segment of items from memory, guarded by range, with a fall-back * assignment of out-of-bound elements * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load * * @param[in] oob_default * Default value to assign out-of-bound items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).StripedToBlocked(items, items); } }; /** * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper */ template struct LoadInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} /** * @brief Load a linear segment of items from memory * * @param[in] block_itr * The thread 
block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /** * @brief Load a linear segment of items from memory, guarded by range * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /** * @brief Load a linear segment of items from memory, guarded by range, with a fall-back * assignment of out-of-bound elements * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load * * @param[in] oob_default * Default value to assign out-of-bound items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } }; /** * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper */ template struct LoadInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage {}; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} /** * @brief Load a linear segment of items from memory * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /** * @brief Load a linear segment of items from memory, guarded by range * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * Data to load * * @param[in] valid_items * Number of valid items to load */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } /** * @brief Load a linear segment of items from memory, guarded by range, with a fall-back * assignment of out-of-bound elements * * @param[in] block_itr * The thread block's base input iterator for loading from * * @param[out] items * 
Data to load * * @param[in] valid_items * Number of valid items to load * * @param[in] oob_default * Default value to assign out-of-bound items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } }; /// Internal load implementation to use typedef LoadInternal InternalLoad; /// Shared memory storage layout type typedef typename InternalLoad::TempStorage _TempStorage; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; public: /// @smemstorage{BlockLoad} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ /** * @brief Collective constructor using a private static allocation of shared memory as temporary * storage. */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Data movement //! @{ //! @rst //! Load a linear segment of items from memory. //! //! - @blocked //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the loading of a linear //! segment of 512 integers into a "blocked" arrangement across 128 threads where each //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, //! meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockLoad BlockLoad; //! //! // Allocate shared memory for BlockLoad //! __shared__ typename BlockLoad::TempStorage temp_storage; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. //! The set of ``thread_data`` across the block of threads in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! @endrst //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items); } //! @rst //! //! Load a linear segment of items from memory, guarded by range. //! //! - @blocked //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the guarded loading of a linear //! 
segment of 512 integers into a "blocked" arrangement across 128 threads where each //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, //! meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockLoad BlockLoad; //! //! // Allocate shared memory for BlockLoad //! __shared__ typename BlockLoad::TempStorage temp_storage; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``valid_items`` is ``5``. //! The set of ``thread_data`` across the block of threads in those threads will be //! ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``, with only the first two threads //! being unmasked to load portions of valid data (and other items remaining unassigned). //! //! @endrst //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); } //! @rst //! Load a linear segment of items from memory, guarded by range, with a fall-back //! assignment of out-of-bound elements //! //! - @blocked //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the guarded loading of a linear //! segment of 512 integers into a "blocked" arrangement across 128 threads where each //! thread owns 4 consecutive items. The load is specialized for ``BLOCK_LOAD_WARP_TRANSPOSE``, //! meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) //! { //! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockLoad BlockLoad; //! //! // Allocate shared memory for BlockLoad //! __shared__ typename BlockLoad::TempStorage temp_storage; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` //! ``valid_items`` is ``5``, and the out-of-bounds default is ``-1``. //! The set of ``thread_data`` across the block of threads in those threads will be //! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads //! being unmasked to load portions of valid data (and other items are assigned ``-1``) //! //! @endrst //! //! @param[in] block_itr //! The thread block's base input iterator for loading from //! //! @param[out] items //! Data to load //! //! @param[in] valid_items //! Number of valid items to load //! //! @param[in] oob_default //! 
Default value to assign out-of-bound items template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); } //@} end member group }; template > struct BlockLoadType { using type = cub::BlockLoad; }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_merge_sort.cuh000066400000000000000000000642051463375617100211030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN // Additional details of the Merge-Path Algorithm can be found in: // S. Odeh, O. Green, Z. Mwassi, O. Shmueli, Y. Birk, " Merge Path - Parallel // Merging Made Simple", Multithreaded Architectures and Applications (MTAAP) // Workshop, IEEE 26th International Parallel & Distributed Processing // Symposium (IPDPS), 2012 template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT MergePath( KeyIteratorT keys1, KeyIteratorT keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred) { OffsetT keys1_begin = diag < keys2_count ? 
0 : diag - keys2_count; OffsetT keys1_end = (cub::min)(diag, keys1_count); while (keys1_begin < keys1_end) { OffsetT mid = cub::MidPoint(keys1_begin, keys1_end); KeyT key1 = keys1[mid]; KeyT key2 = keys2[diag - 1 - mid]; bool pred = binary_pred(key2, key1); if (pred) { keys1_end = mid; } else { keys1_begin = mid + 1; } } return keys1_begin; } template _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT* keys_shared, int keys1_beg, int keys2_beg, int keys1_count, int keys2_count, KeyT (&output)[ITEMS_PER_THREAD], int (&indices)[ITEMS_PER_THREAD], CompareOp compare_op) { int keys1_end = keys1_beg + keys1_count; int keys2_end = keys2_beg + keys2_count; KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); output[item] = p ? key2 : key1; indices[item] = p ? keys2_beg++ : keys1_beg++; if (p) { key2 = keys_shared[keys2_beg]; } else { key1 = keys_shared[keys1_beg]; } } } /** * @brief Generalized merge sort algorithm * * This class is used to reduce code duplication. Warp and Block merge sort * differ only in how they compute thread index and how they synchronize * threads. Since synchronization might require access to custom data * (like member mask), CRTP is used. * * @par * The code snippet below illustrates the way this class can be used. * @par * @code * #include // or equivalently * * constexpr int BLOCK_THREADS = 256; * constexpr int ITEMS_PER_THREAD = 9; * * class BlockMergeSort : public BlockMergeSortStrategy * { * using BlockMergeSortStrategyT = * BlockMergeSortStrategy; * public: * __device__ __forceinline__ explicit BlockMergeSort( * typename BlockMergeSortStrategyT::TempStorage &temp_storage) * : BlockMergeSortStrategyT(temp_storage, threadIdx.x) * {} * * __device__ __forceinline__ void SyncImplementation() const * { * __syncthreads(); * } * }; * @endcode * * @tparam KeyT * KeyT type * * @tparam ValueT * ValueT type. cub::NullType indicates a keys-only sort * * @tparam SynchronizationPolicy * Provides a way of synchronizing threads. Should be derived from * `BlockMergeSortStrategy`. 
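 *
 * For illustration (a sketch, not code shipped in this header), a warp-level
 * specialization could synchronize only the participating lanes instead of the
 * whole thread block, e.g. via a lane mask held by the derived class:
 * @code
 * __device__ __forceinline__ void SyncImplementation() const
 * {
 *   __syncwarp(member_mask); // member_mask is assumed to be a member of the derived class
 * }
 * @endcode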
*/ template class BlockMergeSortStrategy { static_assert(PowerOfTwo::VALUE, "NUM_THREADS must be a power of two"); private: static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * NUM_THREADS; // Whether or not there are values to be trucked along with keys static constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Shared memory type required by this thread block union _TempStorage { KeyT keys_shared[ITEMS_PER_TILE + 1]; ValueT items_shared[ITEMS_PER_TILE + 1]; }; // union TempStorage #endif // DOXYGEN_SHOULD_SKIP_THIS /// Shared storage reference _TempStorage& temp_storage; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } const unsigned int linear_tid; public: /// \smemstorage{BlockMergeSort} struct TempStorage : Uninitialized<_TempStorage> {}; BlockMergeSortStrategy() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE BlockMergeSortStrategy(unsigned int linear_tid) : temp_storage(PrivateStorage()) , linear_tid(linear_tid) {} _CCCL_DEVICE _CCCL_FORCEINLINE BlockMergeSortStrategy(TempStorage& temp_storage, unsigned int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int get_linear_tid() const { return linear_tid; } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * Sort is not guaranteed to be stable. That is, suppose that i and j are * equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op) { ValueT items[ITEMS_PER_THREAD]; Sort(keys, items, compare_op, ITEMS_PER_TILE, keys[0]); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * - Sort is not guaranteed to be stable. That is, suppose that `i` and `j` * are equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered * after `oob_default`, it won't be placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. 
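 *
 * For illustration (a sketch, not part of the library), a minimal functor
 * satisfying this requirement for integer keys is:
 * @code
 * struct CustomLess
 * {
 *   __device__ bool operator()(const int &lhs, const int &rhs) const
 *   {
 *     return lhs < rhs;
 *   }
 * };
 * @endcode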
* * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { ValueT items[ITEMS_PER_THREAD]; Sort(keys, items, compare_op, valid_items, oob_default); } /** * @brief Sorts items partitioned across a CUDA thread block using a merge sorting method. * * @par * Sort is not guaranteed to be stable. That is, suppose that `i` and `j` are * equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op) { Sort(keys, items, compare_op, ITEMS_PER_TILE, keys[0]); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * - Sort is not guaranteed to be stable. That is, suppose that `i` and `j` * are equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered * after `oob_default`, it won't be placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)` * `CompareOp` is a model of [Strict Weak Ordering]. * * @tparam IS_LAST_TILE * True if `valid_items` isn't equal to the `ITEMS_PER_TILE` * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { if (IS_LAST_TILE) { // if last tile, find valid max_key // and fill the remaining keys with it // KeyT max_key = oob_default; #pragma unroll for (int item = 1; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * linear_tid + item < valid_items) { max_key = compare_op(max_key, keys[item]) ? 
keys[item] : max_key; } else { keys[item] = max_key; } } } // if first element of thread is in input range, stable sort items // if (!IS_LAST_TILE || ITEMS_PER_THREAD * linear_tid < valid_items) { StableOddEvenSort(keys, items, compare_op); } // each thread has sorted keys // merge sort keys in shared memory // for (int target_merged_threads_number = 2; target_merged_threads_number <= NUM_THREADS; target_merged_threads_number *= 2) { int merged_threads_number = target_merged_threads_number / 2; int mask = target_merged_threads_number - 1; Sync(); // store keys in shmem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = ITEMS_PER_THREAD * linear_tid + item; temp_storage.keys_shared[idx] = keys[item]; } Sync(); int indices[ITEMS_PER_THREAD]; int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid; int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged; int size = ITEMS_PER_THREAD * merged_threads_number; int thread_idx_in_thread_group_being_merged = mask & linear_tid; int diag = (cub::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged); int keys1_beg = (cub::min)(valid_items, start); int keys1_end = (cub::min)(valid_items, keys1_beg + size); int keys2_beg = keys1_end; int keys2_end = (cub::min)(valid_items, keys2_beg + size); int keys1_count = keys1_end - keys1_beg; int keys2_count = keys2_end - keys2_beg; int partition_diag = MergePath( &temp_storage.keys_shared[keys1_beg], &temp_storage.keys_shared[keys2_beg], keys1_count, keys2_count, diag, compare_op); int keys1_beg_loc = keys1_beg + partition_diag; int keys1_end_loc = keys1_end; int keys2_beg_loc = keys2_beg + diag - partition_diag; int keys2_end_loc = keys2_end; int keys1_count_loc = keys1_end_loc - keys1_beg_loc; int keys2_count_loc = keys2_end_loc - keys2_beg_loc; SerialMerge( &temp_storage.keys_shared[0], keys1_beg_loc, keys2_beg_loc, keys1_count_loc, keys2_count_loc, keys, indices, compare_op); if (!KEYS_ONLY) { Sync(); // store keys in shmem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { int idx = ITEMS_PER_THREAD * linear_tid + item; temp_storage.items_shared[idx] = items[item]; } Sync(); // gather items from shmem // #pragma unroll for (int item = 0; item < ITEMS_PER_THREAD; ++item) { items[item] = temp_storage.items_shared[indices[item]]; } } } } // func block_merge_sort /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * StableSort is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of StableSort is that `x` still precedes `y`. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void StableSort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op) { Sort(keys, compare_op); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * StableSort is stable: it preserves the relative ordering of equivalent * elements. 
That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of StableSort is that `x` still precedes `y`. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void StableSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op) { Sort(keys, items, compare_op); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * - StableSort is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes * `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) * then a postcondition of StableSort is that `x` still precedes `y`. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. * If there is a value that is ordered after `oob_default`, it won't be * placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. * * @param[in,out] keys * Keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void StableSort(KeyT (&keys)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { Sort(keys, compare_op, valid_items, oob_default); } /** * @brief Sorts items partitioned across a CUDA thread block using * a merge sorting method. * * @par * - StableSort is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes * `y`, and if the two elements are equivalent (neither `x < y` nor `y < x`) * then a postcondition of StableSort is that `x` still precedes `y`. * - The value of `oob_default` is assigned to all elements that are out of * `valid_items` boundaries. It's expected that `oob_default` is ordered * after any value in the `valid_items` boundaries. The algorithm always * sorts a fixed amount of elements, which is equal to * `ITEMS_PER_THREAD * BLOCK_THREADS`. If there is a value that is ordered * after `oob_default`, it won't be placed within `valid_items` boundaries. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)`. * `CompareOp` is a model of [Strict Weak Ordering]. 
* * @tparam IS_LAST_TILE * True if `valid_items` isn't equal to the `ITEMS_PER_TILE` * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] valid_items * Number of valid items to sort * * @param[in] oob_default * Default value to assign out-of-bound items * * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order */ template _CCCL_DEVICE _CCCL_FORCEINLINE void StableSort( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op, int valid_items, KeyT oob_default) { Sort(keys, items, compare_op, valid_items, oob_default); } private: _CCCL_DEVICE _CCCL_FORCEINLINE void Sync() const { static_cast(this)->SyncImplementation(); } }; /** * @brief The BlockMergeSort class provides methods for sorting items * partitioned across a CUDA thread block using a merge sorting method. * * @tparam KeyT * KeyT type * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam ITEMS_PER_THREAD * The number of items per thread * * @tparam ValueT * **[optional]** ValueT type (default: `cub::NullType`, which indicates * a keys-only sort) * * @tparam BLOCK_DIM_Y * **[optional]** The thread block length in threads along the Y dimension * (default: 1) * * @tparam BLOCK_DIM_Z * **[optional]** The thread block length in threads along the Z dimension * (default: 1) * * @par Overview * BlockMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types * and comparison functors, but is slower than BlockRadixSort when sorting * arithmetic types into ascending/descending order. * * @par A Simple Example * @blockcollective{BlockMergeSort} * @par * The code snippet below illustrates a sort of 512 integer keys that are * partitioned across 128 threads * where each thread owns 4 consecutive items. * @par * @code * #include // or equivalently * * struct CustomLess * { * template * __device__ bool operator()(const DataType &lhs, const DataType &rhs) * { * return lhs < rhs; * } * }; * * __global__ void ExampleKernel(...) * { * // Specialize BlockMergeSort for a 1D block of 128 threads owning 4 integer items each * typedef cub::BlockMergeSort BlockMergeSort; * * // Allocate shared memory for BlockMergeSort * __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_keys[4]; * ... * * BlockMergeSort(temp_storage_shuffle).Sort(thread_keys, CustomLess()); * ... * } * @endcode * @par * Suppose the set of input `thread_keys` across the block of threads is * `{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }`. * The corresponding output `thread_keys` in those threads will be * `{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }`. * * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockMergeSort. 
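 *
 * @par
 * As a further sketch (hypothetical code, not an example shipped with the
 * library), the guarded overload can sort a partially full tile of key-value
 * pairs, padding out-of-range keys with a value that orders after all valid
 * keys:
 * @par
 * @code
 * #include <cub/cub.cuh>   // or equivalently <cub/block/block_merge_sort.cuh>
 * #include <climits>       // for INT_MAX
 *
 * __global__ void ExampleKernel(int *d_keys, int *d_values, int valid_items, ...)
 * {
 *     // Specialize BlockMergeSort for a 1D block of 128 threads owning
 *     // 4 integer keys and 4 integer values each
 *     using BlockMergeSort = cub::BlockMergeSort<int, 128, 4, int>;
 *
 *     // Allocate shared memory for BlockMergeSort
 *     __shared__ typename BlockMergeSort::TempStorage temp_storage_shuffle;
 *
 *     // Obtain a segment of consecutive key-value pairs that are blocked across threads
 *     int thread_keys[4];
 *     int thread_values[4];
 *     ...
 *
 *     // Only the first valid_items pairs are meaningful; the remaining keys are
 *     // assigned INT_MAX so they sort to the end of the tile
 *     BlockMergeSort(temp_storage_shuffle).Sort(
 *         thread_keys, thread_values, CustomLess(), valid_items, INT_MAX);
 *     ...
 * }
 * @endcode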
*/ template class BlockMergeSort : public BlockMergeSortStrategy< KeyT, ValueT, BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, ITEMS_PER_THREAD, BlockMergeSort> { private: // The thread block size in threads static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * BLOCK_THREADS; using BlockMergeSortStrategyT = BlockMergeSortStrategy; public: _CCCL_DEVICE _CCCL_FORCEINLINE BlockMergeSort() : BlockMergeSortStrategyT(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} _CCCL_DEVICE _CCCL_FORCEINLINE explicit BlockMergeSort(typename BlockMergeSortStrategyT::TempStorage& temp_storage) : BlockMergeSortStrategyT(temp_storage, RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { CTA_SYNC(); } friend BlockMergeSortStrategyT; }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_radix_rank.cuh000066400000000000000000001210141463375617100210470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @brief Radix ranking algorithm, the algorithm used to implement stable ranking of the //! keys from a single tile. Note that different ranking algorithms require different //! initial arrangements of keys to function properly. enum RadixRankAlgorithm { //! 
Ranking using the BlockRadixRank algorithm with `MEMOIZE_OUTER_SCAN == false`. //! It uses thread-private histograms, and thus uses more shared memory. //! Requires blocked arrangement of keys. Does not support count callbacks. RADIX_RANK_BASIC, //! Ranking using the BlockRadixRank algorithm with `MEMOIZE_OUTER_SCAN == true`. //! Similar to RADIX_RANK BASIC, it requires blocked arrangement of keys and does not support count callbacks. RADIX_RANK_MEMOIZE, //! Ranking using the BlockRadixRankMatch algorithm. It uses warp-private histograms and matching for ranking //! the keys in a single warp. Therefore, it uses less shared memory compared to RADIX_RANK_BASIC. //! It requires warp-striped key arrangement and supports count callbacks. RADIX_RANK_MATCH, //! Ranking using the BlockRadixRankMatchEarlyCounts algorithm with `MATCH_ALGORITHM == WARP_MATCH_ANY`. //! An alternative implementation of match-based ranking that computes bin counts early. //! Because of this, it works better with onesweep sorting, which requires bin counts for decoupled look-back. //! Assumes warp-striped key arrangement and supports count callbacks. RADIX_RANK_MATCH_EARLY_COUNTS_ANY, //! Ranking using the BlockRadixRankEarlyCounts algorithm with `MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR`. //! It uses extra space in shared memory to generate warp match masks using `atomicOr()`. //! This is faster when there are few matches, but can lead to slowdowns if the number of matching keys among //! warp lanes is high. Assumes warp-striped key arrangement and supports count callbacks. RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR }; /** Empty callback implementation */ template struct BlockRadixRankEmptyCallback { _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(int (&bins)[BINS_PER_THREAD]) {} }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { template struct warp_in_block_matcher_t { static _CCCL_DEVICE ::cuda::std::uint32_t match_any(::cuda::std::uint32_t label, ::cuda::std::uint32_t warp_id) { if (warp_id == static_cast<::cuda::std::uint32_t>(PartialWarpId)) { return MatchAny(label); } return MatchAny(label); } }; template struct warp_in_block_matcher_t { static _CCCL_DEVICE ::cuda::std::uint32_t match_any(::cuda::std::uint32_t label, ::cuda::std::uint32_t warp_id) { return MatchAny(label); } }; } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS //! @rst //! BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). //! - @blocked //! //! Performance Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - @granularity //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! constexpr int block_threads = 2; //! constexpr int radix_bits = 5; //! //! // Specialize BlockRadixRank for a 1D block of 2 threads //! // Specialize BlockRadixRank for a 1D block of 2 threads //! using block_radix_rank = cub::BlockRadixRank; //! using storage_t = typename block_radix_rank::TempStorage; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ storage_t temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int keys[2]; //! int ranks[2]; //! ... //! //! cub::BFEDigitExtractor extractor(0, radix_bits); //! block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor); //! //! ... //! //! 
Suppose the set of input ``keys`` across the block of threads is ``{ [16,10], [9,11] }``. //! The corresponding output ``ranks`` in those threads will be ``{ [3,1], [0,2] }``. //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with //! BlockReduce and how to re-purpose the same memory region. //! This example can be easily adapted to the storage required by BlockRadixRank. //! //! @endrst //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam RADIX_BITS //! The number of radix bits per digit place //! //! @tparam IS_DESCENDING //! Whether or not the sorted-order is high-to-low //! //! @tparam MEMOIZE_OUTER_SCAN //! **[optional]** Whether or not to buffer outer raking scan //! partials to incur fewer shared memory reads at the expense of higher register pressure //! (default: true for architectures SM35 and newer, false otherwise). //! See `BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE` for more details. //! //! @tparam INNER_SCAN_ALGORITHM //! **[optional]** The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) //! //! @tparam SMEM_CONFIG //! **[optional]** Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. template class BlockRadixRank { private: // Integer type for digit counters (to be packed into words of type PackedCounters) using DigitCounter = unsigned short; // Integer type for packing DigitCounters into columns of shared memory banks using PackedCounter = cub::detail::conditional_t; static constexpr DigitCounter max_tile_size = ::cuda::std::numeric_limits::max(); enum { // The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, RADIX_DIGITS = 1 << RADIX_BITS, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, BYTES_PER_COUNTER = sizeof(DigitCounter), LOG_BYTES_PER_COUNTER = Log2::VALUE, PACKING_RATIO = static_cast(sizeof(PackedCounter) / sizeof(DigitCounter)), LOG_PACKING_RATIO = Log2::VALUE, // Always at least one lane LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // The number of packed counters per thread (plus one for padding) PADDED_COUNTER_LANES = COUNTER_LANES + 1, RAKING_SEGMENT = PADDED_COUNTER_LANES, }; public: enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), }; private: /// BlockScan type typedef BlockScan BlockScan; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document struct __align__(16) _TempStorage { union Aliasable { DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; } aliasable; // Storage for scanning local ranks typename BlockScan::TempStorage block_scan; }; #endif // !DOXYGEN_SHOULD_SKIP_THIS /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; /// Copy of raking segment, promoted to registers 
PackedCounter cached_segment[RAKING_SEGMENT]; /** * Internal storage allocator */ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /** * Performs upsweep raking reduction, returning the aggregate */ _CCCL_DEVICE _CCCL_FORCEINLINE PackedCounter Upsweep() { PackedCounter* smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; PackedCounter* raking_ptr; if (MEMOIZE_OUTER_SCAN) { // Copy data into registers #pragma unroll for (int i = 0; i < RAKING_SEGMENT; i++) { cached_segment[i] = smem_raking_ptr[i]; } raking_ptr = cached_segment; } else { raking_ptr = smem_raking_ptr; } return internal::ThreadReduce(raking_ptr, Sum()); } /// Performs exclusive downsweep raking scan _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveDownsweep(PackedCounter raking_partial) { PackedCounter* smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; PackedCounter* raking_ptr = (MEMOIZE_OUTER_SCAN) ? cached_segment : smem_raking_ptr; // Exclusive raking downsweep scan internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); if (MEMOIZE_OUTER_SCAN) { // Copy data back to smem #pragma unroll for (int i = 0; i < RAKING_SEGMENT; i++) { smem_raking_ptr[i] = cached_segment[i]; } } } /** * Reset shared memory digit counters */ _CCCL_DEVICE _CCCL_FORCEINLINE void ResetCounters() { // Reset shared memory digit counters #pragma unroll for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) { *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; } } /** * Block-scan prefix callback */ struct PrefixCallBack { _CCCL_DEVICE _CCCL_FORCEINLINE PackedCounter operator()(PackedCounter block_aggregate) { PackedCounter block_prefix = 0; // Propagate totals in packed fields #pragma unroll for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) { block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); } return block_prefix; } }; /** * Scan shared memory digit counters. */ _CCCL_DEVICE _CCCL_FORCEINLINE void ScanCounters() { // Upsweep scan PackedCounter raking_partial = Upsweep(); // Compute exclusive sum PackedCounter exclusive_partial; PrefixCallBack prefix_call_back; BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); // Downsweep scan with exclusive partial ExclusiveDownsweep(exclusive_partial); } public: /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixRank() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixRank(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Raking //! @{ /** * @brief Rank keys. 
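 *
 * As an illustration (digit values chosen here for exposition only, assuming
 * ascending order, i.e. IS_DESCENDING is false): if the extracted digits of a
 * tile's keys are {2, 0, 2, 1}, the resulting ranks are {2, 0, 3, 1}. Keys are
 * ranked stably by ascending digit value, so the key with digit 0 ranks first,
 * the key with digit 1 ranks second, and the two keys with digit 2 keep their
 * original relative order.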
* * @param[in] keys * Keys for this tile * * @param[out] ranks * For each key, the local rank within the tile * * @param[in] digit_extractor * The digit extractor */ template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor) { static_assert(BLOCK_THREADS * KEYS_PER_THREAD <= max_tile_size, "DigitCounter type is too small to hold this number of keys"); DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the // same digit DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter // in smem // Reset shared memory digit counters ResetCounters(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Get digit ::cuda::std::uint32_t digit = digit_extractor.Digit(keys[ITEM]); // Get sub-counter ::cuda::std::uint32_t sub_counter = digit >> LOG_COUNTER_LANES; // Get counter lane ::cuda::std::uint32_t counter_lane = digit & (COUNTER_LANES - 1); if (IS_DESCENDING) { sub_counter = PACKING_RATIO - 1 - sub_counter; counter_lane = COUNTER_LANES - 1 - counter_lane; } // Pointer to smem digit counter digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; // Load thread-exclusive prefix thread_prefixes[ITEM] = *digit_counters[ITEM]; // Store inclusive prefix *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; } CTA_SYNC(); // Scan shared memory counters ScanCounters(); CTA_SYNC(); // Extract the local ranks of each key #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Add in thread block exclusive prefix ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; } } /** * @brief Rank keys. For the lower @p RADIX_DIGITS threads, digit counts for each digit are * provided for the corresponding thread. * * @param[in] keys * Keys for this tile * * @param[out] ranks * For each key, the local rank within the tile (out parameter) * * @param[in] digit_extractor * The digit extractor * * @param[out] exclusive_digit_prefix * The exclusive prefix sum for the digits * [(threadIdx.x * BINS_TRACKED_PER_THREAD) * ... * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) { static_assert(BLOCK_THREADS * KEYS_PER_THREAD <= max_tile_size, "DigitCounter type is too small to hold this number of keys"); // Rank keys RankKeys(keys, ranks, digit_extractor); // Get the inclusive and exclusive digit totals corresponding to the calling thread. #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) { bin_idx = RADIX_DIGITS - bin_idx - 1; } // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the // first counter column, resulting in unavoidable bank conflicts.) unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; } } } //! 
@} }; /** * Radix-rank using match.any */ template class BlockRadixRankMatch { private: typedef int32_t RankT; typedef int32_t DigitCounterT; enum { // The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, RADIX_DIGITS = 1 << RADIX_BITS, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, PARTIAL_WARP_THREADS = BLOCK_THREADS % WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, PADDED_WARPS = ((WARPS & 0x1) == 0) ? WARPS + 1 : WARPS, COUNTERS = PADDED_WARPS * RADIX_DIGITS, RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? RAKING_SEGMENT + 1 : RAKING_SEGMENT, }; public: enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), }; private: /// BlockScan type typedef BlockScan BlockScanT; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document struct __align__(16) _TempStorage { typename BlockScanT::TempStorage block_scan; union __align__(16) Aliasable { volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; } aliasable; }; #endif // !DOXYGEN_SHOULD_SKIP_THIS /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// @smemstorage{BlockRadixRankMatch} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixRankMatch(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Raking //! @{ /** * @brief Computes the count of keys for each digit value, and calls the * callback with the array of key counts. * * @tparam CountsCallback The callback type. It should implement an instance * overload of operator()(int (&bins)[BINS_TRACKED_PER_THREAD]), where bins * is an array of key counts for each digit value distributed in block * distribution among the threads of the thread block. Key counts can be * used, to update other data structures in global or shared * memory. Depending on the implementation of the ranking algoirhtm * (see BlockRadixRankMatchEarlyCounts), key counts may become available * early, therefore, they are returned through a callback rather than a * separate output parameter of RankKeys(). */ template _CCCL_DEVICE _CCCL_FORCEINLINE void CallBack(CountsCallback callback) { int bins[BINS_TRACKED_PER_THREAD]; // Get count for each digit #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; constexpr int TILE_ITEMS = KEYS_PER_THREAD * BLOCK_THREADS; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) { bin_idx = RADIX_DIGITS - bin_idx - 1; bins[track] = (bin_idx > 0 ? temp_storage.aliasable.warp_digit_counters[bin_idx - 1][0] : TILE_ITEMS) - temp_storage.aliasable.warp_digit_counters[bin_idx][0]; } else { bins[track] = (bin_idx < RADIX_DIGITS - 1 ? 
temp_storage.aliasable.warp_digit_counters[bin_idx + 1][0] : TILE_ITEMS) - temp_storage.aliasable.warp_digit_counters[bin_idx][0]; } } } callback(bins); } /** * @brief Rank keys. * * @param[in] keys * Keys for this tile * * @param[out] ranks * For each key, the local rank within the tile * * @param[in] digit_extractor * The digit extractor */ template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, CountsCallback callback) { // Initialize shared digit counters #pragma unroll for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) { temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; } CTA_SYNC(); // Each warp will strip-mine its section of input, one strip at a time volatile DigitCounterT* digit_counters[KEYS_PER_THREAD]; uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; uint32_t lane_mask_lt = LaneMaskLt(); #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // My digit ::cuda::std::uint32_t digit = digit_extractor.Digit(keys[ITEM]); if (IS_DESCENDING) { digit = RADIX_DIGITS - digit - 1; } // Mask of peers who have same digit as me uint32_t peer_mask = detail::warp_in_block_matcher_t::match_any(digit, warp_id); // Pointer to smem digit counter for this key digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; // Number of occurrences in previous strips DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; // Warp-sync WARP_SYNC(0xFFFFFFFF); // Number of peers having same digit as me int32_t digit_count = __popc(peer_mask); // Number of lower-ranked peers having same digit seen so far int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); if (peer_digit_prefix == 0) { // First thread for each digit updates the shared warp counter *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); } // Warp-sync WARP_SYNC(0xFFFFFFFF); // Number of prior keys having same digit ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); } CTA_SYNC(); // Scan warp counters DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; #pragma unroll for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) { scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; } BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); #pragma unroll for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) { temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; } CTA_SYNC(); if (!::cuda::std::is_same>::value) { CallBack(callback); } // Seed ranks with counter values from previous warps #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { ranks[ITEM] += *digit_counters[ITEM]; } } template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor) { RankKeys(keys, ranks, digit_extractor, BlockRadixRankEmptyCallback()); } /** * @brief Rank keys. For the lower @p RADIX_DIGITS threads, digit counts for each digit are * provided for the corresponding thread. * * @param[in] keys * Keys for this tile * * @param[out] ranks * For each key, the local rank within the tile (out parameter) * * @param[in] digit_extractor * The digit extractor * * @param[out] exclusive_digit_prefix * The exclusive prefix sum for the digits * [(threadIdx.x * BINS_TRACKED_PER_THREAD) * ... 
* (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], CountsCallback callback) { RankKeys(keys, ranks, digit_extractor, callback); // Get exclusive count for each digit #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { if (IS_DESCENDING) { bin_idx = RADIX_DIGITS - bin_idx - 1; } exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; } } } /** * @param[in] keys * Keys for this tile * * @param[out] ranks * For each key, the local rank within the tile (out parameter) * * @param[out] exclusive_digit_prefix * The exclusive prefix sum for the digits * [(threadIdx.x * BINS_TRACKED_PER_THREAD) * ... * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) { RankKeys( keys, ranks, digit_extractor, exclusive_digit_prefix, BlockRadixRankEmptyCallback()); } //! @} }; enum WarpMatchAlgorithm { WARP_MATCH_ANY, WARP_MATCH_ATOMIC_OR }; /** * Radix-rank using matching which computes the counts of keys for each digit * value early, at the expense of doing more work. This may be useful e.g. for * decoupled look-back, where it reduces the time other thread blocks need to * wait for digit counts to become available. */ template struct BlockRadixRankMatchEarlyCounts { // constants enum { BLOCK_THREADS = BLOCK_DIM_X, RADIX_DIGITS = 1 << RADIX_BITS, BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS, BINS_TRACKED_PER_THREAD = BINS_PER_THREAD, FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS, WARP_THREADS = CUB_PTX_WARP_THREADS, PARTIAL_WARP_THREADS = BLOCK_THREADS % WARP_THREADS, BLOCK_WARPS = BLOCK_THREADS / WARP_THREADS, PARTIAL_WARP_ID = BLOCK_WARPS - 1, WARP_MASK = ~0, NUM_MATCH_MASKS = MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR ? BLOCK_WARPS : 0, // Guard against declaring zero-sized array: MATCH_MASKS_ALLOC_SIZE = NUM_MATCH_MASKS < 1 ? 1 : NUM_MATCH_MASKS, }; // types typedef cub::BlockScan BlockScan; struct TempStorage { union { int warp_offsets[BLOCK_WARPS][RADIX_DIGITS]; int warp_histograms[BLOCK_WARPS][RADIX_DIGITS][NUM_PARTS]; }; int match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS]; typename BlockScan::TempStorage prefix_tmp; }; TempStorage& temp_storage; // internal ranking implementation template struct BlockRadixRankMatchInternal { TempStorage& s; DigitExtractorT digit_extractor; CountsCallback callback; int warp; int lane; _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::uint32_t Digit(UnsignedBits key) { ::cuda::std::uint32_t digit = digit_extractor.Digit(key); return IS_DESCENDING ? RADIX_DIGITS - 1 - digit : digit; } _CCCL_DEVICE _CCCL_FORCEINLINE int ThreadBin(int u) { int bin = threadIdx.x * BINS_PER_THREAD + u; return IS_DESCENDING ? 
RADIX_DIGITS - 1 - bin : bin; } _CCCL_DEVICE _CCCL_FORCEINLINE void ComputeHistogramsWarp(UnsignedBits (&keys)[KEYS_PER_THREAD]) { // int* warp_offsets = &s.warp_offsets[warp][0]; int(&warp_histograms)[RADIX_DIGITS][NUM_PARTS] = s.warp_histograms[warp]; // compute warp-private histograms #pragma unroll for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) { #pragma unroll for (int part = 0; part < NUM_PARTS; ++part) { warp_histograms[bin][part] = 0; } } if (MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR) { int* match_masks = &s.match_masks[warp][0]; #pragma unroll for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) { match_masks[bin] = 0; } } WARP_SYNC(WARP_MASK); // compute private per-part histograms int part = lane % NUM_PARTS; #pragma unroll for (int u = 0; u < KEYS_PER_THREAD; ++u) { atomicAdd(&warp_histograms[Digit(keys[u])][part], 1); } // sum different parts; // no extra work is necessary if NUM_PARTS == 1 if (NUM_PARTS > 1) { WARP_SYNC(WARP_MASK); // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary constexpr int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; int bins[WARP_BINS_PER_THREAD]; #pragma unroll for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) { int bin = lane + u * WARP_THREADS; bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum()); } CTA_SYNC(); // store the resulting histogram in shared memory int* warp_offsets = &s.warp_offsets[warp][0]; #pragma unroll for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) { int bin = lane + u * WARP_THREADS; warp_offsets[bin] = bins[u]; } } } _CCCL_DEVICE _CCCL_FORCEINLINE void ComputeOffsetsWarpUpsweep(int (&bins)[BINS_PER_THREAD]) { // sum up warp-private histograms #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { bins[u] = 0; int bin = ThreadBin(u); if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) { #pragma unroll for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) { int warp_offset = s.warp_offsets[j_warp][bin]; s.warp_offsets[j_warp][bin] = bins[u]; bins[u] += warp_offset; } } } } _CCCL_DEVICE _CCCL_FORCEINLINE void ComputeOffsetsWarpDownsweep(int (&offsets)[BINS_PER_THREAD]) { #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) { int digit_offset = offsets[u]; #pragma unroll for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) { s.warp_offsets[j_warp][bin] += digit_offset; } } } } _CCCL_DEVICE _CCCL_FORCEINLINE void ComputeRanksItem( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], Int2Type) { // compute key ranks int lane_mask = 1 << lane; int* warp_offsets = &s.warp_offsets[warp][0]; int* match_masks = &s.match_masks[warp][0]; #pragma unroll for (int u = 0; u < KEYS_PER_THREAD; ++u) { ::cuda::std::uint32_t bin = Digit(keys[u]); int* p_match_mask = &match_masks[bin]; atomicOr(p_match_mask, lane_mask); WARP_SYNC(WARP_MASK); int bin_mask = *p_match_mask; int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; int popc = __popc(bin_mask & LaneMaskLe()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); if (lane == leader) { *p_match_mask = 0; } WARP_SYNC(WARP_MASK); ranks[u] = warp_offset + popc - 1; } } _CCCL_DEVICE _CCCL_FORCEINLINE void ComputeRanksItem(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], Int2Type) { // compute key ranks int* warp_offsets = &s.warp_offsets[warp][0]; #pragma unroll for (int u = 0; u < KEYS_PER_THREAD; 
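      // Illustrative aside (not part of the original source): suppose lanes 3, 5 and 9 of a
      // warp hold keys with the same digit. Then bin_mask has bits 3, 5 and 9 set, the
      // leader is lane 9 (the highest participating lane), and __popc(bin_mask & LaneMaskLe())
      // yields 1, 2 and 3 for lanes 3, 5 and 9 respectively. The leader reserves three slots
      // with a single atomicAdd on warp_offsets[bin], the previous offset is broadcast via
      // SHFL_IDX_SYNC, and each lane's rank becomes warp_offset + popc - 1.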
++u) { ::cuda::std::uint32_t bin = Digit(keys[u]); int bin_mask = detail::warp_in_block_matcher_t::match_any(bin, warp); int leader = (WARP_THREADS - 1) - __clz(bin_mask); int warp_offset = 0; int popc = __popc(bin_mask & LaneMaskLe()); if (lane == leader) { // atomic is a bit faster warp_offset = atomicAdd(&warp_offsets[bin], popc); } warp_offset = SHFL_IDX_SYNC(warp_offset, leader, WARP_MASK); ranks[u] = warp_offset + popc - 1; } } _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], int (&exclusive_digit_prefix)[BINS_PER_THREAD]) { ComputeHistogramsWarp(keys); CTA_SYNC(); int bins[BINS_PER_THREAD]; ComputeOffsetsWarpUpsweep(bins); callback(bins); BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); CTA_SYNC(); ComputeRanksItem(keys, ranks, Int2Type()); } _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixRankMatchInternal(TempStorage& temp_storage, DigitExtractorT digit_extractor, CountsCallback callback) : s(temp_storage) , digit_extractor(digit_extractor) , callback(callback) , warp(threadIdx.x / WARP_THREADS) , lane(LaneId()) {} }; _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixRankMatchEarlyCounts(TempStorage& temp_storage) : temp_storage(temp_storage) {} /** * @brief Rank keys. For the lower @p RADIX_DIGITS threads, digit counts for each digit are * provided for the corresponding thread. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_PER_THREAD], CountsCallback callback) { BlockRadixRankMatchInternal internal( temp_storage, digit_extractor, callback); internal.RankKeys(keys, ranks, exclusive_digit_prefix); } template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor, int (&exclusive_digit_prefix)[BINS_PER_THREAD]) { typedef BlockRadixRankEmptyCallback CountsCallback; BlockRadixRankMatchInternal internal( temp_storage, digit_extractor, CountsCallback()); internal.RankKeys(keys, ranks, exclusive_digit_prefix); } template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], DigitExtractorT digit_extractor) { int exclusive_digit_prefix[BINS_PER_THREAD]; RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix); } }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { // `BlockRadixRank` doesn't conform to the typical pattern, not exposing the algorithm // template parameter. Other algorithms don't provide the same template parameters, not allowing // multi-dimensional thread block specializations. 
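//
// A minimal usage sketch (illustrative only; the kernel context, block shape, digit width,
// and digit extractor below are assumptions, not part of this header). For RADIX_RANK_BASIC
// the alias defined below simply selects cub::BlockRadixRank, which can be used roughly as:
//
//   using rank_t = cub::BlockRadixRank<128, 4, false>; // 128 threads, 4-bit digits, ascending
//   __shared__ typename rank_t::TempStorage temp_storage;
//   unsigned int keys[4]; // bit-ordered keys held by this thread
//   int ranks[4];
//   rank_t(temp_storage).RankKeys(keys, ranks, digit_extractor);
//   // where digit_extractor is, e.g., a cub::BFEDigitExtractor<unsigned int> covering the
//   // current digit's bit range.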
// // TODO(senior-zero) for 3.0: // - Put existing implementations into the detail namespace // - Support multi-dimensional thread blocks in the rest of implementations // - Repurpose BlockRadixRank as an entry name with the algorithm template parameter template using block_radix_rank_t = cub::detail::conditional_t< RankAlgorithm == RADIX_RANK_BASIC, BlockRadixRank, cub::detail::conditional_t< RankAlgorithm == RADIX_RANK_MEMOIZE, BlockRadixRank, cub::detail::conditional_t< RankAlgorithm == RADIX_RANK_MATCH, BlockRadixRankMatch, cub::detail::conditional_t< RankAlgorithm == RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BlockRadixRankMatchEarlyCounts, BlockRadixRankMatchEarlyCounts>>>>; } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_radix_sort.cuh000066400000000000000000002500311463375617100211050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix * sorting of items partitioned across a CUDA thread block. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! BlockRadixSort class provides :ref:`collective ` methods for sorting //! items partitioned across a CUDA thread block using a radix sorting method. //! //! .. image:: ../img/sorting_logo.png //! :align: center //! //! Overview //! -------------------------------------------------- //! //! The `radix sorting method `_ arranges //! items into ascending order. It relies upon a positional representation for //! 
keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, //! characters, etc.) specified from least-significant to most-significant. For a //! given input sequence of keys and a set of rules specifying a total ordering //! of the symbolic alphabet, the radix sorting method produces a lexicographic //! ordering of those keys. //! //! @rowmajor //! //! Supported Types //! -------------------------------------------------- //! //! BlockRadixSort can sort all of the built-in C++ numeric primitive types //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half`` //! half-precision floating-point type. User-defined types are supported as long //! as decomposer object is provided. //! //! Floating-Point Special Cases //! -------------------------------------------------- //! //! - Positive and negative zeros are considered equivalent, and will be treated //! as such in the output. //! - No special handling is implemented for NaN values; these are sorted //! according to their bit representations after any transformations. //! //! Bitwise Key Transformations //! -------------------------------------------------- //! //! Although the direct radix sorting method can only be applied to unsigned //! integral types, BlockRadixSort is able to sort signed and floating-point //! types via simple bit-wise transformations that ensure lexicographic key //! ordering. //! //! These transformations must be considered when restricting the //! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur //! before the bit-range truncation. //! //! Any transformations applied to the keys prior to sorting are reversed //! while writing to the final output buffer. //! //! Type Specific Bitwise Transformations //! -------------------------------------------------- //! //! To convert the input values into a radix-sortable bitwise representation, //! the following transformations take place prior to sorting: //! //! * For unsigned integral values, the keys are used directly. //! * For signed integral values, the sign bit is inverted. //! * For positive floating point values, the sign bit is inverted. //! * For negative floating point values, the full key is inverted. //! //! No Descending Sort Transformations //! -------------------------------------------------- //! //! Unlike ``DeviceRadixSort``, ``BlockRadixSort`` does not invert the input key bits //! when performing a descending sort. Instead, it has special logic to reverse //! the order of the keys while sorting. //! //! Stability //! -------------------------------------------------- //! //! BlockRadixSort is stable. For floating-point types -0.0 and +0.0 //! are considered equal and appear in the result in the same order as they //! appear in the input. //! //! //! Performance Considerations //! -------------------------------------------------- //! //! * @granularity //! //! A Simple Example //! -------------------------------------------------- //! //! @blockcollective{BlockRadixSort} //! //! The code snippet below illustrates a sort of 512 integer keys that //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! 
__shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! ... //! //! // Collectively sort the keys //! BlockRadixSort(temp_storage).Sort(thread_keys); //! //! ... //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! //! Re-using dynamically allocating shared memory //! -------------------------------------------------- //! //! The following example under the examples/block folder illustrates usage of //! dynamically shared memory with BlockReduce and how to re-purpose //! the same memory region: //! example_block_reduce_dyn_smem.cu //! //! This example can be easily adapted to the storage required by BlockRadixSort. //! @endrst //! //! @tparam KeyT //! KeyT type //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD //! The number of items per thread //! //! @tparam ValueT //! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort) //! //! @tparam RADIX_BITS //! **[optional]** The number of radix bits per digit place (default: 4 bits) //! //! @tparam MEMOIZE_OUTER_SCAN //! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory //! reads at the expense of higher register pressure (default: true for architectures SM35 and //! newer, false otherwise). //! //! @tparam INNER_SCAN_ALGORITHM //! **[optional]** The cub::BlockScanAlgorithm algorithm to use //! (default: cub::BLOCK_SCAN_WARP_SCANS) //! //! @tparam SMEM_CONFIG //! **[optional]*8 Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! 
**[optional]** Unused template class BlockRadixSort { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { // The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, // Whether or not there are values to be trucked along with keys KEYS_ONLY = ::cuda::std::is_same::value, }; // KeyT traits and unsigned bits type using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; /// Ascending BlockRadixRank utility type typedef BlockRadixRank AscendingBlockRadixRank; /// Descending BlockRadixRank utility type typedef BlockRadixRank DescendingBlockRadixRank; /// Digit extractor type using fundamental_digit_extractor_t = BFEDigitExtractor; /// BlockExchange utility type for keys typedef BlockExchange BlockExchangeKeys; /// BlockExchange utility type for values typedef BlockExchange BlockExchangeValues; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Shared memory storage layout type union _TempStorage { typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; typename BlockExchangeKeys::TempStorage exchange_keys; typename BlockExchangeValues::TempStorage exchange_values; }; #endif // DOXYGEN_SHOULD_SKIP_THIS /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Rank keys (specialized for ascending sort) template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); } /// Rank keys (specialized for descending sort) template _CCCL_DEVICE _CCCL_FORCEINLINE void RankKeys(bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(unsigned_keys, ranks, digit_extractor); } /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues( ValueT (&values)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { CTA_SYNC(); // Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); } /// ExchangeValues (specialized for key-value sort, to-striped arrangement) _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues( ValueT (&values)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) { CTA_SYNC(); // 
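  // Illustrative aside (not part of the original source): to_bit_ordered() maps each key to an
  // unsigned representation whose unsigned ordering matches the key type's natural ordering.
  // For a signed 32-bit integer this amounts to flipping the sign bit, e.g.:
  //   INT_MIN (0x80000000) -> 0x00000000,   -1 (0xFFFFFFFF) -> 0x7FFFFFFF,
  //   0       (0x00000000) -> 0x80000000,   INT_MAX (0x7FFFFFFF) -> 0xFFFFFFFF,
  // so ranking the transformed bits as unsigned integers reproduces signed order.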
Exchange values through shared memory in blocked arrangement BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); } /// ExchangeValues (specialized for keys-only sort) template _CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues( ValueT (& /*values*/)[ITEMS_PER_THREAD], int (& /*ranks*/)[ITEMS_PER_THREAD], Int2Type /*is_keys_only*/, Int2Type /*is_blocked*/) {} /** * @brief Sort blocked arrangement * * @param keys * Keys to sort * * @param values * Values to sort * * @param begin_bit * The beginning (least-significant) bit index needed for key comparison * * @param end_bit * The past-the-end (most-significant) bit index needed for key comparison * * @param is_descending * Tag whether is a descending-order sort * * @param is_keys_only * Tag whether is keys-only sort */ template _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type is_descending, Int2Type is_keys_only, DecomposerT decomposer = {}) { bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]); } // Radix sorting passes while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); auto digit_extractor = traits::template digit_extractor(begin_bit, pass_bits, decomposer); // Rank the blocked keys int ranks[ITEMS_PER_THREAD]; RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); // Exchange values through shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); // Quit if done if (begin_bit >= end_bit) { break; } CTA_SYNC(); } // Untwiddle bits if necessary #pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]); } } public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * @brief Sort blocked -> striped arrangement * * @param keys * Keys to sort * * @param values * Values to sort * * @param begin_bit * The beginning (least-significant) bit index needed for key comparison * * @param end_bit * The past-the-end (most-significant) bit index needed for key comparison * * @param is_descending * Tag whether is a descending-order sort * * @param is_keys_only * Tag whether is keys-only sort */ template _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], int begin_bit, int end_bit, Int2Type is_descending, Int2Type is_keys_only, DecomposerT decomposer = {}) { bit_ordered_type(&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); # pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]); } // Radix sorting passes while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); auto digit_extractor = traits::template digit_extractor(begin_bit, pass_bits, decomposer); // Rank the blocked keys int ranks[ITEMS_PER_THREAD]; RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); // Check if this is the last pass if (begin_bit >= end_bit) { // Last pass exchanges keys through 
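        // Illustrative aside (not part of the original source): begin_bit advances by RADIX_BITS
        // per iteration, so a full sort takes ceil((end_bit - begin_bit) / RADIX_BITS) passes;
        // e.g. 32-bit keys with the default 4-bit digits need 8 ranking/exchange passes.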
shared memory in striped arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); // Last pass exchanges through shared memory in striped arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); // Quit break; } // Exchange keys through shared memory in blocked arrangement BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); // Exchange values through shared memory in blocked arrangement ExchangeValues(values, ranks, is_keys_only, Int2Type()); CTA_SYNC(); } // Untwiddle bits if necessary # pragma unroll for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]); } } #endif // DOXYGEN_SHOULD_SKIP_THIS /// @smemstorage{BlockRadixSort} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Sorting (blocked arrangements) //! @{ //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive keys. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! ... //! //! // Collectively sort the keys //! BlockRadixSort(temp_storage).Sort(thread_keys); //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity //! * @smemreuse //! //! Snippet //! 
========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 2 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 1 key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-bits //! :end-before: example-end keys-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { NullType values[ITEMS_PER_THREAD]; SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive keys. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys //! :end-before: example-end keys //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! 
``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type Sort(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer) { Sort(keys, decomposer, 0, detail::radix::traits_t::default_end_bit(decomposer)); } //! @rst //! Performs an ascending block-wide radix sort across a :ref:`blocked arrangement ` //! of keys and values. //! //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys and values that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive pairs. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! int thread_values[4]; //! ... //! //! // Collectively sort the keys and values among block threads //! BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); //! //! @endcode //! @par //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The //! corresponding output ``thread_keys`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! //! @endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! 
can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 2 keys and values that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 1 pair. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-bits //! :end-before: example-end pairs-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! 
:end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 keys and values that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive pairs. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs //! :end-before: example-end pairs //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type Sort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer) { Sort(keys, values, decomposer, 0, detail::radix::traits_t::default_end_bit(decomposer)); } //! @rst //! Performs a descending block-wide radix sort over a :ref:`blocked arrangement ` //! of keys. //! //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys that //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive keys. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! ... //! //! // Collectively sort the keys //! BlockRadixSort(temp_storage).Sort(thread_keys); //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }``. //! //! @endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! 
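//! As a generic orientation before the ``custom_t`` example (an illustrative sketch only;
//! ``my_key_t`` and ``my_decomposer_t`` are hypothetical and not part of the test sources),
//! a decomposer is a callable returning a tuple of references to a key's arithmetic
//! members, most significant first:
//!
//! .. code-block:: c++
//!
//!    struct my_key_t { float f; long long int i; };
//!    struct my_decomposer_t
//!    {
//!      __host__ __device__ ::cuda::std::tuple<float&, long long int&>
//!      operator()(my_key_t& key) const { return {key.f, key.i}; }
//!    };
//!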
Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 2 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 1 key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-descending-bits //! :end-before: example-end keys-descending-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { NullType values[ITEMS_PER_THREAD]; SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive keys. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-descending //! :end-before: example-end keys-descending //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! 
The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer) { NullType values[ITEMS_PER_THREAD]; SortBlocked( keys, values, 0, detail::radix::traits_t::default_end_bit(decomposer), Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs a descending block-wide radix sort across a :ref:`blocked arrangement ` //! of keys and values. //! //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys and values that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive pairs. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and //! values each typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! int thread_values[4]; //! ... //! //! // Collectively sort the keys and values among block threads //! BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. The //! corresponding output ``thread_keys`` in those threads will be //! ``{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }``. //! //! @endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! 
can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 2 pairs that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 1 pair. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-descending-bits //! :end-before: example-end pairs-descending-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! 
:end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 keys and values that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive pairs. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-descending //! :end-before: example-end pairs-descending //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer) { SortBlocked( keys, values, 0, detail::radix::traits_t::default_end_bit(decomposer), Int2Type(), Int2Type(), decomposer); } //! @} end member group //! @name Sorting (blocked arrangement -> striped arrangement) //! @{ //! @rst //! Performs an ascending radix sort across a :ref:`blocked arrangement ` of keys, //! leaving them in a :ref:`striped arrangement `. //! //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys that //! are initially partitioned in a :ref:`blocked arrangement ` across 128 //! threads where each thread owns 4 consecutive keys. The final partitioning is striped. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! ... //! //! // Collectively sort the keys //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``. //! //! @endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! 
**[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 4 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 2 consecutive keys. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-striped-bits //! :end-before: example-end keys-striped-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! 
:language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive keys. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-striped //! :end-before: example-end keys-striped //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer) { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped( keys, values, 0, detail::radix::traits_t::default_end_bit(decomposer), Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs an ascending radix sort across a :ref:`blocked arrangement ` of keys and //! values, leaving them in a :ref:`striped arrangement `. //! //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys and values that //! are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! int thread_values[4]; //! ... //! //! // Collectively sort the keys and values among block threads //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }``. //! //! @endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! 
Values to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 4 pairs that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 2 consecutive pairs. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-striped-bits //! :end-before: example-end pairs-striped-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! 
``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 pairs that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive pairs. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-striped //! :end-before: example-end pairs-striped //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer) { SortBlockedToStriped( keys, values, 0, detail::radix::traits_t::default_end_bit(decomposer), Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs a descending radix sort across a :ref:`blocked arrangement ` //! of keys, leaving them in a :ref:`striped arrangement `. //! //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys that //! are initially partitioned in a :ref:`blocked arrangement ` across 128 //! threads where each thread owns 4 consecutive keys. The final partitioning is striped. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! ... //! //! // Collectively sort the keys //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }``. //! //! @endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! 
**[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 4 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 2 consecutive keys. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-striped-descending-bits //! :end-before: example-end keys-striped-descending-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. 
literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 keys that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive keys. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-striped-descending //! :end-before: example-end keys-striped-descending //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], DecomposerT decomposer) { NullType values[ITEMS_PER_THREAD]; SortBlockedToStriped( keys, values, 0, detail::radix::traits_t::default_end_bit(decomposer), Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs a descending radix sort across a :ref:`blocked arrangement ` //! of keys and values, leaving them in a :ref:`striped arrangement ` //! //! - BlockRadixSort can only accommodate one associated tile of values. To "truck along" //! more than one tile of values, simply perform a key-value sort of the keys paired //! with a temporary value array that enumerates the key indices. The reordered indices //! can then be used as a gather-vector for exchanging other associated tile data through //! shared memory. //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sort of 512 integer keys and values that //! are initially partitioned in a :ref:`blocked arrangement ` across 128 //! threads where each thread owns 4 consecutive pairs. The final partitioning is striped. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each //! typedef cub::BlockRadixSort BlockRadixSort; //! //! // Allocate shared memory for BlockRadixSort //! __shared__ typename BlockRadixSort::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[4]; //! int thread_values[4]; //! ... //! //! // Collectively sort the keys and values among block threads //! BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); //! //! Suppose the set of input ``thread_keys`` across the block of threads is //! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }``. //! //! 
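//!
//! A striped result pairs naturally with coalesced global stores. The sketch below
//! (``d_keys_out``, ``d_values_out``, and the 128-thread / 4-items-per-thread tile geometry are
//! assumed purely for illustration) shows how the striped partitioning above could be written back:
//!
//! .. code-block:: c++
//!
//!    // After SortDescendingBlockedToStriped(thread_keys, thread_values), thread i of the block
//!    // holds items i, i + 128, i + 256, and i + 384 of the block's 512-item sorted tile.
//!    int block_offset = blockIdx.x * 512;
//!    for (int ITEM = 0; ITEM < 4; ++ITEM)
//!    {
//!        // Consecutive threads write consecutive addresses, so each store is coalesced
//!        d_keys_out[block_offset + ITEM * 128 + threadIdx.x]   = thread_keys[ITEM];
//!        d_values_out[block_offset + ITEM * 128 + threadIdx.x] = thread_values[ITEM];
//!    }
//!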
@endrst //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param[in] begin_bit //! **[optional]** The beginning (least-significant) bit index needed for key comparison //! //! @param[in] end_bit //! **[optional]** The past-the-end (most-significant) bit index needed for key comparison _CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 4 keys and values that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 2 consecutive pairs. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-striped-descending-bits //! :end-before: example-end pairs-striped-descending-bits //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescendingBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer, int begin_bit, int end_bit) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type(), decomposer); } //! @rst //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity //! * @smemreuse //! //! Snippet //! ========================================================================== //! 
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The code snippet below illustrates a sort of 6 keys and values that //! are partitioned in a :ref:`blocked arrangement ` across 2 threads //! where each thread owns 3 consecutive pairs. The final partitioning is striped. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-striped-descending //! :end-before: example-end pairs-striped-descending //! //! @endrst //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values //! Values to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template _CCCL_DEVICE _CCCL_FORCEINLINE // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value>::type SortDescendingBlockedToStriped( KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer) { SortBlockedToStriped( keys, values, 0, detail::radix::traits_t::default_end_bit(decomposer), Int2Type(), Int2Type(), decomposer); } //@} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_raking_layout.cuh000066400000000000000000000135351463375617100216050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking * across thread block data. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include CUB_NAMESPACE_BEGIN //! @rst //! BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. //! //! Overview //! ++++++++++++++++++++++++++ //! //! This type facilitates a shared memory usage pattern where a block of CUDA //! threads places elements into shared memory and then reduces the active //! parallelism to one "raking" warp of threads for serially aggregating consecutive //! sequences of shared items. Padding is inserted to eliminate bank conflicts //! (for most data types). //! //! @endrst //! //! @tparam T //! The data type to be exchanged. //! //! @tparam BLOCK_THREADS //! The thread block size in threads. //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. template struct BlockRakingLayout { //--------------------------------------------------------------------- // Constants and type definitions //--------------------------------------------------------------------- enum { /// The total number of elements that need to be cooperatively reduced SHARED_ELEMENTS = BLOCK_THREADS, /// Maximum number of warp-synchronous raking threads MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)), /// Number of raking elements per warp-synchronous raking thread (rounded up) SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, /// we should only use 31 raking threads) RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) HAS_CONFLICTS = (CUB_SMEM_BANKS(0) % SEGMENT_LENGTH == 0), /// Degree of bank conflicts (e.g., 4-way) CONFLICT_DEGREE = (HAS_CONFLICTS) ? 
(MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(0) : 1, /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be /// optimized as a vector load USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), /// Total number of elements in the raking grid GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the /// number of raking threads) UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), }; /** * @brief Shared memory storage type */ struct __align__(16) _TempStorage { T buff[BlockRakingLayout::GRID_ELEMENTS]; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /** * @brief Returns the location for the calling thread to place data into the grid */ static _CCCL_DEVICE _CCCL_FORCEINLINE T* PlacementPtr(TempStorage& temp_storage, unsigned int linear_tid) { // Offset for partial unsigned int offset = linear_tid; // Add in one padding element for every segment if (USE_SEGMENT_PADDING > 0) { offset += offset / SEGMENT_LENGTH; } // Incorporating a block of padding partials every shared memory segment return temp_storage.Alias().buff + offset; } /** * @brief Returns the location for the calling thread to begin sequential raking */ static _CCCL_DEVICE _CCCL_FORCEINLINE T* RakingPtr(TempStorage& temp_storage, unsigned int linear_tid) { return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_reduce.cuh000066400000000000000000000565371463375617100202150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! 
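// The raking variants of cub::BlockReduce below consume cub::BlockRakingLayout along these lines
// (a minimal sketch; `T`, `BLOCK_THREADS`, `linear_tid`, `partial`, and `reduction_op` are
// placeholder names for illustration rather than names taken from the implementation):
//
//   using Layout = cub::BlockRakingLayout<T, BLOCK_THREADS>;
//   __shared__ typename Layout::TempStorage smem;
//
//   // 1) Every thread deposits its partial into the conflict-free raking grid
//   *Layout::PlacementPtr(smem, linear_tid) = partial;
//   __syncthreads();
//
//   // 2) Threads of the raking warp each serially reduce SEGMENT_LENGTH consecutive partials...
//   if (linear_tid < Layout::RAKING_THREADS)
//   {
//     T* segment = Layout::RakingPtr(smem, linear_tid);
//     T raked    = segment[0];
//     for (int i = 1; i < Layout::SEGMENT_LENGTH; ++i)
//     {
//       raked = reduction_op(raked, segment[i]);
//     }
//     // 3) ...and the RAKING_THREADS partials are finished with a warp-synchronous reduction
//   }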
@file The cub::BlockReduce class provides :ref:`collective ` methods for computing //! a parallel reduction of items partitioned across a CUDA thread block. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Algorithmic variants ******************************************************************************/ //! BlockReduceAlgorithm enumerates alternative algorithms for parallel reduction across a CUDA thread block. enum BlockReduceAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! An efficient "raking" reduction algorithm that only supports commutative //! reduction operators (true for most operations, e.g., addition). //! //! Execution is comprised of three phases: //! #. Upsweep sequential reduction in registers (if threads contribute more //! than one input each). Threads in warps other than the first warp place //! their partial reductions into shared memory. //! #. Upsweep sequential reduction in shared memory. Threads within the first //! warp continue to accumulate by raking across segments of shared partial reductions //! #. A warp-synchronous Kogge-Stone style reduction within the raking warp. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE //! and is preferable when the reduction operator is commutative. This variant //! applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall //! throughput across the GPU when suitably occupied. However, turn-around latency may be //! higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable //! when the GPU is under-occupied. //! //! @endrst BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! An efficient "raking" reduction algorithm that supports commutative //! (e.g., addition) and non-commutative (e.g., string concatenation) reduction //! operators. @blocked. //! //! Execution is comprised of three phases: //! #. Upsweep sequential reduction in registers (if threads contribute more //! than one input each). Each thread then places the partial reduction //! of its item(s) into shared memory. //! #. Upsweep sequential reduction in shared memory. Threads within a //! single warp rake across segments of shared partial reductions. //! #. A warp-synchronous Kogge-Stone style reduction within the raking warp. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - This variant performs more communication than BLOCK_REDUCE_RAKING //! and is only preferable when the reduction operator is non-commutative. This variant //! applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall //! throughput across the GPU when suitably occupied. However, turn-around latency may be //! higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable //! when the GPU is under-occupied. //! //! @endrst BLOCK_REDUCE_RAKING, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A quick "tiled warp-reductions" reduction algorithm that supports commutative //! 
(e.g., addition) and non-commutative (e.g., string concatenation) reduction //! operators. //! //! Execution is comprised of four phases: //! #. Upsweep sequential reduction in registers (if threads contribute more //! than one input each). Each thread then places the partial reduction //! of its item(s) into shared memory. //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style //! reduction within each warp. //! #. A propagation phase where the warp reduction outputs in each warp are //! updated with the aggregate from each preceding warp. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - This variant applies more reduction operators than BLOCK_REDUCE_RAKING //! or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall //! throughput across the GPU. However turn-around latency may be lower and //! thus useful when the GPU is under-occupied. //! //! @endrst BLOCK_REDUCE_WARP_REDUCTIONS, }; //! @rst //! The BlockReduce class provides :ref:`collective ` methods for computing a parallel reduction //! of items partitioned across a CUDA thread block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - A `reduction `_ (or *fold*) uses a binary combining //! operator to compute a single aggregate from a list of input elements. //! - @rowmajor //! - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput //! workload profiles: //! //! #. :cpp:enumerator:`cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY`: //! An efficient "raking" reduction algorithm that only supports commutative reduction operators. //! #. :cpp:enumerator:`cub::BLOCK_REDUCE_RAKING`: //! An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. //! #. :cpp:enumerator:`cub::BLOCK_REDUCE_WARP_REDUCTIONS`: //! A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative //! reduction operators. //! //! Performance Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - @granularity //! - Very efficient (only one synchronization barrier). //! - Incurs zero bank conflicts for most types //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! - Summation (vs. generic reduction) //! - ``BLOCK_THREADS`` is a multiple of the architecture's warp size //! - Every thread has a valid input (i.e., full vs. partial-tiles) //! - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockReduce} //! //! The code snippet below illustrates a sum reduction of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockReduce for a 1D block of 128 threads of type int //! typedef cub::BlockReduce BlockReduce; //! //! // Allocate shared memory for BlockReduce //! __shared__ typename BlockReduce::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Compute the block-wide sum for thread0 //! int aggregate = BlockReduce(temp_storage).Sum(thread_data); //! //! Re-using dynamically allocating shared memory //! 
+++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with //! BlockReduce and how to re-purpose the same memory region. //! //! @endrst //! //! @tparam T //! Data type being reduced //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ALGORITHM //! **[optional]** cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use //! (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. template class BlockReduce { private: /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; typedef BlockReduceWarpReductions WarpReductions; typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; typedef BlockReduceRaking Raking; /// Internal specialization type using InternalBlockReduce = cub::detail::conditional_t>; // BlockReduceRaking /// Shared memory storage layout type for BlockReduce typedef typename InternalBlockReduce::TempStorage _TempStorage; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// @smemstorage{BlockReduce} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockReduce() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockReduce(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Generic reductions //! @{ //! @rst //! Computes a block-wide reduction for thread\ :sub:`0` using the specified binary reduction functor. //! Each thread contributes one input element. //! //! - The return value is undefined in threads other than thread\ :sub:`0`. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a max reduction of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockReduce for a 1D block of 128 threads of type int //! typedef cub::BlockReduce BlockReduce; //! //! // Allocate shared memory for BlockReduce //! __shared__ typename BlockReduce::TempStorage temp_storage; //! //! // Each thread obtains an input item //! int thread_data; //! ... //! //! // Compute the block-wide max for thread0 //! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); //! //! @endrst //! //! @tparam ReductionOp //! 
**[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] reduction_op //! Binary reduction functor template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) { return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); } //! @rst //! Computes a block-wide reduction for thread\ :sub:`0` using the specified binary reduction functor. //! Each thread contributes an array of consecutive input elements. //! //! - The return value is undefined in threads other than thread\ :sub:`0`. //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a max reduction of 512 integer items that are partitioned in a //! :ref:`blocked arrangement ` across 128 threads where each thread owns //! 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockReduce for a 1D block of 128 threads of type int //! typedef cub::BlockReduce BlockReduce; //! //! // Allocate shared memory for BlockReduce //! __shared__ typename BlockReduce::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Compute the block-wide max for thread0 //! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ReductionOp //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] inputs //! Calling thread's input segment //! //! @param[in] reduction_op //! Binary reduction functor template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T (&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op) { // Reduce partials T partial = internal::ThreadReduce(inputs, reduction_op); return Reduce(partial, reduction_op); } //! @rst //! Computes a block-wide reduction for thread\ :sub:`0` using the specified binary reduction functor. //! The first ``num_valid`` threads each contribute one input element. //! //! - The return value is undefined in threads other than thread0. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a max reduction of a partially-full tile of integer items //! that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int num_valid, ...) //! { //! // Specialize BlockReduce for a 1D block of 128 threads of type int //! typedef cub::BlockReduce BlockReduce; //! //! // Allocate shared memory for BlockReduce //! __shared__ typename BlockReduce::TempStorage temp_storage; //! //! // Each thread obtains an input item //! int thread_data; //! if (threadIdx.x < num_valid) thread_data = ... //! //! // Compute the block-wide max for thread0 //! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); //! //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] reduction_op //! Binary reduction functor //! //! @param[in] num_valid //! 
Number of threads containing valid elements (may be less than BLOCK_THREADS) template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int num_valid) { // Determine if we skip bounds checking if (num_valid >= BLOCK_THREADS) { return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); } else { return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); } } //! @} end member group //! @name Summation reductions //! @{ //! @rst //! Computes a block-wide reduction for thread\ :sub:`0` using addition (+) as the reduction operator. //! Each thread contributes one input element. //! //! - The return value is undefined in threads other than thread\ :sub:`0`. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sum reduction of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockReduce for a 1D block of 128 threads of type int //! typedef cub::BlockReduce BlockReduce; //! //! // Allocate shared memory for BlockReduce //! __shared__ typename BlockReduce::TempStorage temp_storage; //! //! // Each thread obtains an input item //! int thread_data; //! ... //! //! // Compute the block-wide sum for thread0 //! int aggregate = BlockReduce(temp_storage).Sum(thread_data); //! //! @endrst //! //! @param[in] input //! Calling thread's input _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); } //! @rst //! Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. //! Each thread contributes an array of consecutive input elements. //! //! - The return value is undefined in threads other than thread\ :sub:`0`. //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sum reduction of 512 integer items that are partitioned in a //! :ref:`blocked arrangement ` across 128 threads where each thread owns //! 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockReduce for a 1D block of 128 threads of type int //! typedef cub::BlockReduce BlockReduce; //! //! // Allocate shared memory for BlockReduce //! __shared__ typename BlockReduce::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Compute the block-wide sum for thread0 //! int aggregate = BlockReduce(temp_storage).Sum(thread_data); //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] inputs //! Calling thread's input segment template _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T (&inputs)[ITEMS_PER_THREAD]) { // Reduce partials T partial = internal::ThreadReduce(inputs, cub::Sum()); return Sum(partial); } //! @rst //! Computes a block-wide reduction for thread\ :sub:`0` using addition (+) as the reduction operator. //! The first ``num_valid`` threads each contribute one input element. //! //! - The return value is undefined in threads other than thread\ :sub:`0`. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sum reduction of a partially-full tile of integer items //! 
that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int num_valid, ...) //! { //! // Specialize BlockReduce for a 1D block of 128 threads of type int //! typedef cub::BlockReduce BlockReduce; //! //! // Allocate shared memory for BlockReduce //! __shared__ typename BlockReduce::TempStorage temp_storage; //! //! // Each thread obtains an input item (up to num_items) //! int thread_data; //! if (threadIdx.x < num_valid) //! thread_data = ... //! //! // Compute the block-wide sum for thread0 //! int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); //! //! @endrst //! //! @param[in] input //! Calling thread's input //! //! @param[in] num_valid //! Number of threads containing valid elements (may be less than BLOCK_THREADS) _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int num_valid) { // Determine if we skip bounds checking if (num_valid >= BLOCK_THREADS) { return InternalBlockReduce(temp_storage).template Sum(input, num_valid); } else { return InternalBlockReduce(temp_storage).template Sum(input, num_valid); } } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_run_length_decode.cuh000066400000000000000000000451201463375617100224000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That //! 
is, given the two arrays ``run_value[N]`` and ``run_lengths[N]``, ``run_value[i]`` is repeated ``run_lengths[i]`` //! many times in the output array. Due to the nature of the run-length decoding algorithm //! ("decompression"), the output size of the run-length decoded array is runtime-dependent and //! potentially without any upper bound. To address this, BlockRunLengthDecode allows retrieving a //! "window" from the run-length decoded array. The window's offset can be specified and //! BLOCK_THREADS * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from //! the specified window will be returned. //! //! .. note:: //! //! Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). //! A run of length zero may not be followed by a run length that is not zero. //! //! .. code-block:: c++ //! //! __global__ void ExampleKernel(...) //! { //! // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t //! using RunItemT = uint64_t; //! // Type large enough to index into the run-length decoded array //! using RunLengthT = uint32_t; //! //! // Specialising BlockRunLengthDecode for a 1D block of 128 threads //! constexpr int BLOCK_DIM_X = 128; //! // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs //! constexpr int RUNS_PER_THREAD = 2; //! // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items //! constexpr int DECODED_ITEMS_PER_THREAD = 4; //! //! // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each //! using BlockRunLengthDecodeT = //! cub::BlockRunLengthDecode; //! //! // Allocate shared memory for BlockRunLengthDecode //! __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage; //! //! // The run-length encoded items and how often they shall be repeated in the run-length decoded output //! RunItemT run_values[RUNS_PER_THREAD]; //! RunLengthT run_lengths[RUNS_PER_THREAD]; //! ... //! //! // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode //! uint32_t total_decoded_size = 0; //! BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size); //! //! // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all //! runs //! // have been decoded. //! uint32_t decoded_window_offset = 0U; //! while (decoded_window_offset < total_decoded_size) //! { //! RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD]; //! RunItemT decoded_items[DECODED_ITEMS_PER_THREAD]; //! //! // The number of decoded items that are valid within this window (aka pass) of run-length decoding //! uint32_t num_valid_items = total_decoded_size - decoded_window_offset; //! block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset); //! //! decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD; //! //! ... //! } //! } //! //! Suppose the set of input ``run_values`` across the block of threads is //! ``{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }`` and //! ``run_lengths`` is ``{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }``. //! The corresponding output ``decoded_items`` in those threads will be //! ``{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], [4, 4, 4, 5], ..., [169, 169, 170, 171] }`` //! and ``relative_offsets`` will be //! ``{ [0, 0, 1, 0], [1, 2, 0, 1], [2, 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }`` during the //! first iteration of the while loop. //! //! 
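//!
//! To make the indexing concrete, here is a tiny worked illustration (run values and lengths are
//! chosen arbitrarily) of how run lengths translate into run offsets and relative offsets:
//!
//! .. code-block:: c++
//!
//!    // run_values  = { 7, 3, 9 }
//!    // run_lengths = { 2, 1, 3 }
//!    //
//!    // An exclusive prefix sum over the lengths yields each run's starting offset in the
//!    // run-length decoded output:
//!    // run_offsets = { 0, 2, 3 },  total_decoded_size = 6
//!    //
//!    // decoded items:      7  7  3  9  9  9
//!    // relative_offsets:   0  1  0  0  1  2
//!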
@endrst //! //! @tparam ItemT //! The data type of the items being run-length decoded //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam RUNS_PER_THREAD //! The number of consecutive runs that each thread contributes //! //! @tparam DECODED_ITEMS_PER_THREAD //! The maximum number of decoded items that each thread holds //! //! @tparam DecodedOffsetT //! Type used to index into the block's decoded items (large enough to hold the sum over all the //! runs' lengths) //! //! @tparam BLOCK_DIM_Y //! The thread block length in threads along the Y dimension //! //! @tparam BLOCK_DIM_Z //! The thread block length in threads along the Z dimension template class BlockRunLengthDecode { //--------------------------------------------------------------------- // CONFIGS & TYPE ALIASES //--------------------------------------------------------------------- private: /// The thread block size in threads static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; /// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0') static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD; /// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length) using RunOffsetScanT = BlockScan; /// Type used to index into the block's runs using RunOffsetT = uint32_t; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Shared memory type required by this thread block union _TempStorage { typename RunOffsetScanT::TempStorage offset_scan; struct { ItemT run_values[BLOCK_RUNS]; DecodedOffsetT run_offsets[BLOCK_RUNS]; } runs; }; // union TempStorage #endif // DOXYGEN_SHOULD_SKIP_THIS /// Internal storage allocator (used when the user does not provide pre-allocated shared memory) _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id uint32_t linear_tid; public: struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // CONSTRUCTOR //--------------------------------------------------------------------- //! @brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. //! The algorithm's temporary storage may not be repurposed between the constructor call and subsequent //! `RunLengthDecode` calls. template _CCCL_DEVICE _CCCL_FORCEINLINE BlockRunLengthDecode( TempStorage& temp_storage, ItemT (&run_values)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT& total_decoded_size) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunLengths(run_values, run_lengths, total_decoded_size); } //! @brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. //! The algorithm's temporary storage may not be repurposed between the constructor call and subsequent //! `RunLengthDecode` calls. 
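// A minimal sketch of the two user-provided-storage initialization paths (array and type names
// follow the overview example above; the values they hold are placeholders):
//
//   // (a) Initialize from run lengths: the constructor computes the runs' offsets internally via
//   //     an exclusive prefix sum and reports the total decoded size.
//   uint32_t total_decoded_size;
//   BlockRunLengthDecodeT block_rld_a(temp_storage, run_values, run_lengths, total_decoded_size);
//
//   // (b) Initialize from precomputed run offsets: no prefix sum is performed and no total
//   //     decoded size is returned.
//   BlockRunLengthDecodeT block_rld_b(temp_storage, run_values, run_offsets);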
template _CCCL_DEVICE _CCCL_FORCEINLINE BlockRunLengthDecode( TempStorage& temp_storage, ItemT (&run_values)[RUNS_PER_THREAD], UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunOffsets(run_values, run_offsets); } /** * \brief Constructor specialised for static temporary storage, initializing using the runs' lengths. */ template _CCCL_DEVICE _CCCL_FORCEINLINE BlockRunLengthDecode( ItemT (&run_values)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT& total_decoded_size) : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunLengths(run_values, run_lengths, total_decoded_size); } /** * \brief Constructor specialised for static temporary storage, initializing using the runs' offsets. */ template _CCCL_DEVICE _CCCL_FORCEINLINE BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD], UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) { InitWithRunOffsets(run_values, run_offsets); } private: /** * @brief Returns the offset of the first value within @p input which compares greater than * @p val. This version takes @p MAX_NUM_ITEMS, an upper bound of the array size, which will * be used to determine the number of binary search iterations at compile time. * * @param[in] input * Input sequence * * @param[in] num_items * Input sequence length * * @param[in] val * Search key */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT StaticUpperBound(InputIteratorT input, OffsetT num_items, T val) { OffsetT lower_bound = 0; OffsetT upper_bound = num_items; #pragma unroll for (int i = 0; i <= Log2::VALUE; i++) { OffsetT mid = cub::MidPoint(lower_bound, upper_bound); mid = (cub::min)(mid, num_items - 1); if (val < input[mid]) { upper_bound = mid; } else { lower_bound = mid + 1; } } return lower_bound; } template _CCCL_DEVICE _CCCL_FORCEINLINE void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD], RunOffsetT (&run_offsets)[RUNS_PER_THREAD]) { // Keep the runs' items and the offsets of each run's beginning in the temporary storage RunOffsetT thread_dst_offset = static_cast(linear_tid) * static_cast(RUNS_PER_THREAD); #pragma unroll for (int i = 0; i < RUNS_PER_THREAD; i++) { temp_storage.runs.run_values[thread_dst_offset] = run_values[i]; temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i]; thread_dst_offset++; } // Ensure run offsets and run values have been writen to shared memory CTA_SYNC(); } template _CCCL_DEVICE _CCCL_FORCEINLINE void InitWithRunLengths( ItemT (&run_values)[RUNS_PER_THREAD], RunLengthT (&run_lengths)[RUNS_PER_THREAD], TotalDecodedSizeT& total_decoded_size) { // Compute the offset for the beginning of each run DecodedOffsetT run_offsets[RUNS_PER_THREAD]; #pragma unroll for (int i = 0; i < RUNS_PER_THREAD; i++) { run_offsets[i] = static_cast(run_lengths[i]); } DecodedOffsetT decoded_size_aggregate; RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate); total_decoded_size = static_cast(decoded_size_aggregate); // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) CTA_SYNC(); InitWithRunOffsets(run_values, run_offsets); } public: /** * \brief Run-length decodes the runs previously passed via a call to Init(...) 
and returns the run-length decoded
   * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
   * run-length decode buffer (i.e., `DECODED_ITEMS_PER_THREAD * BLOCK_THREADS`), only the items that fit within
   * the buffer are returned. Subsequent calls to `RunLengthDecode` adjusting \p from_decoded_offset can be
   * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
   * `RunLengthDecode` is not required.
   * \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
   * run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
   * decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
   * \smemreuse
   *
   * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
   * \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
   * \param[in] from_decoded_offset The offset within the run-length decoded output at which to start decoding.
   *            Invoking with a \p from_decoded_offset that is larger than the total decoded size results in
   *            undefined behavior.
   */
  template <typename RelativeOffsetT>
  _CCCL_DEVICE _CCCL_FORCEINLINE void RunLengthDecode(
    ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
    RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
    DecodedOffsetT from_decoded_offset = 0)
  {
    // The (global) offset of the first item decoded by this thread
    DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;

    // The run that the first decoded item of this thread belongs to.
    // If this thread's offset is already beyond the total decoded size, it will be assigned to the last run.
    RunOffsetT assigned_run =
      StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset)
      - static_cast<RunOffsetT>(1U);

    DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];

    // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
    DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
                                      ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
                                      : temp_storage.runs.run_offsets[assigned_run + 1];

    ItemT val = temp_storage.runs.run_values[assigned_run];

#pragma unroll
    for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
    {
      decoded_items[i] = val;
      item_offsets[i]  = thread_decoded_offset - assigned_run_begin;

      // A thread only needs to fetch the next run if this was not the last loop iteration
      const bool is_final_loop_iteration = (i + 1 >= DECODED_ITEMS_PER_THREAD);
      if (!is_final_loop_iteration && (thread_decoded_offset == assigned_run_end - 1))
      {
        // We make sure that a thread does not re-enter this conditional once it has been assigned the last run,
        // by extending the last run's length to cover all of the thread's items
        assigned_run++;
        assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];

        // If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
        assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
                           ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
                           : temp_storage.runs.run_offsets[assigned_run + 1];

        val = temp_storage.runs.run_values[assigned_run];
      }
      thread_decoded_offset++;
    }
  }

  /**
   * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
   * items in a blocked arrangement to `decoded_items`.
If the number of run-length decoded items exceeds the * run-length decode buffer (i.e., `DECODED_ITEMS_PER_THREAD * BLOCK_THREADS`), only the items that fit within * the buffer are returned. Subsequent calls to `RunLengthDecode` adjusting `from_decoded_offset` can be * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to * `RunLengthDecode` is not required. * * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement * \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results * in undefined behavior. */ _CCCL_DEVICE _CCCL_FORCEINLINE void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], DecodedOffsetT from_decoded_offset = 0) { DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD]; RunLengthDecode(decoded_items, item_offsets, from_decoded_offset); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_scan.cuh000066400000000000000000003020171463375617100176550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file The cub::BlockScan class provides :ref:`collective ` methods for computing a //! parallel prefix sum/scan of items partitioned across a CUDA thread block. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Algorithmic variants ******************************************************************************/ //! 
@brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a //! parallel prefix scan across a CUDA thread block. enum BlockScanAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: //! //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each). //! Each thread then places the partial reduction of its item(s) into shared memory. //! #. Upsweep sequential reduction in shared memory. //! Threads within a single warp rake across segments of shared partial reductions. //! #. A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. //! #. Downsweep sequential exclusive scan in shared memory. //! Threads within a single warp rake across segments of shared partial reductions, //! seeded with the warp-scan output. //! #. Downsweep sequential scan in registers (if threads contribute more than one input), //! seeded with the raking scan output. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - Although this variant may suffer longer turnaround latencies when the //! GPU is under-occupied, it can often provide higher overall throughput //! across the GPU when suitably occupied. //! //! @endrst BLOCK_SCAN_RAKING, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at the expense of higher //! register pressure. Raking threads preserve their "upsweep" segment of values in registers while performing //! warp-synchronous scan, allowing the "downsweep" not to re-read them from shared memory. //! //! @endrst BLOCK_SCAN_RAKING_MEMOIZE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each). //! Each thread then places the partial reduction of its item(s) into shared memory. //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. //! #. A propagation phase where the warp scan outputs in each warp are updated with the aggregate //! from each preceding warp. //! #. Downsweep sequential scan in registers (if threads contribute more than one input), //! seeded with the raking scan output. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - Although this variant may suffer lower overall throughput across the //! GPU because due to a heavy reliance on inefficient warpscans, it can //! often provide lower turnaround latencies when the GPU is under-occupied. //! //! @endrst BLOCK_SCAN_WARP_SCANS, }; //! @rst //! The BlockScan class provides :ref:`collective ` methods for computing a parallel prefix //! sum/scan of items partitioned across a CUDA thread block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - Given a list of input elements and a binary reduction operator, a //! `prefix scan `_ produces an output list where each element is computed //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with //! the addition operator. The term *inclusive indicates* that the *i*\ :sup:`th` output reduction incorporates //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into //! the *i*\ :sup:`th` output reduction. //! 
- @rowmajor //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: //! //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`: //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`: //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional //! register pressure for intermediate storage. //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`: //! A quick (low latency) "tiled warpscans" prefix scan algorithm. //! //! Performance Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - @granularity //! - Uses special instructions when applicable (e.g., warp ``SHFL``) //! - Uses synchronization-free communication between warp lanes when applicable //! - Invokes a minimal number of minimal block-wide synchronization barriers (only //! one or two depending on algorithm selection) //! - Incurs zero bank conflicts for most types //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! //! - Prefix sum variants (vs. generic scan) //! - @blocksize //! //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockScan} //! //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``. //! The corresponding output ``thread_data`` in those threads will be //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``. //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with //! BlockReduce and how to re-purpose the same memory region. //! This example can be easily adapted to the storage required by BlockScan. //! //! @endrst //! //! @tparam T //! Data type being scanned //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ALGORITHM //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use //! (default: cub::BLOCK_SCAN_RAKING) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension //! (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. 
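//!
//! As an illustrative sketch (the concrete values below are examples only, not taken from the documentation
//! above), BlockScan may also be specialized with an explicit scan algorithm and a multidimensional thread block:
//!
//! @code
//! // Example only: a 32x4 thread block (128 threads) using the warp-scans variant
//! using BlockScanT = cub::BlockScan<int, 32, cub::BLOCK_SCAN_WARP_SCANS, 4>;
//!
//! // Allocate shared memory for BlockScanT
//! __shared__ typename BlockScanT::TempStorage temp_storage;
//!
//! // Obtain input item for each thread
//! int thread_data;
//! ...
//!
//! // Collectively compute the block-wide inclusive prefix sum
//! BlockScanT(temp_storage).InclusiveSum(thread_data, thread_data);
//! @endcode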
template class BlockScan { private: /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /** * Ensure the template parameterization meets the requirements of the * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy * cannot be used with thread block sizes not a multiple of the * architectural warp size. */ static constexpr BlockScanAlgorithm SAFE_ALGORITHM = ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(0) != 0)) ? BLOCK_SCAN_RAKING : ALGORITHM; typedef BlockScanWarpScans WarpScans; typedef BlockScanRaking Raking; /// Define the delegate type for the desired algorithm using InternalBlockScan = cub::detail::conditional_t; /// Shared memory storage layout type for BlockScan typedef typename InternalBlockScan::TempStorage _TempStorage; /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } public: /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Exclusive prefix sum operations //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned //! to ``output`` in *thread*\ :sub:`0`. //! //! - @identityzero //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! 
Each thread contributes one input element. //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``. //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's //! scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` is the same value //! also returned by the scan operation. The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! 
// Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be ``0, 1, ..., 127``. //! The output for the second segment will be ``128, 129, ..., 255``. //! //! @endrst //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //! @} end member group //! @name Exclusive prefix sum operations (multiple data per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`. //! //! - @identityzero //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. //! The corresponding output ``thread_data`` in those threads will be //! 
``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in //! a :ref:`blocked arrangement ` across 128 threads where each thread owns //! 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate) { // Reduce consecutive thread items in registers T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" //! value that logically prefixes the thread block's scan inputs. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``. //! 
The functor's input parameter ``block_aggregate`` is the same value also returned //! by the scan operation. The functor will be invoked by the first warp of threads in //! the block, however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. //! Can be stateful. //! - @blocked //! - @granularity //! - @smemreuse //! //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 512 integer items that are partitioned in a :ref:`blocked arrangement ` //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! typedef cub::BlockLoad BlockLoad; //! typedef cub::BlockStore BlockStore; //! typedef cub::BlockScan BlockScan; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage.scan).ExclusiveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``. //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member //! `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! 
*warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //! @} end member group // Exclusive prefix sums //! @name Exclusive prefix scan operations //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``. //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); //! //! 
Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``. //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``. //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)`` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to ``input``) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to ``output[0]`` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as //! the "seed" value that logically prefixes the thread block's scan inputs. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``. //! The functor's input parameter ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, however only the return value from //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. //! Each tile consists of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! 
BlockPrefixCallbackOp prefix_op(INT_MIN); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``. //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); } //! @} end member group // Inclusive prefix sums //! @name Exclusive prefix scan operations (multiple data per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 512 integer //! items that are partitioned in a [blocked arrangement](index.html#sec5sec3) //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member //! `T operator()(const T &a, const T &b)` //! //! 
@param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in //! a :ref:`blocked arrangement ` across 128 threads where each thread owns //! 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``. //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! 
block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread //! block's scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` //! is the same value also returned by the scan operation. The functor will be invoked by the //! first warp of threads in the block, however only the return value from //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! typedef cub::BlockLoad BlockLoad; //! typedef cub::BlockStore BlockStore; //! typedef cub::BlockScan BlockScan; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! 
// Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage.scan).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``. //! The output for the second segment will be //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } //! @} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans //! @name Exclusive prefix scan operations (no initial value, single datum per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. Also provides every thread with the block-wide //! ``block_aggregate`` of all inputs. With no initial value, the output computed for //! *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! 
@endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); } //! @} end member group //! @name Exclusive prefix scan operations (no initial value, multiple data per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. With no initial value, the //! output computed for *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_partial, thread_partial, scan_op); // Exclusive scan in registers with prefix internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. Also provides every thread //! with the block-wide ``block_aggregate`` of all inputs. //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); // Exclusive scan in registers with prefix internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } //! 
@} end member group #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans //! @name Inclusive prefix sum operations //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) //! as the scan operator. Each thread contributes one input element. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output) { InclusiveScan(input, output, cub::Sum()); } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``. //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate) { InclusiveScan(input, output, cub::Sum(), block_aggregate); } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by //! 
*lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's //! scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter //! ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied //! as the block-wide prefix. Can be stateful. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. //! Each tile consists of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be ``1, 2, ..., 128``. //! The output for the second segment will be ``129, 130, ..., 256``. //! //! @endrst //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied //! to the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) { InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //! @} end member group //! 
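//! On the host, a kernel such as ``ExampleKernel`` above is launched with a single thread block
//! sized to match the BlockScan specialization; a minimal sketch (the 128-thread block size and
//! the buffer setup are illustrative assumptions):
//!
//! .. code-block:: c++
//!
//!    int num_items = 1024;
//!    int *d_data;
//!    cudaMalloc(&d_data, num_items * sizeof(int));
//!    ... // populate d_data
//!
//!    // One block of 128 threads iterates over the tiles of d_data
//!    ExampleKernel<<<1, 128>>>(d_data, num_items);
//!    cudaDeviceSynchronize();
//!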
@name Inclusive prefix sum operations (multiple data per thread) //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0]); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The //! corresponding output ``thread_data`` in those threads will be //! 
``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``. //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate) { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0], block_aggregate); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" //! value that logically prefixes the thread block's scan inputs. Also provides every thread with the //! block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter //! ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied //! as the block-wide prefix. Can be stateful. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 512 integer items that are partitioned in a :ref:`blocked arrangement ` //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! 
// Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! typedef cub::BlockLoad BlockLoad; //! typedef cub::BlockStore BlockStore; //! typedef cub::BlockScan BlockScan; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage.scan).InclusiveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be //! ``513, 514, 515, 516, ..., 1023, 1024``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the //! logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0], block_prefix_callback_op); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } } //! @} end member group //! @name Inclusive prefix scan operations //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ...
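//!        // For instance, each thread might read one element of a global array here,
//!        // e.g. thread_data = d_data[threadIdx.x]; (d_data is illustrative and not part of this kernel's signature)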
//! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data`` //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. Also provides every thread with the block-wide //! ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 128 //! integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide inclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data`` //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value //! ``126`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! Block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as //! the "seed" value that logically prefixes the thread block's scan inputs. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter //! 
``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied //! as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(INT_MIN); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan( //! thread_data, thread_data, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment //! will be ``128, 128, 130, 130, ..., 254, 254``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); } //! @} end member group //! 
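//! The ``scan_op`` arguments accepted by these methods are not limited to the functors shipped
//! with CUB; any binary functor whose ``operator()`` is callable in device code and has the form
//! ``T operator()(const T &a, const T &b)`` can be used. A minimal sketch of a user-defined
//! operator (illustrative only):
//!
//! .. code-block:: c++
//!
//!    struct CustomMin
//!    {
//!        __host__ __device__ __forceinline__
//!        int operator()(const int &a, const int &b) const
//!        {
//!            return (b < a) ? b : a;
//!        }
//!    };
//!
//!    // ... later, inside a kernel:
//!    // BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, CustomMin());
//!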
@name Inclusive prefix scan operations (multiple data per thread) //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op) { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op); // Inclusive scan in registers with prefix as seed (first thread does not seed) internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. Also provides every thread //! with the block-wide ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that //! are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! typedef cub::BlockScan BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! 
int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``. //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! Block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate) { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op, block_aggregate); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan (with no initial value) ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); // Inclusive scan in registers with prefix as seed (first thread does not seed) internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the //! thread block's scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``. //! The functor's input parameter ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, however only the return value //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! 
// Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! typedef cub::BlockLoad BlockLoad; //! typedef cub::BlockStore BlockStore; //! typedef cub::BlockScan BlockScan; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage.scan).InclusiveScan( //! thread_data, thread_data, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } } //! 
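//! When BlockScan shares a dynamically allocated shared memory region with other collectives
//! (as in ``block/example_block_reduce_dyn_smem.cu``), the same methods apply unchanged; a
//! minimal sketch (the 128-thread block size and kernel name are illustrative assumptions):
//!
//! .. code-block:: c++
//!
//!    __global__ void ExampleKernel(...)
//!    {
//!        typedef cub::BlockScan<int, 128> BlockScan;
//!
//!        // Reinterpret the dynamic shared memory allocation as BlockScan's TempStorage
//!        extern __shared__ char smem[];
//!        typename BlockScan::TempStorage &temp_storage =
//!            *reinterpret_cast<typename BlockScan::TempStorage *>(smem);
//!
//!        int thread_data;
//!        ...
//!
//!        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
//!    }
//!
//!    // Host side: pass the storage size as the dynamic shared memory launch parameter, e.g.
//!    // ExampleKernel<<<grid_size, 128, sizeof(cub::BlockScan<int, 128>::TempStorage)>>>(...);
//!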
@} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_shuffle.cuh000066400000000000000000000251271463375617100203710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file The cub::BlockShuffle class provides :ref:`collective ` methods for shuffling //! data partitioned across a CUDA thread block. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN //! @rst //! The BlockShuffle class provides :ref:`collective ` //! methods for shuffling data partitioned across a CUDA thread block. //! //! Overview //! ++++++++++++++++ //! //! It is commonplace for blocks of threads to rearrange data items between threads. //! The BlockShuffle abstraction allows threads to efficiently shift items either //! (a) up to their successor or //! (b) down to their predecessor //! //! @endrst //! //! @tparam T //! The data type to be exchanged. //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! 
**[optional]** Unused template class BlockShuffle { private: enum { BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0), WARP_THREADS = 1 << LOG_WARP_THREADS, WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, }; /// Shared memory storage layout type (last element from each thread's input) typedef T _TempStorage[BLOCK_THREADS]; public: /// \smemstorage{BlockShuffle} struct TempStorage : Uninitialized<_TempStorage> {}; private: /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } public: //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockShuffle() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation * as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockShuffle(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Shuffle movement //! @{ //! @rst //! //! Each *thread*\ :sub:`i` obtains the ``input`` provided by *thread*\ :sub:`i + distance`. //! The offset ``distance`` may be negative. //! //! - @smemreuse //! //! @endrst //! //! @param[in] input //! @rst //! The input item from the calling thread (*thread*\ :sub:`i`) //! @endrst //! //! @param[out] output //! @rst //! The ``input`` item from the successor (or predecessor) thread //! *thread*\ :sub:`i + distance` (may be aliased to ``input``). //! This value is only updated for *thread*\ :sub:`i` when //! ``0 <= (i + distance) < BLOCK_THREADS - 1`` //! @endrst //! //! @param[in] distance //! Offset distance (may be negative) _CCCL_DEVICE _CCCL_FORCEINLINE void Offset(T input, T& output, int distance = 1) { temp_storage[linear_tid] = input; CTA_SYNC(); const int offset_tid = static_cast(linear_tid) + distance; if ((offset_tid >= 0) && (offset_tid < BLOCK_THREADS)) { output = temp_storage[static_cast(offset_tid)]; } } //! @rst //! Each *thread*\ :sub:`i` obtains the ``input`` provided by *thread*\ :sub:`i + distance`. //! //! - @smemreuse //! //! @endrst //! //! @param[in] input //! The calling thread's input item //! //! @param[out] output //! @rst //! The ``input`` item from thread //! *thread*\ :sub:`(i + distance) % BLOCK_THREADS` (may be aliased to ``input``). //! This value is not updated for *thread*\ :sub:`BLOCK_THREADS - 1`. //! @endrst //! //! @param[in] distance //! Offset distance (``0 < distance < BLOCK_THREADS``) _CCCL_DEVICE _CCCL_FORCEINLINE void Rotate(T input, T& output, unsigned int distance = 1) { temp_storage[linear_tid] = input; CTA_SYNC(); unsigned int offset = linear_tid + distance; if (offset >= BLOCK_THREADS) { offset -= BLOCK_THREADS; } output = temp_storage[offset]; } //! @rst //! The thread block rotates its :ref:`blocked arrangement ` of //! ``input`` items, shifting it up by one item. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @param[in] input //! The calling thread's input items //! //! @param[out] prev //! @rst //!
The corresponding predecessor items (may be aliased to ``input``). //! The item ``prev[0]`` is not updated for *thread*\ :sub:`0`. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void Up(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD]) { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; CTA_SYNC(); #pragma unroll for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) { prev[ITEM] = input[ITEM - 1]; } if (linear_tid > 0) { prev[0] = temp_storage[linear_tid - 1]; } } //! @rst //! The thread block rotates its :ref:`blocked arrangement ` //! of ``input`` items, shifting it up by one item. All threads receive the ``input`` provided by //! *thread*\ :sub:`BLOCK_THREADS - 1`. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @param[in] input //! The calling thread's input items //! //! @param[out] prev //! @rst //! The corresponding predecessor items (may be aliased to ``input``). //! The item ``prev[0]`` is not updated for *thread*\ :sub:`0`. //! @endrst //! //! @param[out] block_suffix //! @rst //! The item ``input[ITEMS_PER_THREAD - 1]`` from *thread*\ :sub:`BLOCK_THREADS - 1`, provided to all threads //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void Up(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD], T& block_suffix) { Up(input, prev); block_suffix = temp_storage[BLOCK_THREADS - 1]; } //! @rst //! The thread block rotates its :ref:`blocked arrangement ` //! of ``input`` items, shifting it down by one item. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @param[in] input //! The calling thread's input items //! //! @param[out] prev //! @rst //! The corresponding predecessor items (may be aliased to ``input``). //! The value ``prev[0]`` is not updated for *thread*\ :sub:`BLOCK_THREADS - 1`. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void Down(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD]) { temp_storage[linear_tid] = input[0]; CTA_SYNC(); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) { prev[ITEM] = input[ITEM + 1]; } if (linear_tid < BLOCK_THREADS - 1) { prev[ITEMS_PER_THREAD - 1] = temp_storage[linear_tid + 1]; } } //! @rst //! The thread block rotates its :ref:`blocked arrangement ` of input items, //! shifting it down by one item. All threads receive ``input[0]`` provided by *thread*\ :sub:`0`. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @param[in] input //! The calling thread's input items //! //! @param[out] prev //! @rst //! The corresponding predecessor items (may be aliased to ``input``). //! The value ``prev[0]`` is not updated for *thread*\ :sub:`BLOCK_THREADS - 1`. //! @endrst //! //! @param[out] block_prefix //! @rst //! The item ``input[0]`` from *thread*\ :sub:`0`, provided to all threads //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void Down(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD], T& block_prefix) { Down(input, prev); block_prefix = temp_storage[0]; } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/block_store.cuh000066400000000000000000001211151463375617100200630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file Operations for writing linear segments of data from the CUDA thread block #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN //! @name Blocked arrangement I/O (direct) //! @{ //! @rst //! Store a blocked arrangement of items across a thread block into a linear segment of items //! //! @blocked //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to store. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam OutputIteratorT //! **[inferred]** The random-access iterator type for output @iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., ``(threadIdx.y * blockDim.x) + linear_tid`` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base output iterator for storing to //! //! @param[in] items //! Data to store template _CCCL_DEVICE _CCCL_FORCEINLINE void StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); // Store directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { thread_itr[ITEM] = items[ITEM]; } } //! @rst //! Store a blocked arrangement of items across a //! thread block into a linear segment of items, guarded by range //! //! @blocked //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to store. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam OutputIteratorT //! **[inferred]** The random-access iterator type for output @iterator. //! //! @param[in] linear_tid //! 
A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base output iterator for storing to //! //! @param[in] items //! Data to store //! //! @param[in] valid_items //! Number of valid items to write template _CCCL_DEVICE _CCCL_FORCEINLINE void StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); // Store directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) { thread_itr[ITEM] = items[ITEM]; } } } //! @rst //! Store a blocked arrangement of items across a //! thread block into a linear segment of items. //! //! @blocked //! //! The output offset (``block_ptr + block_offset``) must be quad-item aligned, //! which is the default starting offset returned by ``cudaMalloc()`` //! //! The following conditions will prevent vectorization and storing will //! fall back to cub::BLOCK_STORE_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to store. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., ``(threadIdx.y * blockDim.x) + linear_tid`` for 2D thread blocks) //! //! @param[in] block_ptr //! Input pointer for storing from //! //! @param[in] items //! Data to store template _CCCL_DEVICE _CCCL_FORCEINLINE void StoreDirectBlockedVectorized(int linear_tid, T* block_ptr, T (&items)[ITEMS_PER_THREAD]) { enum { // Maximum CUDA vector size is 4 elements MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), // Vector size must be a power of two and an even divisor of the items per thread VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? MAX_VEC_SIZE : 1, VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, }; // Vector type typedef typename CubVector::Type Vector; // Alias global pointer Vector* block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) Vector raw_vector[VECTORS_PER_THREAD]; T* raw_items = reinterpret_cast(raw_vector); // Copy #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { raw_items[ITEM] = items[ITEM]; } // Direct-store using vector types StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); } //! @} end member group //! @name Striped arrangement I/O (direct) //! @{ //! @rst //! Store a striped arrangement of data across the thread block into a //! linear segment of items. //! //! @striped //! //! @endrst //! //! @tparam BLOCK_THREADS //! The thread block size in threads //! //! @tparam T //! **[inferred]** The data type to store. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam OutputIteratorT //! **[inferred]** The random-access iterator type for output @iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! 
(e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base output iterator for storing to //! //! @param[in] items //! Data to store template _CCCL_DEVICE _CCCL_FORCEINLINE void StoreDirectStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { OutputIteratorT thread_itr = block_itr + linear_tid; // Store directly in striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; } } //! @rst //! Store a striped arrangement of data across the thread block into //! a linear segment of items, guarded by range //! //! @striped //! //! @endrst //! //! @tparam BLOCK_THREADS //! The thread block size in threads //! //! @tparam T //! **[inferred]** The data type to store. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam OutputIteratorT //! **[inferred]** The random-access iterator type for output @iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base output iterator for storing to //! //! @param[in] items //! Data to store //! //! @param[in] valid_items //! Number of valid items to write template _CCCL_DEVICE _CCCL_FORCEINLINE void StoreDirectStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { OutputIteratorT thread_itr = block_itr + linear_tid; // Store directly in striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) { thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; } } } //! @} end member group //! @name Warp-striped arrangement I/O (direct) //! @{ //! @rst //! Store a warp-striped arrangement of data across the //! thread block into a linear segment of items. //! //! @warpstriped //! //! Usage Considerations //! ++++++++++++++++++++ //! //! The number of threads in the thread block must be a multiple of the architecture's warp size. //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to store. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam OutputIteratorT //! **[inferred]** The random-access iterator type for output @iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base output iterator for storing to //! //! @param[out] items //! Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; OutputIteratorT thread_itr = block_itr + warp_offset + tid; // Store directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; } } //! @rst //! Store a warp-striped arrangement of data across the thread block into a //! linear segment of items, guarded by range //! //! @warpstriped //! //! Usage Considerations //! ++++++++++++++++++++ //! 
//! The number of threads in the thread block must be a multiple of the architecture's warp size. //! //! @endrst //! //! @tparam T //! **[inferred]** The data type to store. //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam OutputIteratorT //! **[inferred]** The random-access iterator type for output @iterator. //! //! @param[in] linear_tid //! A suitable 1D thread-identifier for the calling thread //! (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D thread blocks) //! //! @param[in] block_itr //! The thread block's base output iterator for storing to //! //! @param[in] items //! Data to store //! //! @param[in] valid_items //! Number of valid items to write template _CCCL_DEVICE _CCCL_FORCEINLINE void StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; OutputIteratorT thread_itr = block_itr + warp_offset + tid; // Store directly in warp-striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) { thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; } } } //! @} end member group //----------------------------------------------------------------------------- // Generic BlockStore abstraction //----------------------------------------------------------------------------- //! cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a //! blocked arrangement of items across a CUDA thread block to a linear segment of memory. enum BlockStoreAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is written directly to memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) decreases as the //! access stride between threads increases (i.e., the number items per thread). //! //! @endrst BLOCK_STORE_DIRECT, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is written directly to memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) remains high regardless //! of items written per thread. //! //! @endrst BLOCK_STORE_STRIPED, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is written directly //! to memory using CUDA's built-in vectorized stores as a coalescing optimization. //! For example, ``st.global.v4.s32`` instructions will be generated //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high until the the //! access stride between threads (i.e., the number items per thread) exceeds the //! maximum vector store width (typically 4 items or 64B, whichever is lower). //! - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: //! - ``ITEMS_PER_THREAD`` is odd //! - The ``OutputIteratorT`` is not a simple pointer type //! - The block output offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! 
(e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) //! //! @endrst BLOCK_STORE_VECTORIZE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` is locally //! transposed and then efficiently written to memory as a :ref:`striped arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless //! of items written per thread. //! - The local reordering incurs slightly longer latencies and throughput than the //! direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. //! //! @endrst BLOCK_STORE_TRANSPOSE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! A :ref:`blocked arrangement ` is locally //! transposed and then efficiently written to memory as a //! :ref:`warp-striped arrangement `. //! //! Usage Considerations //! ++++++++++++++++++++++++++ //! //! - BLOCK_THREADS must be a multiple of WARP_THREADS //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless //! of items written per thread. //! - The local reordering incurs slightly longer latencies and throughput than the //! direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. //! //! @endrst BLOCK_STORE_WARP_TRANSPOSE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` is locally //! transposed and then efficiently written to memory as a //! :ref:`warp-striped arrangement `. //! To reduce the shared memory requirement, only one warp's worth of shared //! memory is provisioned and is subsequently time-sliced among warps. //! //! Usage Considerations //! ++++++++++++++++++++++++++ //! //! - BLOCK_THREADS must be a multiple of WARP_THREADS //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high regardless //! of items written per thread. //! - Provisions less shared memory temporary storage, but incurs larger //! latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. //! //! @endrst BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, }; //! @rst //! The BlockStore class provides :ref:`collective ` data movement //! methods for writing a :ref:`blocked arrangement ` of items //! partitioned across a CUDA thread block to a linear segment of memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - The BlockStore class provides a single data movement abstraction that can be specialized //! to implement different cub::BlockStoreAlgorithm strategies. This facilitates different //! performance policies for different architectures, data types, granularity sizes, etc. //! - BlockStore can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::BLOCK_STORE_DIRECT`: //! A :ref:`blocked arrangement ` of data is written directly to memory. //! #. :cpp:enumerator:`cub::BLOCK_STORE_STRIPED`: //! A :ref:`striped arrangement ` of data is written directly to memory. //! #. :cpp:enumerator:`cub::BLOCK_STORE_VECTORIZE`: //! A :ref:`blocked arrangement ` of data is written directly to memory //! using CUDA's built-in vectorized stores as a coalescing optimization. //! #. :cpp:enumerator:`cub::BLOCK_STORE_TRANSPOSE` //! A :ref:`blocked arrangement ` is locally transposed into //! a :ref:`striped arrangement ` which is then written to memory. //! #. 
:cpp:enumerator:`cub::BLOCK_STORE_WARP_TRANSPOSE`: //! A :ref:`blocked arrangement ` is locally transposed into //! a :ref:`warp-striped arrangement ` which is then written to memory. //! #. :cpp:enumerator:`cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED`: //! A :ref:`blocked arrangement ` is locally transposed into //! a :ref:`warp-striped arrangement ` which is then written to memory. //! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is //! subsequently time-sliced among warps. //! //! - @rowmajor //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockStore} //! //! The code snippet below illustrates the storing of a "blocked" arrangement //! of 512 integers across 128 threads (where each thread owns 4 consecutive items) //! into a linear segment of memory. The store is specialized for ``BLOCK_STORE_WARP_TRANSPOSE``, //! meaning items are locally reordered among threads so that memory references will be //! efficiently coalesced using a warp-striped access pattern. //! //! .. code-block:: c++ //! //! #include <cub/block/block_store.cuh> // or equivalently <cub/cub.cuh> //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore; //! //! // Allocate shared memory for BlockStore //! __shared__ typename BlockStore::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Store items to linear memory //! BlockStore(temp_storage).Store(d_data, thread_data); //! //! Suppose the set of ``thread_data`` across the block of threads is //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``. //! //! Re-using dynamically allocated shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of //! dynamically shared memory with BlockReduce and how to re-purpose the same memory region. //! This example can be easily adapted to the storage required by BlockStore. //! //! @endrst //! //! @tparam T //! The type of data to be written. //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ITEMS_PER_THREAD //! The number of consecutive items partitioned onto each thread. //! //! @tparam ALGORITHM //! **[optional]** cub::BlockStoreAlgorithm tuning policy enumeration (default: cub::BLOCK_STORE_DIRECT) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused.
template class BlockStore { private: enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Store helper template struct StoreInternal; template struct StoreInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /** * @brief Store items into a linear segment of memory * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } /** * @brief Store items into a linear segment of memory, guarded by range * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store * * @param[in] valid_items * Number of valid items to write */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; /** * BLOCK_STORE_STRIPED specialization of store helper */ template struct StoreInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /** * @brief Store items into a linear segment of memory * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectStriped(linear_tid, block_itr, items); } /** * @brief Store items into a linear segment of memory, guarded by range * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store * * @param[in] valid_items * Number of valid items to write */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectStriped(linear_tid, block_itr, items, valid_items); } }; /** * BLOCK_STORE_VECTORIZE specialization of store helper */ template struct StoreInternal { /// Shared memory storage layout type typedef NullType TempStorage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /** * @brief Store items into a linear segment of memory, * specialized for native pointer types (attempts vectorization) * * @param[in] block_ptr * The thread block's base output iterator for storing to * * @param[in] items * Data to store */ _CCCL_DEVICE _CCCL_FORCEINLINE void Store(T* block_ptr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlockedVectorized(linear_tid, block_ptr, items); } /** * @brief Store items into a linear segment of memory, * specialized for opaque input iterators (skips vectorization) * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } /** * @brief Store items into a linear 
segment of memory, guarded by range * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store * * @param[in] valid_items * Number of valid items to write */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; /** * BLOCK_STORE_TRANSPOSE specialization of store helper */ template struct StoreInternal { // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage { /// Temporary storage for partially-full block guard volatile int valid_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} /** * @brief Store items into a linear segment of memory * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToStriped(items); StoreDirectStriped(linear_tid, block_itr, items); } /** * @brief Store items into a linear segment of memory, guarded by range * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store * * @param[in] valid_items * Number of valid items to write */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { BlockExchange(temp_storage).BlockedToStriped(items); if (linear_tid == 0) { // Move through volatile smem as a workaround to prevent RF spilling on // subsequent loads temp_storage.valid_items = valid_items; } CTA_SYNC(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; /** * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper */ template struct StoreInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage { /// Temporary storage for partially-full block guard volatile int valid_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} /** * @brief Store items into a linear segment of memory * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } 
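  // As with the unguarded overload above, the guarded Store below first uses
  // BlockExchange to rearrange the tile from a blocked to a warp-striped
  // layout in shared memory so that the subsequent per-warp stores are
  // coalesced.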
/** * @brief Store items into a linear segment of memory, guarded by range * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store * * @param[in] valid_items * Number of valid items to write */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) { // Move through volatile smem as a workaround to prevent RF spilling on // subsequent loads temp_storage.valid_items = valid_items; } CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; /** * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper */ template struct StoreInternal { enum { WARP_THREADS = CUB_WARP_THREADS(0) }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; /// Shared memory storage layout type struct _TempStorage : BlockExchange::TempStorage { /// Temporary storage for partially-full block guard volatile int valid_items; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} /** * @brief Store items into a linear segment of memory * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } /** * @brief Store items into a linear segment of memory, guarded by range * * @param[in] block_itr * The thread block's base output iterator for storing to * * @param[in] items * Data to store * * @param[in] valid_items * Number of valid items to write */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) { // Move through volatile smem as a workaround to prevent RF spilling on // subsequent loads temp_storage.valid_items = valid_items; } CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; /// Internal load implementation to use typedef StoreInternal InternalStore; /// Shared memory storage layout type typedef typename InternalStore::TempStorage _TempStorage; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; public: //! @smemstorage{BlockStore} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ /** * @brief Collective constructor using a private static allocation of shared memory as temporary storage. 
*/ _CCCL_DEVICE _CCCL_FORCEINLINE BlockStore() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param temp_storage[in] * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockStore(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Data movement //! @{ //! @rst //! Store items into a linear segment of memory //! //! - @blocked //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the storing of a "blocked" arrangement //! of 512 integers across 128 threads (where each thread owns 4 consecutive items) //! into a linear segment of memory. The store is specialized for ``BLOCK_STORE_WARP_TRANSPOSE``, //! meaning items are locally reordered among threads so that memory references will be //! efficiently coalesced using a warp-striped access pattern. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockStore BlockStore; //! //! // Allocate shared memory for BlockStore //! __shared__ typename BlockStore::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Store items to linear memory //! int thread_data[4]; //! BlockStore(temp_storage).Store(d_data, thread_data); //! //! Suppose the set of ``thread_data`` across the block of threads is //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``. //! //! @endrst //! //! @param block_itr[out] //! The thread block's base output iterator for storing to //! //! @param items[in] //! Data to store template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { InternalStore(temp_storage, linear_tid).Store(block_itr, items); } //! @rst //! Store items into a linear segment of memory, guarded by range. //! //! - @blocked //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the guarded storing of a "blocked" arrangement //! of 512 integers across 128 threads (where each thread owns 4 consecutive items) //! into a linear segment of memory. The store is specialized for ``BLOCK_STORE_WARP_TRANSPOSE``, //! meaning items are locally reordered among threads so that memory references will be //! efficiently coalesced using a warp-striped access pattern. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) //! { //! // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each //! typedef cub::BlockStore BlockStore; //! //! // Allocate shared memory for BlockStore //! __shared__ typename BlockStore::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Store items to linear memory //! int thread_data[4]; //! BlockStore(temp_storage).Store(d_data, thread_data, valid_items); //! //! Suppose the set of ``thread_data`` across the block of threads is //! 
``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``. //! The output ``d_data`` will be ``0, 1, 2, 3, 4, ?, ?, ?, ...``, with //! only the first two threads being unmasked to store portions of valid data. //! //! @endrst //! //! @param block_itr[out] //! The thread block's base output iterator for storing to //! //! @param items[in] //! Data to store //! //! @param valid_items[in] //! Number of valid items to write template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); } //! @} end member group }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template > struct BlockStoreType { using type = cub::BlockStore; }; #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/radix_rank_sort_operations.cuh000066400000000000000000000466461463375617100232300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * radix_rank_sort_operations.cuh contains common abstractions, definitions and * operations used for radix sorting and ranking. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** \brief Base struct for digit extractor. Contains common code to provide special handling for floating-point -0.0. 
\note This handles correctly both the case when the keys are bitwise-complemented after twiddling for descending sort (in onesweep) as well as when the keys are not bit-negated, but the implementation handles descending sort separately (in other implementations in CUB). Twiddling alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are subsequent bit patterns and bitwise complements of each other. For onesweep, both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending sort. For all other sorting implementations in CUB, both are always mapped to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other and only one of them is used, the sorting works correctly. For double, the same applies, but with 64-bit patterns. */ template ::CATEGORY> struct BaseDigitExtractor { using TraitsT = Traits; using UnsignedBits = typename TraitsT::UnsignedBits; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key) { return key; } }; template struct BaseDigitExtractor { using TraitsT = Traits; using UnsignedBits = typename TraitsT::UnsignedBits; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits ProcessFloatMinusZero(UnsignedBits key) { UnsignedBits TWIDDLED_MINUS_ZERO_BITS = TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1)); UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0); return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key; } }; /** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a * key from a digit. */ template struct BFEDigitExtractor : BaseDigitExtractor { using typename BaseDigitExtractor::UnsignedBits; ::cuda::std::uint32_t bit_start; ::cuda::std::uint32_t num_bits; explicit _CCCL_DEVICE _CCCL_FORCEINLINE BFEDigitExtractor(::cuda::std::uint32_t bit_start = 0, ::cuda::std::uint32_t num_bits = 0) : bit_start(bit_start) , num_bits(num_bits) {} _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::uint32_t Digit(UnsignedBits key) const { return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits); } }; /** \brief A wrapper type to extract digits. Uses a combination of shift and * bitwise and to extract digits. 
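 *
 * Concretely, for a key with unsigned bit pattern k, Digit(k) evaluates
 * (k >> bit_start) & ((1 << num_bits) - 1), i.e. the num_bits-wide digit that
 * begins at bit position bit_start, after any floating-point -0.0 key has been
 * canonicalized to +0.0 by ProcessFloatMinusZero.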
*/ template struct ShiftDigitExtractor : BaseDigitExtractor { using typename BaseDigitExtractor::UnsignedBits; ::cuda::std::uint32_t bit_start; ::cuda::std::uint32_t mask; explicit _CCCL_DEVICE _CCCL_FORCEINLINE ShiftDigitExtractor(::cuda::std::uint32_t bit_start = 0, ::cuda::std::uint32_t num_bits = 0) : bit_start(bit_start) , mask((1 << num_bits) - 1) {} _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::uint32_t Digit(UnsignedBits key) const { return ::cuda::std::uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask; } }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { template struct logic_helper_t; template struct true_t { static constexpr bool value = true; }; template using all_t = // ::cuda::std::is_same< // logic_helper_t, // logic_helper_t::value...>>; struct identity_decomposer_t { template _CCCL_HOST_DEVICE T& operator()(T& key) const { return key; } }; template _CCCL_HOST_DEVICE void for_each_member_impl_helper(F f, const ::cuda::std::tuple& tpl, THRUST_NS_QUALIFIER::index_sequence) { auto sink = {(f(::cuda::std::get(tpl)), 0)...}; (void) sink; } template _CCCL_HOST_DEVICE void for_each_member_impl(F f, const ::cuda::std::tuple& tpl) { static_assert(sizeof...(Ts), "Empty aggregates are not supported"); // Most radix operations are indifferent to the order of operations. // Conversely, the digit extractor traverses fields from the least significant // to the most significant to imitate bitset printing where higher bits are on // the left. It also maps to intuition, where something coming first is more // important. Therefore, we traverse fields on the opposite order. for_each_member_impl_helper(f, tpl, THRUST_NS_QUALIFIER::make_reversed_index_sequence{}); } template _CCCL_HOST_DEVICE void for_each_member(F f, DecomposerT decomposer, T& aggregate) { for_each_member_impl(f, decomposer(aggregate)); } namespace radix { template using void_t = void; template struct is_fundamental_type { static constexpr bool value = false; }; template struct is_fundamental_type::UnsignedBits>> { static constexpr bool value = true; }; template struct is_tuple_of_references_to_fundamental_types_t : ::cuda::std::false_type {}; template struct is_tuple_of_references_to_fundamental_types_t< // ::cuda::std::tuple, // typename ::cuda::std::enable_if< // all_t::value...>::value // >::type> // : ::cuda::std::true_type {}; template using decomposer_check_t = is_tuple_of_references_to_fundamental_types_t>; template struct bit_ordered_conversion_policy_t { using bit_ordered_type = typename Traits::UnsignedBits; static _CCCL_HOST_DEVICE bit_ordered_type to_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val) { return Traits::TwiddleIn(val); } static _CCCL_HOST_DEVICE bit_ordered_type from_bit_ordered(detail::identity_decomposer_t, bit_ordered_type val) { return Traits::TwiddleOut(val); } }; template struct bit_ordered_inversion_policy_t { using bit_ordered_type = typename Traits::UnsignedBits; static _CCCL_HOST_DEVICE bit_ordered_type inverse(detail::identity_decomposer_t, bit_ordered_type val) { return ~val; } }; template ::value> struct traits_t { using bit_ordered_type = typename Traits::UnsignedBits; using bit_ordered_conversion_policy = bit_ordered_conversion_policy_t; using bit_ordered_inversion_policy = bit_ordered_inversion_policy_t; template using digit_extractor_t = FundamentalExtractorT; static _CCCL_HOST_DEVICE bit_ordered_type min_raw_binary_key(detail::identity_decomposer_t) { return Traits::LOWEST_KEY; } static _CCCL_HOST_DEVICE bit_ordered_type 
max_raw_binary_key(detail::identity_decomposer_t) { return Traits::MAX_KEY; } static _CCCL_HOST_DEVICE int default_end_bit(detail::identity_decomposer_t) { return sizeof(T) * 8; } template static _CCCL_HOST_DEVICE digit_extractor_t digit_extractor(int begin_bit, int num_bits, detail::identity_decomposer_t) { return FundamentalExtractorT(begin_bit, num_bits); } }; template struct min_raw_binary_key_f { DecomposerT decomposer; template _CCCL_HOST_DEVICE void operator()(T& field) { using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; reinterpret_cast(field) = traits::min_raw_binary_key(detail::identity_decomposer_t{}); } }; template _CCCL_HOST_DEVICE void min_raw_binary_key(DecomposerT decomposer, T& aggregate) { detail::for_each_member(min_raw_binary_key_f{decomposer}, decomposer, aggregate); } template struct max_raw_binary_key_f { DecomposerT decomposer; template _CCCL_HOST_DEVICE void operator()(T& field) { using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; reinterpret_cast(field) = traits::max_raw_binary_key(detail::identity_decomposer_t{}); } }; template _CCCL_HOST_DEVICE void max_raw_binary_key(DecomposerT decomposer, T& aggregate) { detail::for_each_member(max_raw_binary_key_f{decomposer}, decomposer, aggregate); } template struct to_bit_ordered_f { DecomposerT decomposer; template _CCCL_HOST_DEVICE void operator()(T& field) { using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; auto& ordered_field = reinterpret_cast(field); ordered_field = bit_ordered_conversion::to_bit_ordered(detail::identity_decomposer_t{}, ordered_field); } }; template _CCCL_HOST_DEVICE void to_bit_ordered(DecomposerT decomposer, T& aggregate) { detail::for_each_member(to_bit_ordered_f{decomposer}, decomposer, aggregate); } template struct from_bit_ordered_f { DecomposerT decomposer; template _CCCL_HOST_DEVICE void operator()(T& field) { using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion = typename traits::bit_ordered_conversion_policy; auto& ordered_field = reinterpret_cast(field); ordered_field = bit_ordered_conversion::from_bit_ordered(detail::identity_decomposer_t{}, ordered_field); } }; template _CCCL_HOST_DEVICE void from_bit_ordered(DecomposerT decomposer, T& aggregate) { detail::for_each_member(from_bit_ordered_f{decomposer}, decomposer, aggregate); } template struct inverse_f { DecomposerT decomposer; template _CCCL_HOST_DEVICE void operator()(T& field) { using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; auto& ordered_field = reinterpret_cast(field); ordered_field = ~ordered_field; } }; template _CCCL_HOST_DEVICE void inverse(DecomposerT decomposer, T& aggregate) { detail::for_each_member(inverse_f{decomposer}, decomposer, aggregate); } template struct default_end_bit_f { int& result; DecomposerT decomposer; template _CCCL_HOST_DEVICE void operator()(T& field) { result += sizeof(field) * 8; } }; template _CCCL_HOST_DEVICE int default_end_bit(DecomposerT decomposer, T& aggregate) { int result{}; detail::for_each_member(default_end_bit_f{result, decomposer}, decomposer, aggregate); return result; } struct digit_f { ::cuda::std::uint32_t& dst; ::cuda::std::uint32_t& dst_bit_start; ::cuda::std::uint32_t& src_bit_start; ::cuda::std::uint32_t& num_bits; template _CCCL_HOST_DEVICE void operator()(T& src) { 
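    // Fields of the decomposed aggregate are visited from the least to the
    // most significant. A field lying entirely below src_bit_start only
    // consumes offset; otherwise its overlapping bits are copied into dst and
    // the running dst/src bit offsets and remaining bit count are advanced.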
constexpr ::cuda::std::uint32_t src_size = sizeof(T) * 8; if (src_bit_start >= src_size) { src_bit_start -= src_size; } else { using traits = traits_t::type>; using bit_ordered_type = typename traits::bit_ordered_type; const ::cuda::std::uint32_t bits_to_copy = min(src_size - src_bit_start, num_bits); if (bits_to_copy) { bit_ordered_type ordered_src = BaseDigitExtractor::ProcessFloatMinusZero(reinterpret_cast(src)); const ::cuda::std::uint32_t mask = (1 << bits_to_copy) - 1; dst = dst | (((ordered_src >> src_bit_start) & mask) << dst_bit_start); num_bits -= bits_to_copy; dst_bit_start += bits_to_copy; } src_bit_start = 0; } } }; template _CCCL_HOST_DEVICE void digit(DecomposerT decomposer, ::cuda::std::uint32_t& dst, T& src, ::cuda::std::uint32_t& dst_bit_start, ::cuda::std::uint32_t& src_bit_start, ::cuda::std::uint32_t& num_bits) { detail::for_each_member(digit_f{dst, dst_bit_start, src_bit_start, num_bits}, decomposer, src); } template struct custom_digit_extractor_t { DecomposerT decomposer; ::cuda::std::uint32_t bit_start; ::cuda::std::uint32_t num_bits; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE custom_digit_extractor_t(DecomposerT decomposer, ::cuda::std::uint32_t bit_start, ::cuda::std::uint32_t num_bits) : decomposer(decomposer) , bit_start(bit_start) , num_bits(num_bits) {} template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ::cuda::std::uint32_t Digit(T& key) const { ::cuda::std::uint32_t result{}; ::cuda::std::uint32_t dst_bit_start{}; ::cuda::std::uint32_t src_bit_start = bit_start; ::cuda::std::uint32_t bits_remaining{num_bits}; digit(decomposer, result, key, dst_bit_start, src_bit_start, bits_remaining); return result; } }; struct custom_bit_conversion_policy_t { template static _CCCL_HOST_DEVICE T to_bit_ordered(DecomposerT decomposer, T val) { detail::radix::to_bit_ordered(decomposer, val); return val; } template static _CCCL_HOST_DEVICE T from_bit_ordered(DecomposerT decomposer, T val) { detail::radix::from_bit_ordered(decomposer, val); return val; } }; struct custom_bit_inversion_policy_t { template static _CCCL_HOST_DEVICE T inverse(DecomposerT decomposer, T val) { detail::radix::inverse(decomposer, val); return val; } }; template struct traits_t { using bit_ordered_type = T; using bit_ordered_conversion_policy = custom_bit_conversion_policy_t; using bit_ordered_inversion_policy = custom_bit_inversion_policy_t; template using digit_extractor_t = custom_digit_extractor_t; template static _CCCL_HOST_DEVICE bit_ordered_type min_raw_binary_key(DecomposerT decomposer) { T val{}; detail::radix::min_raw_binary_key(decomposer, val); return val; } template static _CCCL_HOST_DEVICE bit_ordered_type max_raw_binary_key(DecomposerT decomposer) { T val{}; detail::radix::max_raw_binary_key(decomposer, val); return val; } template static _CCCL_HOST_DEVICE int default_end_bit(DecomposerT decomposer) { T aggregate{}; return detail::radix::default_end_bit(decomposer, aggregate); } template static _CCCL_HOST_DEVICE digit_extractor_t digit_extractor(int begin_bit, int num_bits, DecomposerT decomposer) { return custom_digit_extractor_t(decomposer, begin_bit, num_bits); } }; } // namespace radix } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS //! 
Twiddling keys for radix sort template struct RadixSortTwiddle { private: using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; using bit_ordered_conversion_policy = typename traits::bit_ordered_conversion_policy; using bit_ordered_inversion_policy = typename traits::bit_ordered_inversion_policy; public: template static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE // bit_ordered_type In(bit_ordered_type key, DecomposerT decomposer = {}) { key = bit_ordered_conversion_policy::to_bit_ordered(decomposer, key); _CCCL_IF_CONSTEXPR (IS_DESCENDING) { key = bit_ordered_inversion_policy::inverse(decomposer, key); } return key; } template static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE // bit_ordered_type Out(bit_ordered_type key, DecomposerT decomposer = {}) { _CCCL_IF_CONSTEXPR (IS_DESCENDING) { key = bit_ordered_inversion_policy::inverse(decomposer, key); } key = bit_ordered_conversion_policy::from_bit_ordered(decomposer, key); return key; } template static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE // bit_ordered_type DefaultKey(DecomposerT decomposer = {}) { return IS_DESCENDING ? traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/specializations/000077500000000000000000000000001463375617100202545ustar00rootroot00000000000000cccl-2.5.0/cub/cub/block/specializations/block_histogram_atomic.cuh000066400000000000000000000063331463375617100254650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. 
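 * Each thread contributes its ITEMS_PER_THREAD samples by issuing one
 * atomicAdd per sample on the histogram bin counter selected by that sample's
 * value.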
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header CUB_NAMESPACE_BEGIN /** * @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. */ template struct BlockHistogramAtomic { /// Shared memory storage layout type struct TempStorage {}; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockHistogramAtomic(TempStorage& temp_storage) {} /** * @brief Composite data onto an existing histogram * * @param[in] items * Calling thread's input values to histogram * * @param[out] histogram * Reference to shared/device-accessible memory histogram */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS]) { // Update histogram #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { atomicAdd(histogram + items[i], 1); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/specializations/block_histogram_sort.cuh000066400000000000000000000200271463375617100251740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. 
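 * Samples are first sorted with BlockRadixSort; BlockDiscontinuity then flags
 * the boundaries between runs of identical bin values, and each run's length
 * is added to the corresponding histogram counter.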
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN /** * @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide * histograms from data samples partitioned across a CUDA thread block. * * @tparam T * Sample type * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam ITEMS_PER_THREAD * The number of samples per thread * * @tparam BINS * The number of bins into which histogram samples may fall * * @tparam BLOCK_DIM_Y * The thread block length in threads along the Y dimension * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective (unused) */ template struct BlockHistogramSort { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; // Parameterize BlockRadixSort type for our thread block typedef BlockRadixSort BlockRadixSortT; // Parameterize BlockDiscontinuity type for our thread block typedef BlockDiscontinuity BlockDiscontinuityT; /// Shared memory union _TempStorage { // Storage for sorting bin values typename BlockRadixSortT::TempStorage sort; struct Discontinuities { // Storage for detecting discontinuities in the tile of sorted bin values typename BlockDiscontinuityT::TempStorage flag; // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values unsigned int run_begin[BINS]; unsigned int run_end[BINS]; } discontinuities; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage& temp_storage; unsigned int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockHistogramSort(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} // Discontinuity functor struct DiscontinuityOp { // Reference to temp_storage _TempStorage& temp_storage; // Constructor _CCCL_DEVICE _CCCL_FORCEINLINE DiscontinuityOp(_TempStorage& temp_storage) : temp_storage(temp_storage) {} // Discontinuity predicate _CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(const T& a, const T& b, int b_index) { if (a != b) { // Note the begin/end offsets in shared storage temp_storage.discontinuities.run_begin[b] = b_index; temp_storage.discontinuities.run_end[a] = b_index; return true; } else { return false; } } }; /** * @brief Composite data onto an existing histogram * * @param[in] items * Calling thread's input values to histogram * * @param[out] histogram * Reference to shared/device-accessible memory histogram */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Composite(T (&items)[ITEMS_PER_THREAD], CounterT histogram[BINS]) { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Sort bytes in blocked arrangement BlockRadixSortT(temp_storage.sort).Sort(items); CTA_SYNC(); // Initialize the shared memory's run_begin and run_end for each bin int histo_offset = 0; #pragma unroll for (; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) { temp_storage.discontinuities.run_begin[histo_offset + linear_tid] = TILE_SIZE; temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } // 
Finish up with guarded initialization if necessary if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) { temp_storage.discontinuities.run_begin[histo_offset + linear_tid] = TILE_SIZE; temp_storage.discontinuities.run_end[histo_offset + linear_tid] = TILE_SIZE; } CTA_SYNC(); int flags[ITEMS_PER_THREAD]; // unused // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile DiscontinuityOp flag_op(temp_storage); BlockDiscontinuityT(temp_storage.discontinuities.flag).FlagHeads(flags, items, flag_op); // Update begin for first item if (linear_tid == 0) { temp_storage.discontinuities.run_begin[items[0]] = 0; } CTA_SYNC(); // Composite into histogram histo_offset = 0; #pragma unroll for (; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) { int thread_offset = histo_offset + linear_tid; CounterT count = temp_storage.discontinuities.run_end[thread_offset] - temp_storage.discontinuities.run_begin[thread_offset]; histogram[thread_offset] += count; } // Finish up with guarded composition if necessary if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) { int thread_offset = histo_offset + linear_tid; CounterT count = temp_storage.discontinuities.run_end[thread_offset] - temp_storage.discontinuities.run_begin[thread_offset]; histogram[thread_offset] += count; } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/specializations/block_reduce_raking.cuh000066400000000000000000000222331463375617100247330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread * block. Supports non-commutative reduction operators. 
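 * Each thread deposits its partial into a shared-memory raking grid; the
 * threads of a single warp then serially reduce (rake) consecutive segments of
 * that grid in order and finish with a warp-synchronous reduction.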
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread * block. Supports non-commutative reduction operators. * * Supports non-commutative binary reduction operators. Unlike commutative * reduction operators (e.g., addition), the application of a non-commutative * reduction operator (e.g, string concatenation) across a sequence of inputs must * honor the relative ordering of items and partial reductions when applying the * reduction operator. * * Compared to the implementation of BlockReduceRakingCommutativeOnly (which * does not support non-commutative operators), this implementation requires a * few extra rounds of inter-thread communication. * * @tparam T * Data type being reduced * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam BLOCK_DIM_Y * The thread block length in threads along the Y dimension * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct BlockReduceRaking { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Layout type for padded thread block raking grid typedef BlockRakingLayout BlockRakingLayout; /// WarpReduce utility type typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Constants enum { /// Number of raking threads RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, /// Number of raking elements per warp synchronous raking thread SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, /// Cooperative work can be entirely warp synchronous WARP_SYNCHRONOUS = (int(RAKING_THREADS) == int(BLOCK_THREADS)), /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of /// two WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, /// Whether or not accesses into smem are unguarded RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, }; /// Shared memory storage layout type union _TempStorage { /// Storage for warp-synchronous reduction typename WarpReduce::TempStorage warp_storage; /// Padded thread block raking grid typename BlockRakingLayout::TempStorage raking_grid; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage& temp_storage; unsigned int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockReduceRaking(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @param[in] reduction_op * Binary reduction operator * * @param[in] partial * [lane0 only] Warp-wide aggregate reduction of input items * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T RakingReduction( ReductionOp reduction_op, T* raking_segment, T partial, int num_valid, Int2Type /*iteration*/) { // Update partial if addend is in range if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) { T addend = raking_segment[ITERATION]; 
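      // Fold this segment element into the running partial in index order so
      // that non-commutative reduction operators observe operands in the
      // correct sequence.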
partial = reduction_op(partial, addend); } return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); } /** * @param[in] reduction_op * Binary reduction operator * * @param[in] partial * [lane0 only] Warp-wide aggregate reduction of input items * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T RakingReduction( ReductionOp /*reduction_op*/, T* /*raking_segment*/, T partial, int /*num_valid*/, Int2Type /*iteration*/) { return partial; } /** * @brief Computes a thread block-wide reduction using the specified reduction operator. The * first num_valid threads each contribute one reduction partial. The return value is * only valid for thread0. * * @param[in] partial * Calling thread's input partial reductions * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T partial, int num_valid, ReductionOp reduction_op) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) partial = WarpReduce(temp_storage.warp_storage).template Reduce(partial, num_valid, reduction_op); } else { // Place partial into shared memory grid. *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T* raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); partial = raking_segment[0]; partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); int valid_raking_threads = (IS_FULL_TILE) ? RAKING_THREADS : (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH; partial = WarpReduce(temp_storage.warp_storage) .template Reduce<(IS_FULL_TILE && RAKING_UNGUARDED)>(partial, valid_raking_threads, reduction_op); } } return partial; } /** * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. * The first num_valid threads each contribute one reduction partial. The return value is * only valid for thread0. * * @param[in] partial * Calling thread's input partial reductions * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T partial, int num_valid) { cub::Sum reduction_op; return Reduce(partial, num_valid, reduction_op); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh000066400000000000000000000202371463375617100304130ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across * a CUDA thread block. Does not support non-commutative reduction operators. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction * across a CUDA thread block. Does not support non-commutative reduction operators. Does not * support block sizes that are not a multiple of the warp size. 
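 *
 * Because the operator is assumed commutative, the raking threads keep their
 * own partials in registers; only the remaining BLOCK_THREADS - WARP_THREADS
 * threads stage partials through the shared-memory raking grid, which avoids a
 * shared-memory round trip that BlockReduceRaking requires.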
* * @tparam T * Data type being reduced * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam BLOCK_DIM_Y * The thread block length in threads along the Y dimension * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct BlockReduceRakingCommutativeOnly { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have // valid values typedef BlockReduceRaking FallBack; /// Constants enum { /// Number of warp threads WARP_THREADS = CUB_WARP_THREADS(0), /// Whether or not to use fall-back USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), /// Number of raking threads RAKING_THREADS = WARP_THREADS, /// Number of threads actually sharing items with the raking threads SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), /// Number of raking elements per warp synchronous raking thread SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, }; /// WarpReduce utility type typedef WarpReduce WarpReduce; /// Layout type for padded thread block raking grid typedef BlockRakingLayout BlockRakingLayout; /// Shared memory storage layout type union _TempStorage { struct DefaultStorage { /// Storage for warp-synchronous reduction typename WarpReduce::TempStorage warp_storage; /// Padded thread block raking grid typename BlockRakingLayout::TempStorage raking_grid; } default_storage; /// Fall-back storage for non-commutative block reduction typename FallBack::TempStorage fallback_storage; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage& temp_storage; unsigned int linear_tid; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockReduceRakingCommutativeOnly(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. * The first num_valid threads each contribute one reduction partial. * The return value is only valid for thread0. * * @param[in] partial * Calling thread's input partial reductions * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T partial, int num_valid) { if (USE_FALLBACK || !FULL_TILE) { return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); } else { // Place partial into shared memory grid if (linear_tid >= RAKING_THREADS) { *BlockRakingLayout::PlacementPtr(temp_storage.default_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; } CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T* raking_segment = BlockRakingLayout::RakingPtr(temp_storage.default_storage.raking_grid, linear_tid); partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); // Warp reduction partial = WarpReduce(temp_storage.default_storage.warp_storage).Sum(partial); } } return partial; } /** * @brief Computes a thread block-wide reduction using the specified reduction operator. * The first num_valid threads each contribute one reduction partial. * The return value is only valid for thread0. 
* * @param[in] partial * Calling thread's input partial reductions * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T partial, int num_valid, ReductionOp reduction_op) { if (USE_FALLBACK || !FULL_TILE) { return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); } else { // Place partial into shared memory grid if (linear_tid >= RAKING_THREADS) { *BlockRakingLayout::PlacementPtr(temp_storage.default_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; } CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T* raking_segment = BlockRakingLayout::RakingPtr(temp_storage.default_storage.raking_grid, linear_tid); partial = internal::ThreadReduce(raking_segment, reduction_op, partial); // Warp reduction partial = WarpReduce(temp_storage.default_storage.warp_storage).Reduce(partial, reduction_op); } } return partial; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/specializations/block_reduce_warp_reductions.cuh000066400000000000000000000225361463375617100266760ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction * across a CUDA thread block. Supports non-commutative reduction operators. 
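A hedged usage sketch reaching this warp-reductions back end through cub::BlockReduce with the BLOCK_REDUCE_WARP_REDUCTIONS algorithm named explicitly; the kernel name, the 256-thread block, and the max operator are illustrative assumptions:

#include <cub/block/block_reduce.cuh>

// Hypothetical partial-tile maximum over the first num_valid threads of a 256-thread block.
__global__ void BlockMaxKernel(const int* d_in, int* d_out, int num_valid)
{
  using BlockReduce = cub::BlockReduce<int, 256, cub::BLOCK_REDUCE_WARP_REDUCTIONS>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  // Guard the load; threads beyond num_valid do not contribute to the reduction anyway.
  int thread_data = (threadIdx.x < num_valid) ? d_in[threadIdx.x] : 0;

  // Only the first num_valid inputs participate; the result is valid in thread0 only.
  int block_max = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);

  if (threadIdx.x == 0)
  {
    *d_out = block_max;
  }
}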
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN /** * @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction * across a CUDA thread block. Supports non-commutative reduction operators. * @tparam T * Data type being reduced * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam BLOCK_DIM_Y * The thread block length in threads along the Y dimension * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct BlockReduceWarpReductions { /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of warp threads WARP_THREADS = CUB_WARP_THREADS(0), /// Number of active warps WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, /// The logical warp size for warp reductions LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), /// Whether or not the logical warp size evenly divides the thread block size EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) }; /// WarpReduce utility type typedef typename WarpReduce::InternalWarpReduce WarpReduce; /// Shared memory storage layout type struct _TempStorage { /// Buffer for warp-synchronous reduction typename WarpReduce::TempStorage warp_reduce[WARPS]; /// Shared totals from each warp-synchronous reduction T warp_aggregates[WARPS]; /// Shared prefix for the entire thread block T block_prefix; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; // Thread fields _TempStorage& temp_storage; int linear_tid; int warp_id; int lane_id; /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockReduceWarpReductions(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) , lane_id(LaneId()) {} /** * @param[in] reduction_op * Binary reduction operator * * @param[in] warp_aggregate * [lane0 only] Warp-wide aggregate reduction of input items * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ApplyWarpAggregates( ReductionOp reduction_op, T warp_aggregate, int num_valid, Int2Type /*successor_warp*/) { if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) { T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; warp_aggregate = reduction_op(warp_aggregate, addend); } return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); } /** * @param[in] reduction_op * Binary reduction operator * * @param[in] warp_aggregate * [lane0 only] Warp-wide aggregate reduction of input items * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ApplyWarpAggregates( ReductionOp /*reduction_op*/, T warp_aggregate, int /*num_valid*/, Int2Type /*successor_warp*/) { return warp_aggregate; } /** * @brief Returns block-wide aggregate in thread0. 
* * @param[in] reduction_op * Binary reduction operator * * @param[in] warp_aggregate * [lane0 only] Warp-wide aggregate reduction of input items * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ApplyWarpAggregates(ReductionOp reduction_op, T warp_aggregate, int num_valid) { // Share lane aggregates if (lane_id == 0) { detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id, warp_aggregate); } CTA_SYNC(); // Update total aggregate in warp 0, lane 0 if (linear_tid == 0) { warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); } return warp_aggregate; } /** * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. * The first num_valid threads each contribute one reduction partial. The return value is * only valid for thread0. * * @param[in] input * Calling thread's input partial reductions * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int num_valid) { cub::Sum reduction_op; int warp_offset = (warp_id * LOGICAL_WARP_SIZE); int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? LOGICAL_WARP_SIZE : num_valid - warp_offset; // Warp reduction in every warp T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]) .template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(input, warp_num_valid, cub::Sum()); // Update outputs and block_aggregate with warp-wide aggregates from lane-0s return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } /** * @brief Computes a thread block-wide reduction using the specified reduction operator. * The first num_valid threads each contribute one reduction partial. * The return value is only valid for thread0. * * @param[in] input * Calling thread's input partial reductions * * @param[in] num_valid * Number of valid elements (may be less than BLOCK_THREADS) * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, int num_valid, ReductionOp reduction_op) { int warp_offset = warp_id * LOGICAL_WARP_SIZE; int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? LOGICAL_WARP_SIZE : num_valid - warp_offset; // Warp reduction in every warp T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]) .template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>(input, warp_num_valid, reduction_op); // Update outputs and block_aggregate with warp-wide aggregates from lane-0s return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/specializations/block_scan_raking.cuh000066400000000000000000000641751463375617100244230ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a * CUDA thread block. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA * thread block. 
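A hedged sketch of reaching this raking specialization through the public cub::BlockScan interface (the memoizing variant is selected with cub::BLOCK_SCAN_RAKING_MEMOIZE instead); the kernel name, the 128-thread block, and the in-place update are illustrative assumptions:

#include <cub/block/block_scan.cuh>

// Hypothetical exclusive prefix sum across one 128-thread block, computed in place.
__global__ void BlockPrefixSumKernel(int* d_data)
{
  using BlockScan = cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING>;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int thread_data = d_data[threadIdx.x];

  // Each thread receives the sum of all preceding threads' inputs (thread0 gets 0).
  BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);

  d_data[threadIdx.x] = thread_data;
}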
* * @tparam T * Data type being scanned * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam BLOCK_DIM_Y * The thread block length in threads along the Y dimension * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension * * @tparam MEMOIZE * Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the * expense of higher register pressure * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct BlockScanRaking { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /// Layout type for padded thread block raking grid typedef BlockRakingLayout BlockRakingLayout; /// Constants enum { /// Number of raking threads RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, /// Number of raking elements per warp synchronous raking thread SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, /// Cooperative work can be entirely warp synchronous WARP_SYNCHRONOUS = (int(BLOCK_THREADS) == int(RAKING_THREADS)), }; /// WarpScan utility type typedef WarpScan WarpScan; /// Shared memory storage layout type struct _TempStorage { /// Buffer for warp-synchronous scan typename WarpScan::TempStorage warp_scan; /// Padded thread block raking grid typename BlockRakingLayout::TempStorage raking_grid; /// Block aggregate T block_aggregate; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- // Thread fields _TempStorage& temp_storage; unsigned int linear_tid; T cached_segment[SEGMENT_LENGTH]; //--------------------------------------------------------------------- // Utility methods //--------------------------------------------------------------------- /** * @brief Templated reduction * * @param[in] raking_ptr * Input array * * @param[in] scan_op * Binary reduction operator * * @param[in] raking_partial * Prefix to seed reduction with */ template _CCCL_DEVICE _CCCL_FORCEINLINE T GuardedReduce(T* raking_ptr, ScanOp scan_op, T raking_partial, Int2Type /*iteration*/) { if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) { T addend = raking_ptr[ITERATION]; raking_partial = scan_op(raking_partial, addend); } return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); } /** * @brief Templated reduction (base case) * * @param[in] raking_ptr * Input array * * @param[in] scan_op * Binary reduction operator * * @param[in] raking_partial * Prefix to seed reduction with */ template _CCCL_DEVICE _CCCL_FORCEINLINE T GuardedReduce(T* /*raking_ptr*/, ScanOp /*scan_op*/, T raking_partial, Int2Type /*iteration*/) { return raking_partial; } /** * @brief Templated copy * * @param out * [out] Out array * * @param in * [in] Input array */ template _CCCL_DEVICE _CCCL_FORCEINLINE void CopySegment(T* out, T* in, Int2Type /*iteration*/) { out[ITERATION] = in[ITERATION]; CopySegment(out, in, Int2Type()); } /** * @brief Templated copy (base case) * * @param[out] out * Out array * * @param[in] in * Input array */ _CCCL_DEVICE _CCCL_FORCEINLINE void CopySegment(T* /*out*/, T* /*in*/, 
Int2Type /*iteration*/) {} /// Performs upsweep raking reduction, returning the aggregate template _CCCL_DEVICE _CCCL_FORCEINLINE T Upsweep(ScanOp scan_op) { T* smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); // Read data into registers CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); T raking_partial = cached_segment[0]; return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>()); } /// Performs exclusive downsweep raking scan template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix = true) { T* smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); // Read data back into registers if (!MEMOIZE) { CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); } internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); // Write data back to smem CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); } /// Performs inclusive downsweep raking scan template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveDownsweep(ScanOp scan_op, T raking_partial, bool apply_prefix = true) { T* smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); // Read data back into registers if (!MEMOIZE) { CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); } internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); // Write data back to smem CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); } //--------------------------------------------------------------------- // Constructors //--------------------------------------------------------------------- /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockScanRaking(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //--------------------------------------------------------------------- // Exclusive scans //--------------------------------------------------------------------- /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. With no initial value, * the output computed for thread0 is undefined. 
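Because the no-initial-value overloads leave thread0's exclusive output undefined, callers that need a defined value typically seed the scan with an identity for their operator. A hedged sketch through the public cub::BlockScan interface; the kernel name, block size, and max operator are illustrative assumptions:

#include <climits>

#include <cub/block/block_scan.cuh>

// Hypothetical exclusive max-scan seeded with INT_MIN so that thread0's output is well defined.
__global__ void BlockMaxScanKernel(int* d_data)
{
  using BlockScan = cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING>;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int thread_data = d_data[threadIdx.x];

  BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());

  d_data[threadIdx.x] = thread_data;
}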
* * @param[in] input * Calling thread's input item * * @param[out] exclusive_output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } CTA_SYNC(); // Grab thread prefix from shared memory exclusive_output = *placement_ptr; } } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. * * @param[in] input * Calling thread's input items * * @param[out] output * Calling thread's output items (may be aliased to \p input) * * @param[in] initial_value * Initial value to seed the exclusive scan * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, const T& initial_value, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Exclusive Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial); } CTA_SYNC(); // Grab exclusive partial from shared memory output = *placement_ptr; } } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. Also provides every * thread with the block-wide \p block_aggregate of all inputs. With no initial value, * the output computed for thread0 is undefined. 
* * @param[in] input * Calling thread's input item * * @param[out] output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T inclusive_partial; T exclusive_partial; WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); // Broadcast aggregate to all threads if (linear_tid == RAKING_THREADS - 1) { temp_storage.block_aggregate = inclusive_partial; } } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; // Retrieve block aggregate block_aggregate = temp_storage.block_aggregate; } } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. Also provides every * thread with the block-wide \p block_aggregate of all inputs. * * @param[in] input * Calling thread's input items * * @param[out] output * Calling thread's output items (may be aliased to \p input) * * @param[in] initial_value * Initial value to seed the exclusive scan * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, const T& initial_value, ScanOp scan_op, T& block_aggregate) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan) .ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, exclusive_partial); // Broadcast aggregate to other threads if (linear_tid == 0) { temp_storage.block_aggregate = block_aggregate; } } CTA_SYNC(); // Grab exclusive partial from shared memory output = *placement_ptr; // Retrieve block aggregate block_aggregate = temp_storage.block_aggregate; } } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. 
the call-back functor \p * block_prefix_callback_op is invoked by the first warp in the block, and the value * returned by lane0 in that warp is used as the "seed" value that * logically prefixes the thread block's scan inputs. Also provides every thread with * the block-wide \p block_aggregate of all inputs. * * @param[in] input * Calling thread's input item * * @param[out] output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[in-out] block_prefix_callback_op * [warp0 only] Call-back functor for specifying a thread * block-wide prefix to be applied to all inputs. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan T block_aggregate; WarpScan warp_scan(temp_storage.warp_scan); warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); // Obtain warp-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); output = scan_op(block_prefix, output); if (linear_tid == 0) { output = block_prefix; } } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { WarpScan warp_scan(temp_storage.warp_scan); // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial, block_aggregate; warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); // Obtain block-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); // Update prefix with warpscan exclusive partial T downsweep_prefix = scan_op(block_prefix, exclusive_partial); if (linear_tid == 0) { downsweep_prefix = block_prefix; } // Exclusive raking downsweep scan ExclusiveDownsweep(scan_op, downsweep_prefix); } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; } } //--------------------------------------------------------------------- // Inclusive scans //--------------------------------------------------------------------- /** * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. 
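The block_prefix_callback_op mentioned above is ordinarily a small stateful functor whose operator() receives the tile-wide aggregate and returns the running prefix accumulated so far. A hedged sketch that mirrors the pattern the public BlockScan documentation uses; the int payload and member names are illustrative assumptions:

// Hypothetical running-prefix functor for scanning consecutive tiles with one thread block.
struct RunningPrefixOp
{
  int running_total;

  __device__ RunningPrefixOp(int initial_total)
      : running_total(initial_total)
  {}

  // Invoked by the first warp of the block; the value returned by lane0
  // logically prefixes the scan of the current tile.
  __device__ int operator()(int block_aggregate)
  {
    int old_prefix = running_total;
    running_total += block_aggregate;
    return old_prefix;
  }
};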
* * @param[in] input * Calling thread's input item * * @param[out] output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Exclusive Warp-synchronous scan T exclusive_partial; WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); // Inclusive raking downsweep scan InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; } } /** * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. Also provides every * thread with the block-wide \p block_aggregate of all inputs. * * @param[in] input * Calling thread's input item * * @param[out] output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T inclusive_partial; T exclusive_partial; WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); // Inclusive raking downsweep scan InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); // Broadcast aggregate to all threads if (linear_tid == RAKING_THREADS - 1) { temp_storage.block_aggregate = inclusive_partial; } } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; // Retrieve block aggregate block_aggregate = temp_storage.block_aggregate; } } /** * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. the call-back functor \p * block_prefix_callback_op is invoked by the first warp in the block, and the value * returned by lane0 in that warp is used as the "seed" value that * logically prefixes the thread block's scan inputs. Also provides every thread with * the block-wide \p block_aggregate of all inputs. 
* * @param[in] input * Calling thread's input item * * @param[out] output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[in-out] block_prefix_callback_op * [warp0 only] Call-back functor for specifying a thread * block-wide prefix to be applied to all inputs. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp-synchronous scan T block_aggregate; WarpScan warp_scan(temp_storage.warp_scan); warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); // Obtain warp-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); // Update prefix with exclusive warpscan partial output = scan_op(block_prefix, output); } else { // Place thread partial into shared memory raking grid T* placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); detail::uninitialized_copy(placement_ptr, input); CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { WarpScan warp_scan(temp_storage.warp_scan); // Raking upsweep reduction across shared partials T upsweep_partial = Upsweep(scan_op); // Warp-synchronous scan T exclusive_partial, block_aggregate; warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); // Obtain block-wide prefix in lane0, then broadcast to other lanes T block_prefix = block_prefix_callback_op(block_aggregate); block_prefix = warp_scan.Broadcast(block_prefix, 0); // Update prefix with warpscan exclusive partial T downsweep_prefix = scan_op(block_prefix, exclusive_partial); if (linear_tid == 0) { downsweep_prefix = block_prefix; } // Inclusive raking downsweep scan InclusiveDownsweep(scan_op, downsweep_prefix); } CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/block/specializations/block_scan_warp_scans.cuh000066400000000000000000000452211463375617100252770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN /** * @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA * thread block. * * @tparam BLOCK_DIM_X * The thread block length in threads along the X dimension * * @tparam BLOCK_DIM_Y * The thread block length in threads along the Y dimension * * @tparam BLOCK_DIM_Z * The thread block length in threads along the Z dimension * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct BlockScanWarpScans { //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// Constants enum { /// Number of warp threads WARP_THREADS = CUB_WARP_THREADS(0), /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, /// Number of active warps WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, }; /// WarpScan utility type typedef WarpScan WarpScanT; /// WarpScan utility type typedef WarpScan WarpAggregateScan; /// Shared memory storage layout type struct __align__(32) _TempStorage { T warp_aggregates[WARPS]; /// Buffer for warp-synchronous scans typename WarpScanT::TempStorage warp_scan[WARPS]; /// Shared prefix for the entire thread block T block_prefix; }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; //--------------------------------------------------------------------- // Per-thread fields //--------------------------------------------------------------------- // Thread fields _TempStorage& temp_storage; unsigned int linear_tid; unsigned int warp_id; unsigned int lane_id; //--------------------------------------------------------------------- // Constructors //--------------------------------------------------------------------- /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE BlockScanWarpScans(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) , warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS) , lane_id(LaneId()) {} //--------------------------------------------------------------------- // Utility methods //--------------------------------------------------------------------- /** * @param[out] warp_prefix * The calling thread's partial reduction * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ApplyWarpAggregates(T& warp_prefix, ScanOp scan_op, T& block_aggregate, Int2Type /*addend_warp*/) { if (warp_id == WARP) { warp_prefix = block_aggregate; } T addend = temp_storage.warp_aggregates[WARP]; block_aggregate = scan_op(block_aggregate, addend); ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); } /** * @param[out] warp_prefix * The calling thread's partial reduction * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregat * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ApplyWarpAggregates(T& /*warp_prefix*/, ScanOp /*scan_op*/, T& /*block_aggregate*/, Int2Type /*addend_warp*/) {} /** * @brief Use the warp-wide aggregates to compute the calling warp's prefix. Also returns * block-wide aggregate in all threads. * * @param[in] scan_op * Binary scan operator * * @param[in] warp_aggregate * [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of * input items * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T& block_aggregate) { // Last lane in each warp shares its warp-aggregate if (lane_id == WARP_THREADS - 1) { detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id, warp_aggregate); } CTA_SYNC(); // Accumulate block aggregates and save the one that is our warp's prefix T warp_prefix; block_aggregate = temp_storage.warp_aggregates[0]; // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); /* #pragma unroll for (int WARP = 1; WARP < WARPS; ++WARP) { if (warp_id == WARP) warp_prefix = block_aggregate; T addend = temp_storage.warp_aggregates[WARP]; block_aggregate = scan_op(block_aggregate, addend); } */ return warp_prefix; } /** * @brief Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. * Also returns block-wide aggregate in all threads. * * @param[in] scan_op * Binary scan operator * * @param[in] warp_aggregate * [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of * input items * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items * * @param[in] initial_value * Initial value to seed the exclusive scan */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T& block_aggregate, const T& initial_value) { T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); warp_prefix = scan_op(initial_value, warp_prefix); if (warp_id == 0) { warp_prefix = initial_value; } return warp_prefix; } //--------------------------------------------------------------------- // Exclusive scans //--------------------------------------------------------------------- /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. 
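For orientation, a hedged sketch of a kernel that reaches this warp-scans specialization through the public front end by naming cub::BLOCK_SCAN_WARP_SCANS explicitly; the kernel name, the 256-thread block, and the in-place update are illustrative assumptions:

#include <cub/block/block_scan.cuh>

// Hypothetical inclusive prefix sum across one 256-thread block, computed in place.
__global__ void BlockInclusiveSumKernel(int* d_data)
{
  using BlockScan = cub::BlockScan<int, 256, cub::BLOCK_SCAN_WARP_SCANS>;
  __shared__ typename BlockScan::TempStorage temp_storage;

  int thread_data = d_data[threadIdx.x];

  BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);

  d_data[threadIdx.x] = thread_data;
}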
With no initial value, * the output computed for thread0 is undefined. * * @param[in] input * Calling thread's input item * * @param[out] exclusive_output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op) { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. T block_aggregate; ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. * * @param[in] input * Calling thread's input items * * @param[out] exclusive_output * Calling thread's output items (may be aliased to \p input) * * @param[in] initial_value * Initial value to seed the exclusive scan * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, const T& initial_value, ScanOp scan_op) { T block_aggregate; ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. Also provides every * thread with the block-wide \p block_aggregate of all inputs. With no initial value, * the output computed for thread0 is undefined. * * @param[in] input * Calling thread's input item * * @param[out] exclusive_output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op, T& block_aggregate) { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. T inclusive_output; WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); // Apply warp prefix to our lane's partial if (warp_id != 0) { exclusive_output = scan_op(warp_prefix, exclusive_output); if (lane_id == 0) { exclusive_output = warp_prefix; } } } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. Also provides every * thread with the block-wide \p block_aggregate of all inputs. * * @param[in] input * Calling thread's input items * * @param[out] exclusive_output * Calling thread's output items (may be aliased to \p input) * * @param[in] initial_value * Initial value to seed the exclusive scan * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, const T& initial_value, ScanOp scan_op, T& block_aggregate) { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
T inclusive_output; WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); // Compute the warp-wide prefix and block-wide aggregate for each warp T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); // Apply warp prefix to our lane's partial exclusive_output = scan_op(warp_prefix, exclusive_output); if (lane_id == 0) { exclusive_output = warp_prefix; } } /** * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. the call-back functor \p * block_prefix_callback_op is invoked by the first warp in the block, and the value * returned by lane0 in that warp is used as the "seed" value that * logically prefixes the thread block's scan inputs. Also provides every thread with * the block-wide \p block_aggregate of all inputs. * * @param[in] input * Calling thread's input item * * @param[out] exclusive_output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[in-out] block_prefix_callback_op * [warp0 only] Call-back functor for specifying a thread * block-wide prefix to be applied to all inputs. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. T block_aggregate; ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); // Use the first warp to determine the thread block prefix, returning the result in lane0 if (warp_id == 0) { T block_prefix = block_prefix_callback_op(block_aggregate); if (lane_id == 0) { // Share the prefix with all threads detail::uninitialized_copy(&temp_storage.block_prefix, block_prefix); exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 } } CTA_SYNC(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; if (linear_tid > 0) { exclusive_output = scan_op(block_prefix, exclusive_output); } } //--------------------------------------------------------------------- // Inclusive scans //--------------------------------------------------------------------- /** * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. * * @param[in] input * Calling thread's input item * * @param[out] inclusive_output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op) { T block_aggregate; InclusiveScan(input, inclusive_output, scan_op, block_aggregate); } /** * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. Also provides every * thread with the block-wide \p block_aggregate of all inputs. 
* * @param[in] input * Calling thread's input item * * @param[out] inclusive_output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[out] block_aggregate * Threadblock-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op, T& block_aggregate) { WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); // Apply warp prefix to our lane's partial if (warp_id != 0) { inclusive_output = scan_op(warp_prefix, inclusive_output); } } /** * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p * scan_op functor. Each thread contributes one input element. the call-back functor \p * block_prefix_callback_op is invoked by the first warp in the block, and the value * returned by lane0 in that warp is used as the "seed" value that * logically prefixes the thread block's scan inputs. Also provides every thread with * the block-wide \p block_aggregate of all inputs. * * @param[in] input * Calling thread's input item * * @param[out] exclusive_output * Calling thread's output item (may be aliased to \p input) * * @param[in] scan_op * Binary scan operator * * @param[in-out] block_prefix_callback_op * [warp0 only] Call-back functor for specifying a thread * block-wide prefix to be applied to all inputs. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& exclusive_output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { T block_aggregate; InclusiveScan(input, exclusive_output, scan_op, block_aggregate); // Use the first warp to determine the thread block prefix, returning the result in lane0 if (warp_id == 0) { T block_prefix = block_prefix_callback_op(block_aggregate); if (lane_id == 0) { // Share the prefix with all threads detail::uninitialized_copy(&temp_storage.block_prefix, block_prefix); } } CTA_SYNC(); // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; exclusive_output = scan_op(block_prefix, exclusive_output); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/cmake/000077500000000000000000000000001463375617100150415ustar00rootroot00000000000000cccl-2.5.0/cub/cub/cmake/cub-config-version.cmake000066400000000000000000000014711463375617100215450ustar00rootroot00000000000000# Parse version information from version.cuh: include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake") set(CUB_VERSION_MAJOR 2) set(CUB_VERSION_MINOR 5) set(CUB_VERSION_PATCH 0) set(CUB_VERSION_TWEAK 0) set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}") set(PACKAGE_VERSION ${CUB_VERSION}) set(PACKAGE_VERSION_COMPATIBLE FALSE) set(PACKAGE_VERSION_EXACT FALSE) set(PACKAGE_VERSION_UNSUITABLE FALSE) if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION) if(CUB_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND CUB_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR) set(PACKAGE_VERSION_COMPATIBLE TRUE) endif() if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION) set(PACKAGE_VERSION_EXACT TRUE) endif() endif() cccl-2.5.0/cub/cub/cmake/cub-config.cmake000066400000000000000000000111341463375617100200570ustar00rootroot00000000000000# # find_package(CUB) config file. 
# # Defines a CUB::CUB target that may be linked from user projects to include # CUB. if (TARGET CUB::CUB) return() endif() # Minimum supported libcudacxx version: set(cub_libcudacxx_version "${CUB_VERSION}") function(_cub_declare_interface_alias alias_name ugly_name) # 1) Only IMPORTED and ALIAS targets can be placed in a namespace. # 2) When an IMPORTED library is linked to another target, its include # directories are treated as SYSTEM includes. # 3) nvcc will automatically check the CUDA Toolkit include path *before* the # system includes. This means that the Toolkit CUB will *always* be used # during compilation, and the include paths of an IMPORTED CUB::CUB # target will never have any effect. # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED # on EVERY target that links to CUB::CUB. This would be a burden and a # footgun for our users. Forgetting this would silently pull in the wrong CUB! # 5) A workaround is to make a non-IMPORTED library outside of the namespace, # configure it, and then ALIAS it into the namespace (or ALIAS and then # configure, that seems to work too). add_library(${ugly_name} INTERFACE) add_library(${alias_name} ALIAS ${ugly_name}) endfunction() # # Setup some internal cache variables # # Pull in the include dir detected by cub-config-version.cmake set(_CUB_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE INTERNAL "Location of CUB headers." FORCE ) unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) set(_CUB_QUIET ON CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) set(_CUB_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE) else() set(_CUB_QUIET OFF CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) set(_CUB_QUIET_FLAG "" CACHE INTERNAL "" FORCE) endif() # # Setup dependencies # if (NOT TARGET CUB::libcudacxx) if (TARGET Thrust::libcudacxx) # Prefer the same libcudacxx as Thrust, if available: _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) target_link_libraries(_CUB_libcudacxx INTERFACE Thrust::libcudacxx) else() if (NOT TARGET libcudacxx::libcudacxx) # First do a non-required search for any co-packaged versions. # These are preferred. find_package(libcudacxx ${cub_libcudacxx_version} CONFIG ${_CUB_QUIET_FLAG} NO_DEFAULT_PATH # Only check the explicit HINTS below: HINTS "${_CUB_INCLUDE_DIR}/../libcudacxx" # Source layout "${_CUB_CMAKE_DIR}/.." # Install layout ) # A second required search allows externally packaged to be used and fails if # no suitable package exists. 
find_package(libcudacxx ${cub_libcudacxx_version} CONFIG REQUIRED ${_CUB_QUIET_FLAG} ) endif() _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx) endif() endif() # # Setup targets # _cub_declare_interface_alias(CUB::CUB _CUB_CUB) target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}") target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx) if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API") endif() if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR THRUST_IGNORE_DEPRECATED_CPP_DIALECT) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT") endif() if (CUB_IGNORE_DEPRECATED_CPP_11 OR THRUST_IGNORE_DEPRECATED_CPP_11) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11") endif() if (CUB_IGNORE_DEPRECATED_COMPILER OR THRUST_IGNORE_DEPRECATED_COMPILER) target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER") endif() # # Standardize version info # set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE) set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE) set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE) set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE) set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE) set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE) include(FindPackageHandleStandardArgs) if (NOT CUB_CONFIG) set(CUB_CONFIG "${CMAKE_CURRENT_LIST_FILE}") endif() find_package_handle_standard_args(CUB CONFIG_MODE) cccl-2.5.0/cub/cub/cmake/cub-header-search.cmake000066400000000000000000000005521463375617100213070ustar00rootroot00000000000000# Parse version information from version.h in source tree set(_CUB_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..") if(EXISTS "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh") set(_CUB_VERSION_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) endif() cccl-2.5.0/cub/cub/cmake/cub-header-search.cmake.in000066400000000000000000000010731463375617100217130ustar00rootroot00000000000000# Parse version information from version.h: unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search # Find CMAKE_INSTALL_INCLUDEDIR=@CMAKE_INSTALL_INCLUDEDIR@ directory" set(from_install_prefix "@from_install_prefix@") find_path(_CUB_VERSION_INCLUDE_DIR cub/version.cuh NO_CMAKE_FIND_ROOT_PATH # Don't allow CMake to re-root the search NO_DEFAULT_PATH # Only search explicit paths below: PATHS "${CMAKE_CURRENT_LIST_DIR}/${from_install_prefix}/@CMAKE_INSTALL_INCLUDEDIR@" ) set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) cccl-2.5.0/cub/cub/config.cuh000066400000000000000000000044521463375617100157340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Static configuration header for the CUB project. */ #pragma once // For _CCCL_IMPLICIT_SYSTEM_HEADER #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include cccl-2.5.0/cub/cub/cub.cuh000066400000000000000000000111071463375617100152330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * \file * CUB umbrella include file */ #pragma once // Static configuration #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header // Block #include #include #include #include #include #include #include #include #include #include #include // #include // Device #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // Grid // #include #include #include #include // Thread #include #include #include #include #include // Warp #include #include #include #include #include #include // Iterator #include #include #include #include #include #include #include #include #include // Util #include #include #include #include #include #include cccl-2.5.0/cub/cub/detail/000077500000000000000000000000001463375617100152235ustar00rootroot00000000000000cccl-2.5.0/cub/cub/detail/choose_offset.cuh000066400000000000000000000106251463375617100205560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN namespace detail { /** * choose_offset checks NumItemsT, the type of the num_items parameter, and * selects the offset type based on it. */ template struct choose_offset { // NumItemsT must be an integral type (but not bool). 
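  // Illustrative mapping (a sketch assuming the usual CUB behavior of widening
  // to an unsigned offset type wide enough for NumItemsT):
  //   choose_offset_t<std::int32_t>  -> uint32_t
  //   choose_offset_t<std::uint64_t> -> unsigned long long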
static_assert(::cuda::std::is_integral::value && !::cuda::std::is_same::type, bool>::value, "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. using type = typename ::cuda::std::conditional::type; }; /** * choose_offset_t is an alias template that checks NumItemsT, the type of the num_items parameter, and * selects the offset type based on it. */ template using choose_offset_t = typename choose_offset::type; /** * promote_small_offset checks NumItemsT, the type of the num_items parameter, and * promotes any integral type smaller than 32 bits to a signed 32-bit integer type. */ template struct promote_small_offset { // NumItemsT must be an integral type (but not bool). static_assert(::cuda::std::is_integral::value && !::cuda::std::is_same::type, bool>::value, "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. using type = typename ::cuda::std::conditional::type; }; /** * promote_small_offset_t is an alias template that checks NumItemsT, the type of the num_items parameter, and * promotes any integral type smaller than 32 bits to a signed 32-bit integer type. */ template using promote_small_offset_t = typename promote_small_offset::type; /** * common_iterator_value sets member type to the common_type of * value_type for all argument types. used to get OffsetT in * DeviceSegmentedReduce. */ template struct common_iterator_value { using type = ::cuda::std::__common_type_t<::cuda::std::__iter_value_type...>; }; template using common_iterator_value_t = typename common_iterator_value::type; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/detail/detect_cuda_runtime.cuh000066400000000000000000000076231463375617100217430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * Utilities for CUDA dynamic parallelism. */ #pragma once // We cannot use `cub/config.cuh` here due to circular dependencies #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: /** * \def CUB_DISABLE_CDP * * If defined, support for device-side usage of CUB is disabled. */ # define CUB_DISABLE_CDP /** * \def CUB_RDC_ENABLED * * Defined if RDC is enabled and CUB_DISABLE_CDP is not defined. */ # define CUB_RDC_ENABLED /** * \def CUB_RUNTIME_FUNCTION * * Execution space for functions that can use the CUDA runtime API (`__host__` * when RDC is off, `__host__ __device__` when RDC is on). */ # define CUB_RUNTIME_FUNCTION /** * \def CUB_RUNTIME_ENABLED * * Whether or not the active compiler pass is allowed to invoke device kernels * or methods from the CUDA runtime API. * * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__` * and is not compatible with `NV_IF_TARGET`. It is provided for legacy * purposes only. * * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`. */ # define CUB_RUNTIME_ENABLED #else // Non-doxygen pass: # ifndef CUB_RUNTIME_FUNCTION # if defined(__CUDACC_RDC__) && !defined(CUB_DISABLE_CDP) # define CUB_RDC_ENABLED # define CUB_RUNTIME_FUNCTION _CCCL_HOST_DEVICE # else // RDC disabled: # define CUB_RUNTIME_FUNCTION _CCCL_HOST # endif // RDC enabled # if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) // Legacy only -- do not use in new code. # define CUB_RUNTIME_ENABLED # endif # endif // CUB_RUNTIME_FUNCTION predefined # ifdef CUB_RDC_ENABLED // Detect available version of CDP: # if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED) # define CUB_DETAIL_CDPv1 # else # define CUB_DETAIL_CDPv2 # endif # endif #endif // Do not document cccl-2.5.0/cub/cub/detail/device_double_buffer.cuh000066400000000000000000000055421463375617100220540ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include CUB_NAMESPACE_BEGIN namespace detail { /** * @brief It's a double-buffer storage wrapper for multi-pass stream * transformations that require more than one storage array for * streaming intermediate results back and forth. * * Many multi-pass computations require a pair of "ping-pong" storage buffers * (e.g., one for reading from and the other for writing to, and then * vice-versa for the subsequent pass). 
This structure wraps a set of device * buffers. * * Unlike `cub::DoubleBuffer` this class doesn't provide a "selector" member * to track which buffer is "current". The main reason for this class existence * is the performance difference. Since `cub::DoubleBuffer` relies on the * runtime variable to index pointers arrays, they are placed in the local * memory instead of registers. Local memory accesses significantly affect * performance. On the contrary, this class swaps pointer, so all operations * can be performed in registers. */ template class device_double_buffer { /// Pair of device buffer pointers T* m_current_buffer{}; T* m_alternate_buffer{}; public: /** * @param d_current * The currently valid buffer * * @param d_alternate * Alternate storage buffer of the same size as @p d_current */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE device_double_buffer(T* current, T* alternate) : m_current_buffer(current) , m_alternate_buffer(alternate) {} /// \brief Return pointer to the currently valid buffer _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T* current() const { return m_current_buffer; } /// \brief Return pointer to the currently invalid buffer _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T* alternate() const { return m_alternate_buffer; } _CCCL_HOST_DEVICE void swap() { T* tmp = m_current_buffer; m_current_buffer = m_alternate_buffer; m_alternate_buffer = tmp; } }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/detail/device_synchronize.cuh000066400000000000000000000037041463375617100216220ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN namespace detail { /** * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and * CUDA configuration. 
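 *
 * Minimal usage sketch (illustrative only):
 *
 *   cudaError_t error = cub::detail::device_synchronize();
 *   // In device code without CDPv1 support, `error` remains
 *   // `cudaErrorNotSupported` (see the implementation below).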
*/ _CCCL_EXEC_CHECK_DISABLE CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() { cudaError_t result = cudaErrorNotSupported; // Device-side sync is only available under CDPv1: #if defined(CUB_DETAIL_CDPv1) # if ((__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) // CUDA >= 11.6 # define CUB_TMP_DEVICE_SYNC_IMPL result = __cudaDeviceSynchronizeDeprecationAvoidance(); # else // CUDA < 11.6: # define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); # endif #else // CDPv2 or no CDP: # define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */ #endif // CDP version NV_IF_TARGET(NV_IS_HOST, (result = cudaDeviceSynchronize();), (CUB_TMP_DEVICE_SYNC_IMPL)); #undef CUB_TMP_DEVICE_SYNC_IMPL return result; } } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/detail/nvtx.cuh000066400000000000000000000112641463375617100167270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header // Enable the functionality of this header if // * The NVTX3 C API is available in CTK // * NVTX is not explicitly disabled // * C++14 is availabl for cuda::std::optional #if __has_include() && !defined(NVTX_DISABLE) && _CCCL_STD_VER >= 2014 // Include our NVTX3 C++ wrapper if not available from the CTK # if __has_include() // TODO(bgruber): replace by a check for the first CTK version shipping the header # include # else // __has_include() # include "nvtx3.hpp" # endif // __has_include() # include CUB_NAMESPACE_BEGIN namespace detail { struct NVTXCCCLDomain { static constexpr const char* name = "CCCL"; }; } // namespace detail CUB_NAMESPACE_END // Hook for the NestedNVTXRangeGuard from the unit tests # ifndef CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE # define CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE(name) # endif // !CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE // Conditionally inserts a NVTX range starting here until the end of the current function scope in host code. Does // nothing in device code. // The optional is needed to defer the construction of an NVTX range (host-only code) and message string registration // into a dispatch region running only on the host, while preserving the semantic scope where the range is declared. # define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) \ CUB_DETAIL_BEFORE_NVTX_RANGE_SCOPE(name) \ ::cuda::std::optional<::nvtx3::scoped_range_in> __cub_nvtx3_range; \ NV_IF_TARGET( \ NV_IS_HOST, \ static const ::nvtx3::registered_string_in __cub_nvtx3_func_name{name}; \ static const ::nvtx3::event_attributes __cub_nvtx3_func_attr{__cub_nvtx3_func_name}; \ if (condition) __cub_nvtx3_range.emplace(__cub_nvtx3_func_attr); \ (void) __cub_nvtx3_range;) # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) CUB_DETAIL_NVTX_RANGE_SCOPE_IF(true, name) #else // __has_include() && !defined(NVTX_DISABLE) && _CCCL_STD_VER > 2011 # define CUB_DETAIL_NVTX_RANGE_SCOPE_IF(condition, name) # define CUB_DETAIL_NVTX_RANGE_SCOPE(name) #endif // __has_include() && !defined(NVTX_DISABLE) && _CCCL_STD_VER > 2011 cccl-2.5.0/cub/cub/detail/nvtx3.hpp000066400000000000000000003165301463375617100170260ustar00rootroot00000000000000/* * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* Temporary helper #defines, #undef'ed at end of header */ #define NVTX3_CPP_VERSION_MAJOR 1 #define NVTX3_CPP_VERSION_MINOR 0 /* This section handles the decision of whether to provide unversioned symbols. * If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is #defined, unversioned symbols are * not provided, and explicit-version symbols such as nvtx3::v1::scoped_range * and NVTX3_V1_FUNC_RANGE must be used. 
By default, the first #include of this * header will define the unversioned symbols such as nvtx3::scoped_range and * NVTX3_FUNC_RANGE. Subsequently including a different major version of this * header without #defining NVTX3_CPP_REQUIRE_EXPLICIT_VERSION triggers an error * since the symbols would conflict. Subsequently including of a different * minor version within the same major version is allowed. Functionality of * minor versions is cumulative, regardless of include order. * * Since NVTX3_CPP_REQUIRE_EXPLICIT_VERSION allows all combinations of versions * to coexist without problems within a translation unit, the recommended best * practice for instrumenting header-based libraries with NVTX C++ Wrappers is * is to #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before including nvtx3.hpp, * #undef it afterward, and only use explicit-version symbols. This is not * necessary in common cases, such as instrumenting a standalone application, or * static/shared libraries in .cpp files or headers private to those projects. */ /* clang-format off */ #if !defined(NVTX3_CPP_REQUIRE_EXPLICIT_VERSION) /* Define macro used by all definitions in this header to indicate the * unversioned symbols should be defined in addition to the versioned ones. */ #define NVTX3_INLINE_THIS_VERSION #if !defined(NVTX3_CPP_INLINED_VERSION_MAJOR) /* First occurrence of this header in the translation unit. Define macros * indicating which version shall be used for unversioned symbols. */ /** * @brief Semantic major version number for NVTX C++ wrappers of unversioned symbols * * Breaking changes may occur between major versions, and different major versions * cannot provide unversioned symbols in the same translation unit (.cpp file). * * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. * * Not to be confused with the version number of the NVTX core library. */ #define NVTX3_CPP_INLINED_VERSION_MAJOR 1 // NVTX3_CPP_VERSION_MAJOR /** * @brief Semantic minor version number for NVTX C++ wrappers of unversioned symbols * * No breaking changes occur between minor versions -- minor version changes within * a major version are purely additive. * * Note: If NVTX3_CPP_REQUIRE_EXPLICIT_VERSION is defined, this macro is not defined. * * Not to be confused with the version number of the NVTX core library. */ #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR #elif NVTX3_CPP_INLINED_VERSION_MAJOR != NVTX3_CPP_VERSION_MAJOR /* Unsupported case -- cannot define unversioned symbols for different major versions * in the same translation unit. */ #error \ "Two different major versions of the NVTX C++ Wrappers are being included in a single .cpp file, with unversioned symbols enabled in both. Only one major version can enable unversioned symbols in a .cpp file. To disable unversioned symbols, #define NVTX3_CPP_REQUIRE_EXPLICIT_VERSION before #including nvtx3.hpp, and use the explicit-version symbols instead -- this is the preferred way to use nvtx3.hpp from a header file." #elif (NVTX3_CPP_INLINED_VERSION_MAJOR == NVTX3_CPP_VERSION_MAJOR) && \ (NVTX3_CPP_INLINED_VERSION_MINOR < NVTX3_CPP_VERSION_MINOR) /* An older minor version of the same major version already defined unversioned * symbols. The new features provided in this header will be inlined * redefine the minor version macro to this header's version. 
*/ #undef NVTX3_CPP_INLINED_VERSION_MINOR #define NVTX3_CPP_INLINED_VERSION_MINOR 0 // NVTX3_CPP_VERSION_MINOR // else, already have this version or newer, nothing to do #endif #endif /* clang-format on */ /** * @file nvtx3.hpp * * @brief Provides C++ constructs making the NVTX library safer and easier to * use with zero overhead. */ /** * \mainpage * \tableofcontents * * \section QUICK_START Quick Start * * To add NVTX ranges to your code, use the `nvtx3::scoped_range` RAII object. A * range begins when the object is created, and ends when the object is * destroyed. * * \code{.cpp} * #include "nvtx3.hpp" * void some_function() { * // Begins a NVTX range with the messsage "some_function" * // The range ends when some_function() returns and `r` is destroyed * nvtx3::scoped_range r{"some_function"}; * * for(int i = 0; i < 6; ++i) { * nvtx3::scoped_range loop{"loop range"}; * std::this_thread::sleep_for(std::chrono::seconds{1}); * } * } // Range ends when `r` is destroyed * \endcode * * The example code above generates the following timeline view in Nsight * Systems: * * \image html * https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png * * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add * ranges to your code that automatically use the name of the enclosing function * as the range's message. * * \code{.cpp} * #include "nvtx3.hpp" * void some_function() { * // Creates a range with a message "some_function" that ends when the * // enclosing function returns * NVTX3_FUNC_RANGE(); * ... * } * \endcode * * * \section Overview * * The NVTX library provides a set of functions for users to annotate their code * to aid in performance profiling and optimization. These annotations provide * information to tools like Nsight Systems to improve visualization of * application timelines. * * \ref RANGES are one of the most commonly used NVTX constructs for annotating * a span of time. For example, imagine a user wanted to see every time a * function, `my_function`, is called and how long it takes to execute. This can * be accomplished with an NVTX range created on the entry to the function and * terminated on return from `my_function` using the push/pop C APIs: * * \code{.cpp} * void my_function(...) { * nvtxRangePushA("my_function"); // Begins NVTX range * // do work * nvtxRangePop(); // Ends NVTX range * } * \endcode * * One of the challenges with using the NVTX C API is that it requires manually * terminating the end of the range with `nvtxRangePop`. This can be challenging * if `my_function()` has multiple returns or can throw exceptions as it * requires calling `nvtxRangePop()` before all possible return points. * * NVTX C++ solves this inconvenience through the "RAII" technique by providing * a `nvtx3::scoped_range` class that begins a range at construction and ends * the range on destruction. The above example then becomes: * * \code{.cpp} * void my_function(...) { * nvtx3::scoped_range r{"my_function"}; // Begins NVTX range * // do work * } // Range ends on exit from `my_function` when `r` is destroyed * \endcode * * The range object `r` is deterministically destroyed whenever `my_function` * returns---ending the NVTX range without manual intervention. For more * information, see \ref RANGES and `nvtx3::scoped_range_in`. * * Another inconvenience of the NVTX C APIs are the several constructs where the * user is expected to initialize an object at the beginning of an application * and reuse that object throughout the lifetime of the application. 
For example * see domains, categories, and registered messages. * * Example: * \code{.cpp} * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain"); * // Reuse `D` throughout the rest of the application * \endcode * * This can be problematic if the user application or library does not have an * explicit initialization function called before all other functions to * ensure that these long-lived objects are initialized before being used. * * NVTX C++ makes use of the "construct on first use" technique to alleviate * this inconvenience. In short, a function local static object is constructed * upon the first invocation of a function and returns a reference to that * object on all future invocations. See the documentation for `nvtx3::domain`, * `nvtx3::named_category`, `nvtx3::registered_string`, and * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more * information. * * Using construct on first use, the above example becomes: * \code{.cpp} * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * // The first invocation of `domain::get` for the type `my_domain` will * // construct a `nvtx3::domain` object and return a reference to it. Future * // invocations simply return a reference. * nvtx3::domain const& D = nvtx3::domain::get(); * \endcode * For more information about NVTX and how it can be used, see * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ * for more information. * * \section RANGES Ranges * * Ranges are used to describe a span of time during the execution of an * application. Common examples are using ranges to annotate the time it takes * to execute a function or an iteration of a loop. * * NVTX C++ uses RAII to automate the generation of ranges that are tied to the * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard * Template Library. * * \subsection scoped_range Scoped Range * * `nvtx3::scoped_range_in` is a class that begins a range upon construction * and ends the range at destruction. This is one of the most commonly used * constructs in NVTX C++ and is useful for annotating spans of time on a * particular thread. These ranges can be nested to arbitrary depths. * * `nvtx3::scoped_range` is an alias for a `nvtx3::scoped_range_in` in the * global NVTX domain. For more information about Domains, see \ref DOMAINS. * * Various attributes of a range can be configured constructing a * `nvtx3::scoped_range_in` with a `nvtx3::event_attributes` object. For * more information, see \ref ATTRIBUTES. * * Example: * * \code{.cpp} * void some_function() { * // Creates a range for the duration of `some_function` * nvtx3::scoped_range r{}; * * while(true) { * // Creates a range for every loop iteration * // `loop_range` is nested inside `r` * nvtx3::scoped_range loop_range{}; * } * } * \endcode * * \subsection unique_range Unique Range * * `nvtx3::unique_range` is similar to `nvtx3::scoped_range`, with a few key differences: * - `unique_range` objects can be destroyed in any order whereas `scoped_range` objects must be * destroyed in exact reverse creation order * - `unique_range` can start and end on different threads * - `unique_range` is moveable * - `unique_range` objects can be constructed as heap objects * * There is extra overhead associated with `unique_range` constructs and therefore use of * `nvtx3::scoped_range_in` should be preferred. 
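 *
 * A brief sketch (assuming `unique_range` accepts the same constructor
 * arguments as `scoped_range`; `do_work` is a placeholder):
 * \code{.cpp}
 * auto r = std::make_unique<nvtx3::unique_range>("async work");
 * std::thread worker{[range = std::move(r)]() mutable {
 *   do_work();     // placeholder for real work
 *   range.reset(); // the range may end on a different thread than it began on
 * }};
 * worker.join();
 * \endcode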
* * \section MARKS Marks * * `nvtx3::mark` annotates an instantaneous point in time with a "marker". * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark("operation failed!"); * } * \endcode * * \section DOMAINS Domains * * Similar to C++ namespaces, domains allow for scoping NVTX events. By default, * all NVTX events belong to the "global" domain. Libraries and applications * should scope their events to use a custom domain to differentiate where the * events originate from. * * It is common for a library or application to have only a single domain and * for the name of that domain to be known at compile time. Therefore, Domains * in NVTX C++ are represented by _tag types_. * * For example, to define a custom domain, simply define a new concrete type * (a `class` or `struct`) with a `static` member called `name` that contains * the desired name of the domain. * * \code{.cpp} * struct my_domain{ static constexpr char const* name{"my domain"}; }; * \endcode * * For any NVTX C++ construct that can be scoped to a domain, the type * `my_domain` can be passed as an explicit template argument to scope it to * the custom domain. * * The tag type `nvtx3::domain::global` represents the global NVTX domain. * * \code{.cpp} * // By default, `scoped_range_in` belongs to the global domain * nvtx3::scoped_range_in<> r0{}; * * // Alias for a `scoped_range_in` in the global domain * nvtx3::scoped_range r1{}; * * // `r` belongs to the custom domain * nvtx3::scoped_range_in r{}; * \endcode * * When using a custom domain, it is recommended to define type aliases for NVTX * constructs in the custom domain. * \code{.cpp} * using my_scoped_range = nvtx3::scoped_range_in; * using my_registered_string = nvtx3::registered_string_in; * using my_named_category = nvtx3::named_category_in; * \endcode * * See `nvtx3::domain` for more information. * * \section ATTRIBUTES Event Attributes * * NVTX events can be customized with various attributes to provide additional * information (such as a custom message) or to control visualization of the * event (such as the color used). These attributes can be specified per-event * via arguments to a `nvtx3::event_attributes` object. * * NVTX events can be customized via four "attributes": * - \ref COLOR : color used to visualize the event in tools. * - \ref MESSAGES : Custom message string. * - \ref PAYLOAD : User-defined numerical value. * - \ref CATEGORY : Intra-domain grouping. * * It is possible to construct a `nvtx3::event_attributes` from any number of * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload, * nvtx3::category) in any order. If an attribute is not specified, a tool * specific default value is used. See `nvtx3::event_attributes` for more * information. 
* * \code{.cpp} * // Set message, same as passing nvtx3::message{"message"} * nvtx3::event_attributes attr{"message"}; * * // Set message and color * nvtx3::event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; * * // Set message, color, payload, category * nvtx3::event_attributes attr{"message", * nvtx3::rgb{127, 255, 0}, * nvtx3::payload{42}, * nvtx3::category{1}}; * * // Same as above -- can use any order of arguments * nvtx3::event_attributes attr{nvtx3::payload{42}, * nvtx3::category{1}, * "message", * nvtx3::rgb{127, 255, 0}}; * * // Multiple arguments of the same type are allowed, but only the first is * // used -- in this example, payload is set to 42: * nvtx3::event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; * * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: * using namespace nvtx3; * event_attributes attr{"message", rgb{127, 255, 0}, payload{42}, category{1}}; * \endcode * * \subsection MESSAGES message * * `nvtx3::message` sets the message string for an NVTX event. * * Example: * \code{.cpp} * // Create an `event_attributes` with the message "my message" * nvtx3::event_attributes attr{nvtx3::message{"my message"}}; * * // strings and string literals implicitly assumed to be a `nvtx3::message` * nvtx3::event_attributes attr{"my message"}; * \endcode * * \subsubsection REGISTERED_MESSAGE Registered Messages * * Associating a `nvtx3::message` with an event requires copying the contents of * the message every time the message is used, i.e., copying the entire message * string. This may cause non-trivial overhead in performance sensitive code. * * To eliminate this overhead, NVTX allows registering a message string, * yielding a "handle" that is inexpensive to copy that may be used in place of * a message string. When visualizing the events, tools such as Nsight Systems * will take care of mapping the message handle to its string. * * A message should be registered once and the handle reused throughout the rest * of the application. This can be done by either explicitly creating static * `nvtx3::registered_string` objects, or using the * `nvtx3::registered_string::get` construct on first use helper (recommended). * * Similar to \ref DOMAINS, `nvtx3::registered_string::get` requires defining a * custom tag type with a static `message` member whose value will be the * contents of the registered string. * * Example: * \code{.cpp} * // Explicitly constructed, static `registered_string` in my_domain: * static registered_string_in static_message{"my message"}; * * // Or use construct on first use: * // Define a tag type with a `message` member string to register * struct my_message{ static constexpr char const* message{ "my message" }; }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = nvtx3::registered_string_in::get(); * \endcode * * \subsection COLOR color * * Associating a `nvtx3::color` with an event allows controlling how the event * is visualized in a tool such as Nsight Systems. This is a convenient way to * visually differentiate among different events. * * \code{.cpp} * // Define a color via rgb color values * nvtx3::color c{nvtx3::rgb{127, 255, 0}}; * nvtx3::event_attributes attr{c}; * * // rgb color values can be passed directly to an `event_attributes` * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}}; * \endcode * * \subsection CATEGORY category * * A `nvtx3::category` is simply an integer id that allows for fine-grain * grouping of NVTX events. 
For example, one might use separate categories for * IO, memory allocation, compute, etc. * * \code{.cpp} * nvtx3::event_attributes{nvtx3::category{1}}; * \endcode * * \subsubsection NAMED_CATEGORIES Named Categories * * Associates a `name` string with a category `id` to help differentiate among * categories. * * For any given category id `Id`, a `named_category{Id, "name"}` should only * be constructed once and reused throughout an application. This can be done by * either explicitly creating static `nvtx3::named_category` objects, or using * the `nvtx3::named_category::get` construct on first use helper (recommended). * * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a * custom tag type with static `name` and `id` members. * * \code{.cpp} * // Explicitly constructed, static `named_category` in my_domain: * static nvtx3::named_category_in static_category{42, "my category"}; * * // Or use construct on first use: * // Define a tag type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category": * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::event_attributes attr{cat}; * \endcode * * \subsection PAYLOAD payload * * Allows associating a user-defined numerical value with an event. * * \code{.cpp} * // Constructs a payload from the `int32_t` value 42 * nvtx3:: event_attributes attr{nvtx3::payload{42}}; * \endcode * * * \section EXAMPLE Example * * Putting it all together: * \code{.cpp} * // Define a custom domain tag type * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * // Define a named category tag type * struct my_category{ * static constexpr char const* name{"my category"}; * static constexpr uint32_t id{42}; * }; * * // Define a registered string tag type * struct my_message{ static constexpr char const* message{"my message"}; }; * * // For convenience, use aliases for domain scoped objects * using my_scoped_range = nvtx3::scoped_range_in; * using my_registered_string = nvtx3::registered_string_in; * using my_named_category = nvtx3::named_category_in; * * // Default values for all attributes * nvtx3::event_attributes attr{}; * my_scoped_range r0{attr}; * * // Custom (unregistered) message, and unnamed category * nvtx3::event_attributes attr1{"message", nvtx3::category{2}}; * my_scoped_range r1{attr1}; * * // Alternatively, pass arguments of `event_attributes` ctor directly to * // `my_scoped_range` * my_scoped_range r2{"message", nvtx3::category{2}}; * * // construct on first use a registered string * auto& msg = my_registered_string::get(); * * // construct on first use a named category * auto& cat = my_named_category::get(); * * // Use registered string and named category with a custom payload * my_scoped_range r3{msg, cat, nvtx3::payload{42}}; * * // Any number of arguments in any order * my_scoped_range r{nvtx3::rgb{127, 255,0}, msg}; * * \endcode * \section MACROS Convenience Macros * * Oftentimes users want to quickly and easily add NVTX ranges to their library * or application to aid in profiling and optimization. * * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an * `nvtx3::scoped_range_in` with the name of the enclosing function as the * range's message. 
* * \code{.cpp} * void some_function() { * // Automatically generates an NVTX range for the duration of the function * // using "some_function" as the event's message. * NVTX3_FUNC_RANGE(); * } * \endcode * */ /* Temporary helper #defines, removed with #undef at end of header */ #if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET) # if defined(_MSC_VER) && _MSC_VER < 1914 /* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser * that does not work with domain::get's specialization for domain::global, * and would require extra conditions to make SFINAE work for the overloaded * get() functions. This macro disables use of overloaded get() in order to * work with VS2015 and versions of VS2017 below 15.7, without penalizing * users of newer compilers. Building with this flag set to 0 means errors * when defining tag structs (see documentation for domain, named_category, * and registered_string) will have more complex compiler error messages * instead of the clear static_assert messages from the get() overloads. */ # define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0 # else # define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1 # endif # define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE #endif /* Within this header, nvtx3::NVTX3_VERSION_NAMESPACE resolves to nvtx3::vX, * where "X" is the major version number. */ #define NVTX3_CONCAT(A, B) A##B #define NVTX3_NAMESPACE_FOR(VERSION) NVTX3_CONCAT(v, VERSION) #define NVTX3_VERSION_NAMESPACE NVTX3_NAMESPACE_FOR(NVTX3_CPP_VERSION_MAJOR) /* Avoid duplicating #if defined(NVTX3_INLINE_THIS_VERSION) for namespaces * in each minor version by making a macro to use unconditionally, which * resolves to "inline" or nothing as appropriate. */ #if defined(NVTX3_INLINE_THIS_VERSION) # define NVTX3_INLINE_IF_REQUESTED inline #else # define NVTX3_INLINE_IF_REQUESTED #endif /* Enables the use of constexpr when support for C++14 constexpr is present. * * Initialization of a class member that is a union to a specific union member * can only be done in the body of a constructor, not in a member initializer * list. A constexpr constructor must have an empty body until C++14, so there * is no way to make an initializer of a member union constexpr in C++11. This * macro allows making functions constexpr in C++14 or newer, but non-constexpr * in C++11 compilation. It is used here on constructors that initialize their * member unions. */ #if __cpp_constexpr >= 201304L # define NVTX3_CONSTEXPR_IF_CPP14 constexpr #else # define NVTX3_CONSTEXPR_IF_CPP14 #endif /* Use a macro for static asserts, which defaults to static_assert, but that * testing tools can replace with a logging function. 
For example: * #define NVTX3_STATIC_ASSERT(c, m) \ * do { if (!(c)) printf("static_assert would fail: %s\n", m); } while (0) */ #if !defined(NVTX3_STATIC_ASSERT) # define NVTX3_STATIC_ASSERT(condition, message) static_assert(condition, message); # define NVTX3_STATIC_ASSERT_DEFINED_HERE #endif /* Implementation sections, enclosed in guard macros for each minor version */ #ifndef NVTX3_CPP_DEFINITIONS_V1_0 # define NVTX3_CPP_DEFINITIONS_V1_0 # include # include # include # include # include # include // NOTE(bgruber): "nvtx3/" prefix added and switched to angle brackets namespace nvtx3 { NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE { namespace detail { template struct always_false : std::false_type {}; template struct has_name : std::false_type {}; template struct has_name : std::true_type {}; template struct has_id : std::false_type {}; template struct has_id : std::true_type {}; template struct has_message : std::false_type {}; template struct has_message : std::true_type {}; template struct is_c_string : std::false_type {}; template struct is_c_string::value || std::is_convertible::value>::type> : std::true_type {}; template using is_uint32 = std::is_same::type, uint32_t>; } // namespace detail /** * @brief `domain`s allow for grouping NVTX events into a single scope to * differentiate them from events in other `domain`s. * * By default, all NVTX constructs are placed in the "global" NVTX domain. * * A custom `domain` may be used in order to differentiate a library's or * application's NVTX events from other events. * * `domain`s are expected to be long-lived and unique to a library or * application. As such, it is assumed a domain's name is known at compile * time. Therefore, all NVTX constructs that can be associated with a domain * require the domain to be specified via a *type* `D` passed as an * explicit template parameter. * * The type `domain::global` may be used to indicate that the global NVTX * domain should be used. * * None of the C++ NVTX constructs require the user to manually construct a * `domain` object. Instead, if a custom domain is desired, the user is * expected to define a type `D` that contains a member * `D::name` which resolves to either a `char const*` or `wchar_t * const*`. The value of `D::name` is used to name and uniquely * identify the custom domain. * * Upon the first use of an NVTX construct associated with the type * `D`, the "construct on first use" pattern is used to construct a * function local static `domain` object. All future NVTX constructs * associated with `D` will use a reference to the previously * constructed `domain` object. See `domain::get`. * * Example: * \code{.cpp} * // The type `my_domain` defines a `name` member used to name and identify * // the `domain` object identified by `my_domain`. * struct my_domain{ static constexpr char const* name{"my_domain"}; }; * * // The NVTX range `r` will be grouped with all other NVTX constructs * // associated with `my_domain`. 
* nvtx3::scoped_range_in r{}; * * // An alias can be created for a `scoped_range_in` in the custom domain * using my_scoped_range = nvtx3::scoped_range_in; * my_scoped_range my_range{}; * * // `domain::global` indicates that the global NVTX domain is used * nvtx3::scoped_range_in r2{}; * * // For convenience, `nvtx3::scoped_range` is an alias for a range in the * // global domain * nvtx3::scoped_range r3{}; * \endcode */ class domain { public: domain(domain const&) = delete; domain& operator=(domain const&) = delete; domain(domain&&) = delete; domain& operator=(domain&&) = delete; /** * @brief Tag type for the "global" NVTX domain. * * This type may be passed as a template argument to any function/class * expecting a type to identify a domain to indicate that the global domain * should be used. * * All NVTX events in the global domain across all libraries and * applications will be grouped together. * */ struct global {}; # if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns reference to an instance of a function local static * `domain` object. * * Uses the "construct on first use" idiom to safely ensure the `domain` * object is initialized exactly once upon first invocation of * `domain::get()`. All following invocations will return a * reference to the previously constructed `domain` object. See * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use * * None of the constructs in this header require the user to directly invoke * `domain::get`. It is automatically invoked when constructing objects like * a `scoped_range_in` or `category`. Advanced users may wish to use * `domain::get` for the convenience of the "construct on first use" idiom * when using domains with their own use of the NVTX C API. * * This function is threadsafe as of C++11. If two or more threads call * `domain::get` concurrently, exactly one of them is guaranteed * to construct the `domain` object and the other(s) will receive a * reference to the object after it is fully constructed. * * The domain's name is specified via the type `D` pass as an * explicit template parameter. `D` is required to contain a * member `D::name` that resolves to either a `char const*` or * `wchar_t const*`. The value of `D::name` is used to name and * uniquely identify the `domain`. * * Example: * \code{.cpp} * // The type `my_domain` defines a `name` member used to name and identify * // the `domain` object identified by `my_domain`. * struct my_domain{ static constexpr char const* name{"my domain"}; }; * * auto& D1 = domain::get(); // First invocation constructs a * // `domain` with the name "my domain" * * auto& D2 = domain::get(); // Quickly returns reference to * // previously constructed `domain`. * \endcode * * @tparam D Type that contains a `D::name` member used to * name the `domain` object. * @return Reference to the `domain` corresponding to the type `D`. */ template ::value, int>::type = 0> static domain const& get() noexcept { static domain const d(D::name); return d; } /** * @brief Overload of `domain::get` to provide a clear compile error when * `D` has a `name` member that is not directly convertible to either * `char const*` or `wchar_t const*`. 
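 *
 * For example, a tag such as
 * `struct bad_domain{ static constexpr int name{42}; };` has a `name` member
 * of the wrong type, selects this overload, and triggers the static_assert
 * below (`bad_domain` is only an illustration).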
*/ template ::value, int>::type = 0> static domain const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to identify an NVTX domain must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is not " "convertible to either of those types"); static domain const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `domain::get` to provide a clear compile error when * `D` does not have a `name` member. */ template ::value, int>::type = 0> static domain const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to identify an NVTX domain must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); static domain const unused; return unused; // Function must compile for static_assert to be triggered } # else template static domain const& get() noexcept { static domain const d(D::name); return d; } # endif /** * @brief Conversion operator to `nvtxDomainHandle_t`. * * Allows transparently passing a domain object into an API expecting a * native `nvtxDomainHandle_t` object. */ operator nvtxDomainHandle_t() const noexcept { return _domain; } private: /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(std::string const& name) noexcept : domain{name.c_str()} {} /** * @brief Construct a new domain with the specified `name`. * * This constructor is private as it is intended that `domain` objects only * be created through the `domain::get` function. * * @param name A unique name identifying the domain */ explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {} /** * @brief Default constructor creates a `domain` representing the * "global" NVTX domain. * * All events not associated with a custom `domain` are grouped in the * "global" NVTX domain. * */ domain() noexcept {} /** * @brief Intentionally avoid calling nvtxDomainDestroy on the `domain` object. * * No currently-available tools attempt to free domain resources when the * nvtxDomainDestroy function is called, due to the thread-safety and * efficiency challenges of freeing thread-local storage for other threads. * Since libraries may be disallowed from introducing static destructors, * and destroying the domain is likely to have no effect, the destructor * for `domain` intentionally chooses to not destroy the domain. * * In a situation where domain destruction is necessary, either manually * call nvtxDomainDestroy on the domain's handle, or make a class that * derives from `domain` and calls nvtxDomainDestroy in its destructor. 
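 *
 * A minimal sketch of the manual option (illustrative only, and rarely
 * needed in practice; `my_domain` is a tag type as in the examples above):
 * \code{.cpp}
 * // Uses the handle conversion operator to pass the domain to the C API:
 * nvtxDomainDestroy(nvtx3::domain::get<my_domain>());
 * \endcode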
*/ ~domain() = default; private: nvtxDomainHandle_t const _domain{}; ///< The `domain`s NVTX handle }; /** * @brief Returns reference to the `domain` object that represents the global * NVTX domain. * * This specialization for `domain::global` returns a default constructed, * `domain` object for use when the "global" domain is desired. * * All NVTX events in the global domain across all libraries and applications * will be grouped together. * * @return Reference to the `domain` corresponding to the global NVTX domain. * */ template <> inline domain const& domain::get() noexcept { static domain const d{}; return d; } /** * @brief Indicates the values of the red, green, and blue color channels for * an RGB color to use as an event attribute (assumes no transparency). * */ struct rgb { /// Type used for component values using component_type = uint8_t; /** * @brief Construct a rgb with red, green, and blue channels * specified by `red_`, `green_`, and `blue_`, respectively. * * Valid values are in the range `[0,255]`. * * @param red_ Value of the red channel * @param green_ Value of the green channel * @param blue_ Value of the blue channel */ constexpr rgb(component_type red_, component_type green_, component_type blue_) noexcept : red{red_} , green{green_} , blue{blue_} {} component_type red{}; ///< Red channel value component_type green{}; ///< Green channel value component_type blue{}; ///< Blue channel value }; /** * @brief Indicates the value of the alpha, red, green, and blue color * channels for an ARGB color to use as an event attribute. * */ struct argb final : rgb { /** * @brief Construct an argb with alpha, red, green, and blue channels * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively. * * Valid values are in the range `[0,255]`. * * @param alpha_ Value of the alpha channel (opacity) * @param red_ Value of the red channel * @param green_ Value of the green channel * @param blue_ Value of the blue channel * */ constexpr argb(component_type alpha_, component_type red_, component_type green_, component_type blue_) noexcept : rgb{red_, green_, blue_} , alpha{alpha_} {} component_type alpha{}; ///< Alpha channel value }; /** * @brief Represents a custom color that can be associated with an NVTX event * via it's `event_attributes`. * * Specifying colors for NVTX events is a convenient way to visually * differentiate among different events in a visualization tool such as Nsight * Systems. * */ class color { public: /// Type used for the color's value using value_type = uint32_t; /** * @brief Constructs a `color` using the value provided by `hex_code`. * * `hex_code` is expected to be a 4 byte argb hex code. * * The most significant byte indicates the value of the alpha channel * (opacity) (0-255) * * The next byte indicates the value of the red channel (0-255) * * The next byte indicates the value of the green channel (0-255) * * The least significant byte indicates the value of the blue channel * (0-255) * * @param hex_code The hex code used to construct the `color` */ constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {} /** * @brief Construct a `color` using the alpha, red, green, blue components * in `argb`. * * @param argb The alpha, red, green, blue components of the desired `color` */ constexpr color(argb argb_) noexcept : color{from_bytes_msb_to_lsb(argb_.alpha, argb_.red, argb_.green, argb_.blue)} {} /** * @brief Construct a `color` using the red, green, blue components in * `rgb`. 
* * Uses maximum value for the alpha channel (opacity) of the `color`. * * @param rgb The red, green, blue components of the desired `color` */ constexpr color(rgb rgb_) noexcept : color{from_bytes_msb_to_lsb(0xFF, rgb_.red, rgb_.green, rgb_.blue)} {} /** * @brief Returns the `color`s argb hex code * */ constexpr value_type get_value() const noexcept { return _value; } /** * @brief Return the NVTX color type of the color. * */ constexpr nvtxColorType_t get_type() const noexcept { return _type; } color() = delete; ~color() = default; color(color const&) = default; color& operator=(color const&) = default; color(color&&) = default; color& operator=(color&&) = default; private: /** * @brief Constructs an unsigned, 4B integer from the component bytes in * most to least significant byte order. * */ constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3, uint8_t byte2, uint8_t byte1, uint8_t byte0) noexcept { return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0}; } value_type _value{}; ///< color's argb color code nvtxColorType_t _type{NVTX_COLOR_ARGB}; ///< NVTX color type code }; /** * @brief Object for intra-domain grouping of NVTX events. * * A `category` is simply an integer id that allows for fine-grain grouping of * NVTX events. For example, one might use separate categories for IO, memory * allocation, compute, etc. * * Example: * \code{.cpp} * nvtx3::category cat1{1}; * * // Range `r1` belongs to the category identified by the value `1`. * nvtx3::scoped_range r1{cat1}; * * // Range `r2` belongs to the same category as `r1` * nvtx3::scoped_range r2{nvtx3::category{1}}; * \endcode * * To associate a name string with a category id, see `named_category`. * */ class category { public: /// Type used for `category`s integer id. using id_type = uint32_t; /** * @brief Construct a `category` with the specified `id`. * * The `category` will be unnamed and identified only by its `id` value. * * All `category`s in a domain sharing the same `id` are equivalent. * * @param[in] id The `category`'s identifying value */ constexpr explicit category(id_type id) noexcept : id_{id} {} /** * @brief Returns the id of the category. * */ constexpr id_type get_id() const noexcept { return id_; } category() = delete; ~category() = default; category(category const&) = default; category& operator=(category const&) = default; category(category&&) = default; category& operator=(category&&) = default; private: id_type id_{}; ///< category's unique identifier }; /** * @brief A `category` with an associated name string. * * Associates a `name` string with a category `id` to help differentiate among * categories. * * For any given category id `Id`, a `named_category(Id, "name")` should only * be constructed once and reused throughout an application. This can be done * by either explicitly creating static `named_category` objects, or using the * `named_category::get` construct on first use helper (recommended). * * Creating two or more `named_category` objects with the same value for `id` * in the same domain results in undefined behavior. * * Similarly, behavior is undefined when a `named_category` and `category` * share the same value of `id`. 
* * Example: * \code{.cpp} * // Explicitly constructed, static `named_category` in global domain: * static nvtx3::named_category static_category{42, "my category"}; * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{static_category}; * * // OR use construct on first use: * * // Define a type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category" * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{cat}; * \endcode * * `named_category_in`'s association of a name to a category id is local to * the domain specified by the type `D`. An id may have a different name in * another domain. * * @tparam D Type containing `name` member used to identify the `domain` to * which the `named_category_in` belongs. Else, `domain::global` to indicate * that the global NVTX domain should be used. */ template class named_category_in final : public category { public: # if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns a global instance of a `named_category_in` as a * function-local static. * * Creates a `named_category_in` with name and id specified by the contents * of a type `C`. `C::name` determines the name and `C::id` determines the * category id. * * This function is useful for constructing a named `category` exactly once * and reusing the same instance throughout an application. * * Example: * \code{.cpp} * // Define a type with `name` and `id` members * struct my_category { * static constexpr char const* name{"my category"}; // category name * static constexpr uint32_t id{42}; // category id * }; * * // Use construct on first use to name the category id `42` * // with name "my category" * auto& cat = named_category_in::get(); * * // Range `r` associated with category id `42` * nvtx3::scoped_range r{cat}; * \endcode * * Uses the "construct on first use" idiom to safely ensure the `category` * object is initialized exactly once. See * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use * * @tparam C Type containing a member `C::name` that resolves to either a * `char const*` or `wchar_t const*` and `C::id`. */ template < typename C, typename std::enable_if::value && detail::is_uint32::value, int>::type = 0> static named_category_in const& get() noexcept { static named_category_in const cat(C::id, C::name); return cat; } /** * @brief Overload of `named_category_in::get` to provide a clear compile error * when `C` has the required `name` and `id` members, but they are not the * required types. `name` must be directly convertible to `char const*` or * `wchar_t const*`, and `id` must be `uint32_t`. 
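 *
 * For instance, a hypothetical tag type whose `id` member is declared as
 * `int` rather than `uint32_t` selects this overload and triggers the
 * static_assert below:
 * \code{.cpp}
 * struct bad_category {
 *   static constexpr char const* name{"my category"};
 *   static constexpr int id{42}; // wrong type: must be uint32_t
 * };
 * \endcode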
*/ template ::value || !detail::is_uint32::value, int>::type = 0> static named_category_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::is_c_string::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is not " "convertible to either of those types"); NVTX3_STATIC_ASSERT(detail::is_uint32::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'id' of type uint32_t -- 'id' member is the wrong type"); static named_category_in const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `named_category_in::get` to provide a clear compile error * when `C` does not have the required `name` and `id` members. */ template ::value || !detail::has_id::value, int>::type = 0> static named_category_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::has_name::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'name' of type const char* or const wchar_t* -- 'name' member is missing"); NVTX3_STATIC_ASSERT(detail::has_id::value, "Type used to name an NVTX category must contain a static constexpr member " "called 'id' of type uint32_t -- 'id' member is missing"); static named_category_in const unused; return unused; // Function must compile for static_assert to be triggered } # else template static named_category_in const& get() noexcept { static named_category_in const cat(C::id, C::name); return cat; } # endif private: // Default constructor is only used internally for static_assert(false) cases. named_category_in() noexcept : category{0} {} public: /** * @brief Construct a `named_category_in` with the specified `id` and `name`. * * The name `name` will be registered with `id`. * * Every unique value of `id` should only be named once. * * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ named_category_in(id_type id, char const* name) noexcept : category{id} { # ifndef NVTX_DISABLE nvtxDomainNameCategoryA(domain::get(), get_id(), name); # else (void) id; (void) name; # endif }; /** * @brief Construct a `named_category_in` with the specified `id` and `name`. * * The name `name` will be registered with `id`. * * Every unique value of `id` should only be named once. * * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ named_category_in(id_type id, wchar_t const* name) noexcept : category{id} { # ifndef NVTX_DISABLE nvtxDomainNameCategoryW(domain::get(), get_id(), name); # else (void) id; (void) name; # endif }; }; /** * @brief Alias for a `named_category_in` in the global NVTX domain. * */ using named_category = named_category_in; /** * @brief A message registered with NVTX. * * Normally, associating a `message` with an NVTX event requires copying the * contents of the message string. This may cause non-trivial overhead in * highly performance sensitive regions of code. * * message registration is an optimization to lower the overhead of * associating a message with an NVTX event. Registering a message yields a * handle that is inexpensive to copy that may be used in place of a message * string. * * A particular message should only be registered once and the handle * reused throughout the rest of the application. 
This can be done by either * explicitly creating static `registered_string_in` objects, or using the * `registered_string_in::get` construct on first use helper (recommended). * * Example: * \code{.cpp} * // Explicitly constructed, static `registered_string` in my_domain: * static registered_string_in static_message{"message"}; * * // "message" is associated with the range `r` * nvtx3::scoped_range r{static_message}; * * // Or use construct on first use: * * // Define a type with a `message` member that defines the contents of the * // registered string * struct my_message{ static constexpr char const* message{ "my message" }; }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = registered_string_in::get(); * * // "my message" is associated with the range `r` * nvtx3::scoped_range r{msg}; * \endcode * * `registered_string_in`s are local to a particular domain specified via * the type `D`. * * @tparam D Type containing `name` member used to identify the `domain` to * which the `registered_string_in` belongs. Else, `domain::global` to indicate * that the global NVTX domain should be used. */ template class registered_string_in { public: # if NVTX3_USE_CHECKED_OVERLOADS_FOR_GET /** * @brief Returns a global instance of a `registered_string_in` as a function * local static. * * Provides a convenient way to register a message with NVTX without having * to explicitly register the message. * * Upon first invocation, constructs a `registered_string_in` whose contents * are specified by `message::message`. * * All future invocations will return a reference to the object constructed * in the first invocation. * * Example: * \code{.cpp} * // Define a type with a `message` member that defines the contents of the * // registered string * struct my_message{ static constexpr char const* message{ "my message" }; * }; * * // Uses construct on first use to register the contents of * // `my_message::message` * auto& msg = registered_string_in::get(); * * // "my message" is associated with the range `r` * nvtx3::scoped_range r{msg}; * \endcode * * @tparam M Type required to contain a member `M::message` that * resolves to either a `char const*` or `wchar_t const*` used as the * registered string's contents. * @return Reference to a `registered_string_in` associated with the type `M`. */ template ::value, int>::type = 0> static registered_string_in const& get() noexcept { static registered_string_in const regstr(M::message); return regstr; } /** * @brief Overload of `registered_string_in::get` to provide a clear compile error * when `M` has a `message` member that is not directly convertible to either * `char const*` or `wchar_t const*`. */ template ::value, int>::type = 0> static registered_string_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to register an NVTX string must contain a static constexpr member " "called 'message' of type const char* or const wchar_t* -- 'message' member is " "not convertible to either of those types"); static registered_string_in const unused; return unused; // Function must compile for static_assert to be triggered } /** * @brief Overload of `registered_string_in::get` to provide a clear compile error when * `M` does not have a `message` member. 
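 *
 * For instance, a hypothetical tag type that lacks a `message` member
 * selects this overload and triggers the static_assert below:
 * \code{.cpp}
 * struct bad_string {
 *   static constexpr char const* text{"oops"}; // member must be named `message`
 * };
 * \endcode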
*/ template ::value, int>::type = 0> static registered_string_in const& get() noexcept { NVTX3_STATIC_ASSERT(detail::always_false::value, "Type used to register an NVTX string must contain a static constexpr member " "called 'message' of type const char* or const wchar_t* -- 'message' member " "is missing"); static registered_string_in const unused; return unused; // Function must compile for static_assert to be triggered } # else template static registered_string_in const& get() noexcept { static registered_string_in const regstr(M::message); return regstr; } # endif /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(char const* msg) noexcept : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} {} /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(std::string const& msg) noexcept : registered_string_in{msg.c_str()} {} /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(wchar_t const* msg) noexcept : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} {} /** * @brief Constructs a `registered_string_in` from the specified `msg` string. * * Registers `msg` with NVTX and associates a handle with the registered * message. * * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message */ explicit registered_string_in(std::wstring const& msg) noexcept : registered_string_in{msg.c_str()} {} /** * @brief Returns the registered string's handle * */ nvtxStringHandle_t get_handle() const noexcept { return handle_; } private: // Default constructor is only used internally for static_assert(false) cases. registered_string_in() noexcept {}; public: ~registered_string_in() = default; registered_string_in(registered_string_in const&) = default; registered_string_in& operator=(registered_string_in const&) = default; registered_string_in(registered_string_in&&) = default; registered_string_in& operator=(registered_string_in&&) = default; private: nvtxStringHandle_t handle_{}; ///< The handle returned from ///< registering the message with NVTX }; /** * @brief Alias for a `registered_string_in` in the global NVTX domain. * */ using registered_string = registered_string_in; /** * @brief Allows associating a message string with an NVTX event via * its `EventAttribute`s. * * Associating a `message` with an NVTX event through its `event_attributes` * allows for naming events to easily differentiate them from other events. * * Every time an NVTX event is created with an associated `message`, the * contents of the message string must be copied. 
This may cause non-trivial * overhead in highly performance sensitive sections of code. Use of a * `nvtx3::registered_string` is recommended in these situations. * * Example: * \code{.cpp} * // Creates an `event_attributes` with message "message 0" * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}}; * * // `range0` contains message "message 0" * nvtx3::scoped_range range0{attr0}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` * // Creates an `event_attributes` with message "message 1" * nvtx3::event_attributes attr1{"message 1"}; * * // `range1` contains message "message 1" * nvtx3::scoped_range range1{attr1}; * * // `range2` contains message "message 2" * nvtx3::scoped_range range2{nvtx3::Mesage{"message 2"}}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` * // `range3` contains message "message 3" * nvtx3::scoped_range range3{"message 3"}; * \endcode */ class message { public: using value_type = nvtxMessageValue_t; /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ NVTX3_CONSTEXPR_IF_CPP14 message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII} { value_.ascii = msg; } /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ message(std::string const& msg) noexcept : message{msg.c_str()} {} /** * @brief Disallow construction for `std::string` r-value * * `message` is a non-owning type and therefore cannot take ownership of an * r-value. Therefore, constructing from an r-value is disallowed to prevent * a dangling pointer. * */ message(std::string&&) = delete; /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ NVTX3_CONSTEXPR_IF_CPP14 message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE} { value_.unicode = msg; } /** * @brief Construct a `message` whose contents are specified by `msg`. * * @param msg The contents of the message */ message(std::wstring const& msg) noexcept : message{msg.c_str()} {} /** * @brief Disallow construction for `std::wstring` r-value * * `message` is a non-owning type and therefore cannot take ownership of an * r-value. Therefore, constructing from an r-value is disallowed to prevent * a dangling pointer. * */ message(std::wstring&&) = delete; /** * @brief Construct a `message` from a `registered_string_in`. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `registered_string_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. * @param msg The message that has already been registered with NVTX. */ template NVTX3_CONSTEXPR_IF_CPP14 message(registered_string_in const& msg) noexcept : type_{NVTX_MESSAGE_TYPE_REGISTERED} { value_.registered = msg.get_handle(); } /** * @brief Construct a `message` from NVTX C API type and value. * * @param type nvtxMessageType_t enum value indicating type of the payload * @param value nvtxMessageValue_t union containing message */ constexpr message(nvtxMessageType_t const& type, nvtxMessageValue_t const& value) noexcept : type_{type} , value_(value) {} /** * @brief Construct a `message` from NVTX C API registered string handle. 
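 *
 * Illustrative sketch (assumes a handle previously obtained from a
 * `registered_string_in` or from the NVTX C API):
 * \code{.cpp}
 * nvtx3::registered_string msg{"msg"};
 * nvtx3::scoped_range r{nvtx3::message{msg.get_handle()}};
 * \endcode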
* * @param handle nvtxStringHandle_t value of registered string handle */ NVTX3_CONSTEXPR_IF_CPP14 message(nvtxStringHandle_t handle) noexcept : type_{NVTX_MESSAGE_TYPE_REGISTERED} { value_.registered = handle; } /** * @brief Return the union holding the value of the message. * */ constexpr value_type get_value() const noexcept { return value_; } /** * @brief Return the type information about the value the union holds. * */ constexpr nvtxMessageType_t get_type() const noexcept { return type_; } private: nvtxMessageType_t type_{}; ///< message type nvtxMessageValue_t value_{}; ///< message contents }; /** * @brief A numerical value that can be associated with an NVTX event via * its `event_attributes`. * * Example: * \code{.cpp} * // Constructs a payload from the int32_t value 42 * nvtx3:: event_attributes attr{nvtx3::payload{42}}; * * // `range0` will have an int32_t payload of 42 * nvtx3::scoped_range range0{attr}; * * // range1 has double payload of 3.14 * nvtx3::scoped_range range1{nvtx3::payload{3.14}}; * \endcode */ class payload { public: using value_type = typename nvtxEventAttributes_v2::payload_t; /** * @brief Construct a `payload` from a signed, 8 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int64_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_INT64} , value_{} { value_.llValue = value; } /** * @brief Construct a `payload` from a signed, 4 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(int32_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_INT32} , value_{} { value_.iValue = value; } /** * @brief Construct a `payload` from an unsigned, 8 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint64_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64} , value_{} { value_.ullValue = value; } /** * @brief Construct a `payload` from an unsigned, 4 byte integer. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(uint32_t value) noexcept : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32} , value_{} { value_.uiValue = value; } /** * @brief Construct a `payload` from a single-precision floating point * value. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(float value) noexcept : type_{NVTX_PAYLOAD_TYPE_FLOAT} , value_{} { value_.fValue = value; } /** * @brief Construct a `payload` from a double-precision floating point * value. * * @param value Value to use as contents of the payload */ NVTX3_CONSTEXPR_IF_CPP14 explicit payload(double value) noexcept : type_{NVTX_PAYLOAD_TYPE_DOUBLE} , value_{} { value_.dValue = value; } /** * @brief Construct a `payload` from NVTX C API type and value. * * @param type nvtxPayloadType_t enum value indicating type of the payload * @param value nvtxEventAttributes_t::payload_t union containing payload */ constexpr payload(nvtxPayloadType_t const& type, value_type const& value) noexcept : type_{type} , value_(value) {} /** * @brief Return the union holding the value of the payload * */ constexpr value_type get_value() const noexcept { return value_; } /** * @brief Return the information about the type the union holds. 
* */ constexpr nvtxPayloadType_t get_type() const noexcept { return type_; } private: nvtxPayloadType_t type_; ///< Type of the payload value value_type value_; ///< Union holding the payload value }; /** * @brief Describes the attributes of a NVTX event. * * NVTX events can be customized via four "attributes": * * - color: color used to visualize the event in tools such as Nsight * Systems. See `color`. * - message: Custom message string. See `message`. * - payload: User-defined numerical value. See `payload`. * - category: Intra-domain grouping. See `category`. * * These component attributes are specified via an `event_attributes` object. * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and * `nvtx3::category` for how these individual attributes are constructed. * * While it is possible to specify all four attributes, it is common to want * to only specify a subset of attributes and use default values for the * others. For convenience, `event_attributes` can be constructed from any * number of attribute components in any order. * * Example: * \code{.cpp} * // Set message, same as using nvtx3::message{"message"} * event_attributes attr{"message"}; * * // Set message and color * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; * * // Set message, color, payload, category * event_attributes attr{"message", * nvtx3::rgb{127, 255, 0}, * nvtx3::payload{42}, * nvtx3::category{1}}; * * // Same as above -- can use any order of arguments * event_attributes attr{nvtx3::payload{42}, * nvtx3::category{1}, * "message", * nvtx3::rgb{127, 255, 0}}; * * // Multiple arguments of the same type are allowed, but only the first is * // used -- in this example, payload is set to 42: * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; * * // Range `r` will be customized according the attributes in `attr` * nvtx3::scoped_range r{attr}; * * // For convenience, `event_attributes` constructor arguments may be passed * // to the `scoped_range_in` contructor -- they are forwarded to the * // `event_attributes` constructor * nvtx3::scoped_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; * * // Using the nvtx3 namespace in a local scope makes the syntax more succinct: * using namespace nvtx3; * scoped_range r{payload{42}, category{1}, "message"}; * \endcode * */ class event_attributes { public: using value_type = nvtxEventAttributes_t; /** * @brief Default constructor creates an `event_attributes` with no * category, color, payload, nor message. */ constexpr event_attributes() noexcept : attributes_{ NVTX_VERSION, // version sizeof(nvtxEventAttributes_t), // size 0, // category NVTX_COLOR_UNKNOWN, // color type 0, // color value NVTX_PAYLOAD_UNKNOWN, // payload type 0, // reserved 4B {0}, // payload value (union) // NOTE(bgruber): added braces NVTX_MESSAGE_UNKNOWN, // message type {0} // message value (union) // NOTE(bgruber): added braces } {} /** * @brief Variadic constructor where the first argument is a `category`. * * Sets the value of the `EventAttribute`s category based on `c` and * forwards the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(category const& c, Args const&... args) noexcept : event_attributes(args...) { attributes_.category = c.get_id(); } /** * @brief Variadic constructor where the first argument is a `color`. * * Sets the value of the `EventAttribute`s color based on `c` and forwards * the remaining variadic parameter pack to the next constructor. 
* */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(color const& c, Args const&... args) noexcept : event_attributes(args...) { attributes_.color = c.get_value(); attributes_.colorType = c.get_type(); } /** * @brief Variadic constructor where the first argument is a `payload`. * * Sets the value of the `EventAttribute`s payload based on `p` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(payload const& p, Args const&... args) noexcept : event_attributes(args...) { attributes_.payload = p.get_value(); attributes_.payloadType = p.get_type(); } /** * @brief Variadic constructor where the first argument is a `message`. * * Sets the value of the `EventAttribute`s message based on `m` and forwards * the remaining variadic parameter pack to the next constructor. * */ template NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(message const& m, Args const&... args) noexcept : event_attributes(args...) { attributes_.message = m.get_value(); attributes_.messageType = m.get_type(); } ~event_attributes() = default; event_attributes(event_attributes const&) = default; event_attributes& operator=(event_attributes const&) = default; event_attributes(event_attributes&&) = default; event_attributes& operator=(event_attributes&&) = default; /** * @brief Get raw pointer to underlying NVTX attributes object. * */ constexpr value_type const* get() const noexcept { return &attributes_; } private: value_type attributes_{}; ///< The NVTX attributes structure }; /** * @brief A RAII object for creating a NVTX range local to a thread within a * domain. * * When constructed, begins a nested NVTX range on the calling thread in the * specified domain. Upon destruction, ends the NVTX range. * * Behavior is undefined if a `scoped_range_in` object is * created/destroyed on different threads. * * `scoped_range_in` is neither moveable nor copyable. * * `scoped_range_in`s may be nested within other ranges. * * The domain of the range is specified by the template type parameter `D`. * By default, the `domain::global` is used, which scopes the range to the * global NVTX domain. The convenience alias `scoped_range` is provided for * ranges scoped to the global domain. * * A custom domain can be defined by creating a type, `D`, with a static * member `D::name` whose value is used to name the domain associated with * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*` * * Example: * \code{.cpp} * // Define a type `my_domain` with a member `name` used to name the domain * // associated with the type `my_domain`. 
* struct my_domain{ * static constexpr char const* name{"my domain"}; * }; * \endcode * * Usage: * \code{.cpp} * nvtx3::scoped_range_in r1{"range 1"}; // Range in my domain * * // Three equivalent ways to make a range in the global domain: * nvtx3::scoped_range_in r2{"range 2"}; * nvtx3::scoped_range_in<> r3{"range 3"}; * nvtx3::scoped_range r4{"range 4"}; * * // Create an alias to succinctly make ranges in my domain: * using my_scoped_range = nvtx3::scoped_range_in; * * my_scoped_range r3{"range 3"}; * \endcode */ template class scoped_range_in { public: /** * @brief Construct a `scoped_range_in` with the specified * `event_attributes` * * Example: * \code{cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * nvtx3::scoped_range range{attr}; // Creates a range with message contents * // "msg" and green color * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. */ explicit scoped_range_in(event_attributes const& attr) noexcept { # ifndef NVTX_DISABLE nvtxDomainRangePushEx(domain::get(), attr.get()); # else (void) attr; # endif } /** * @brief Constructs a `scoped_range_in` from the constructor arguments * of an `event_attributes`. * * Forwards the arguments `args...` to construct an * `event_attributes` object. The `event_attributes` object is then * associated with the `scoped_range_in`. * * For more detail, see `event_attributes` documentation. * * Example: * \code{cpp} * // Creates a range with message "message" and green color * nvtx3::scoped_range r{"message", nvtx3::rgb{127,255,0}}; * \endcode * * @param[in] args Arguments to used to construct an `event_attributes` associated with this * range. * */ template explicit scoped_range_in(Args const&... args) noexcept : scoped_range_in{event_attributes{args...}} {} /** * @brief Default constructor creates a `scoped_range_in` with no * message, color, payload, nor category. * */ scoped_range_in() noexcept : scoped_range_in{event_attributes{}} {} /** * @brief Delete `operator new` to disallow heap allocated objects. * * `scoped_range_in` must follow RAII semantics to guarantee proper push/pop semantics. * */ void* operator new(std::size_t) = delete; scoped_range_in(scoped_range_in const&) = delete; scoped_range_in& operator=(scoped_range_in const&) = delete; scoped_range_in(scoped_range_in&&) = delete; scoped_range_in& operator=(scoped_range_in&&) = delete; /** * @brief Destroy the scoped_range_in, ending the NVTX range event. */ ~scoped_range_in() noexcept { # ifndef NVTX_DISABLE nvtxDomainRangePop(domain::get()); # endif } }; /** * @brief Alias for a `scoped_range_in` in the global NVTX domain. * */ using scoped_range = scoped_range_in; namespace detail { /// @cond internal template class optional_scoped_range_in { public: optional_scoped_range_in() = default; void begin(event_attributes const& attr) noexcept { # ifndef NVTX_DISABLE // This class is not meant to be part of the public NVTX C++ API and should // only be used in the `NVTX3_FUNC_RANGE_IF` and `NVTX3_FUNC_RANGE_IF_IN` // macros. However, to prevent developers from misusing this class, make // sure to not start multiple ranges. 
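    // The flag below makes begin() idempotent: only the first call pushes a
    // range; any subsequent call on an already-begun object is a no-op.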
if (initialized) { return; } nvtxDomainRangePushEx(domain::get(), attr.get()); initialized = true; # endif } ~optional_scoped_range_in() noexcept { # ifndef NVTX_DISABLE if (initialized) { nvtxDomainRangePop(domain::get()); } # endif } void* operator new(std::size_t) = delete; optional_scoped_range_in(optional_scoped_range_in const&) = delete; optional_scoped_range_in& operator=(optional_scoped_range_in const&) = delete; optional_scoped_range_in(optional_scoped_range_in&&) = delete; optional_scoped_range_in& operator=(optional_scoped_range_in&&) = delete; private: # ifndef NVTX_DISABLE bool initialized = false; # endif }; /// @endcond } // namespace detail /** * @brief Handle used for correlating explicit range start and end events. * * A handle is "null" if it does not correspond to any range. * */ struct range_handle { /// Type used for the handle's value using value_type = nvtxRangeId_t; /** * @brief Construct a `range_handle` from the given id. * */ constexpr explicit range_handle(value_type id) noexcept : _range_id{id} {} /** * @brief Constructs a null range handle. * * A null range_handle corresponds to no range. Calling `end_range` on a * null handle is undefined behavior when a tool is active. * */ constexpr range_handle() noexcept = default; /** * @brief Checks whether this handle is null * * Provides contextual conversion to `bool`. * * \code{cpp} * range_handle handle{}; * if (handle) {...} * \endcode * */ constexpr explicit operator bool() const noexcept { return get_value() != null_range_id; }; /** * @brief Implicit conversion from `nullptr` constructs a null handle. * * Satisfies the "NullablePointer" requirement to make `range_handle` comparable with `nullptr`. * */ constexpr range_handle(std::nullptr_t) noexcept {} /** * @brief Returns the `range_handle`'s value * * @return value_type The handle's value */ constexpr value_type get_value() const noexcept { return _range_id; } private: /// Sentinel value for a null handle that corresponds to no range static constexpr value_type null_range_id = nvtxRangeId_t{0}; value_type _range_id{null_range_id}; ///< The underlying NVTX range id }; /** * @brief Compares two range_handles for equality * * @param lhs The first range_handle to compare * @param rhs The second range_handle to compare */ inline constexpr bool operator==(range_handle lhs, range_handle rhs) noexcept { return lhs.get_value() == rhs.get_value(); } /** * @brief Compares two range_handles for inequality * * @param lhs The first range_handle to compare * @param rhs The second range_handle to compare */ inline constexpr bool operator!=(range_handle lhs, range_handle rhs) noexcept { return !(lhs == rhs); } /** * @brief Manually begin an NVTX range. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * Example: * \code{.cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range_in(attr); * ... * nvtx3::end_range_in(h); // End the range * \endcode * * @tparam D Type containing `name` member used to identify the `domain` * to which the range belongs. 
Else, `domain::global` to indicate that the * global NVTX domain should be used. * @param[in] attr `event_attributes` that describes the desired attributes * of the range. * @return Unique handle to be passed to `end_range_in` to end the range. */ template inline range_handle start_range_in(event_attributes const& attr) noexcept { # ifndef NVTX_DISABLE return range_handle{nvtxDomainRangeStartEx(domain::get(), attr.get())}; # else (void) attr; return {}; # endif } /** * @brief Manually begin an NVTX range. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * This overload uses `args...` to construct an `event_attributes` to * associate with the range. For more detail, see `event_attributes`. * * Example: * \code{cpp} * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range_in("msg", nvtx3::rgb{127,255,0}); * ... * nvtx3::end_range_in(h); // Ends the range * \endcode * * @tparam D Type containing `name` member used to identify the `domain` * to which the range belongs. Else, `domain::global` to indicate that the * global NVTX domain should be used. * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. * @return Unique handle to be passed to `end_range` to end the range. */ template inline range_handle start_range_in(Args const&... args) noexcept { # ifndef NVTX_DISABLE return start_range_in(event_attributes{args...}); # else return {}; # endif } /** * @brief Manually begin an NVTX range in the global domain. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range/end_range` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * Example: * \code{.cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range(attr); * ... * nvtx3::end_range(h); // End the range * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. * @return Unique handle to be passed to `end_range_in` to end the range. */ inline range_handle start_range(event_attributes const& attr) noexcept { # ifndef NVTX_DISABLE return start_range_in(attr); # else (void) attr; return {}; # endif } /** * @brief Manually begin an NVTX range in the global domain. * * Explicitly begins an NVTX range and returns a unique handle. To end the * range, pass the handle to `end_range_in()`. * * `nvtx3::start_range(...)` is equivalent to `nvtx3::start_range_in<>(...)` and * `nvtx3::start_range_in(...)`. * * `start_range_in/end_range_in` are the most explicit and lowest level APIs * provided for creating ranges. Use of `nvtx3::unique_range_in` should be * preferred unless one is unable to tie the range to the lifetime of an object. * * This overload uses `args...` to construct an `event_attributes` to * associate with the range. 
For more detail, see `event_attributes`. * * Example: * \code{cpp} * // Manually begin a range * nvtx3::range_handle h = nvtx3::start_range("msg", nvtx3::rgb{127,255,0}); * ... * nvtx3::end_range(h); // Ends the range * \endcode * * @param args[in] Variadic parameter pack of the arguments for an `event_attributes`. * @return Unique handle to be passed to `end_range` to end the range. */ template inline range_handle start_range(Args const&... args) noexcept { # ifndef NVTX_DISABLE return start_range_in(args...); # else return {}; # endif } /** * @brief Manually end the range associated with the handle `r` in domain `D`. * * Explicitly ends the NVTX range indicated by the handle `r` returned from a * prior call to `start_range_in`. The range may end on a different thread * from where it began. * * @tparam D Type containing `name` member used to identify the `domain` to * which the range belongs. Else, `domain::global` to indicate that the global * NVTX domain should be used. * @param r Handle to a range started by a prior call to `start_range_in`. * * @warning The domain type specified as template parameter to this function * must be the same that was specified on the associated `start_range_in` call. */ template inline void end_range_in(range_handle r) noexcept { # ifndef NVTX_DISABLE nvtxDomainRangeEnd(domain::get(), r.get_value()); # else (void) r; # endif } /** * @brief Manually end the range associated with the handle `r` in the global * domain. * * Explicitly ends the NVTX range indicated by the handle `r` returned from a * prior call to `start_range`. The range may end on a different thread from * where it began. * * @param r Handle to a range started by a prior call to `start_range`. * * @warning The domain type specified as template parameter to this function * must be the same that was specified on the associated `start_range` call. */ inline void end_range(range_handle r) noexcept { # ifndef NVTX_DISABLE end_range_in(r); # else (void) r; # endif } /** * @brief A RAII object for creating a NVTX range within a domain that can * be created and destroyed on different threads. * * When constructed, begins a NVTX range in the specified domain. Upon * destruction, ends the NVTX range. * * Similar to `nvtx3::scoped_range_in`, with a few key differences: * - `unique_range` objects can be destroyed in an order whereas `scoped_range` objects must be * destroyed in exact reverse creation order * - `unique_range` can start and end on different threads * - `unique_range` is moveable * - `unique_range` objects can be constructed as heap objects * * There is extra overhead associated with `unique_range` constructs and therefore use of * `nvtx3::scoped_range_in` should be preferred. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. */ template class unique_range_in { public: /** * @brief Construct a new unique_range_in object with the specified event attributes * * Example: * \code{cpp} * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; * nvtx3::unique_range_in range{attr}; // Creates a range with message contents * // "msg" and green color * \endcode * * @param[in] attr `event_attributes` that describes the desired attributes * of the range. 
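 *
 * Because `unique_range_in` is moveable and may live on the heap, it can
 * outlive the scope that created it. A hedged sketch (the `async_job` type
 * is hypothetical):
 * \code{.cpp}
 * struct async_job {
 *   nvtx3::unique_range range{"job"}; // ends when the job is destroyed,
 *                                     // possibly on another thread
 * };
 * auto job = std::make_unique<async_job>(); // requires <memory>
 * \endcode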
*/ explicit unique_range_in(event_attributes const& attr) noexcept : handle_{start_range_in(attr)} {} /** * @brief Constructs a `unique_range_in` from the constructor arguments * of an `event_attributes`. * * Forwards the arguments `args...` to construct an * `event_attributes` object. The `event_attributes` object is then * associated with the `unique_range_in`. * * For more detail, see `event_attributes` documentation. * * Example: * \code{.cpp} * // Creates a range with message "message" and green color * nvtx3::unique_range_in<> r{"message", nvtx3::rgb{127,255,0}}; * \endcode * * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` * associated with this range. */ template explicit unique_range_in(Args const&... args) noexcept : unique_range_in{event_attributes{args...}} {} /** * @brief Default constructor creates a `unique_range_in` with no * message, color, payload, nor category. * */ constexpr unique_range_in() noexcept : unique_range_in{event_attributes{}} {} /** * @brief Destroy the `unique_range_in` ending the range. * */ ~unique_range_in() noexcept = default; /** * @brief Move constructor allows taking ownership of the NVTX range from * another `unique_range_in`. * * @param other The range to take ownership of */ unique_range_in(unique_range_in&& other) noexcept = default; /** * @brief Move assignment operator allows taking ownership of an NVTX range * from another `unique_range_in`. * * @param other The range to take ownership of */ unique_range_in& operator=(unique_range_in&& other) noexcept = default; /// Copy construction is not allowed to prevent multiple objects from owning /// the same range handle unique_range_in(unique_range_in const&) = delete; /// Copy assignment is not allowed to prevent multiple objects from owning the /// same range handle unique_range_in& operator=(unique_range_in const&) = delete; private: struct end_range_handle { using pointer = range_handle; /// Override the pointer type of the unique_ptr void operator()(range_handle h) const noexcept { end_range_in(h); } }; /// Range handle used to correlate the start/end of the range std::unique_ptr handle_; }; /** * @brief Alias for a `unique_range_in` in the global NVTX domain. * */ using unique_range = unique_range_in; /** * @brief Annotates an instantaneous point in time with a "marker", using the * attributes specified by `attr`. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; * nvtx3::mark_in(attr); * } * \endcode * * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. * @param[in] attr `event_attributes` that describes the desired attributes * of the mark. */ template inline void mark_in(event_attributes const& attr) noexcept { # ifndef NVTX_DISABLE nvtxDomainMarkEx(domain::get(), attr.get()); # else (void) (attr); # endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * arguments to construct an `event_attributes`. 
* * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark_in("operation failed!", nvtx3::rgb{255,0,0}); * } * \endcode * * Note that nvtx3::mark_in is a function, not a class like scoped_range_in. * * Forwards the arguments `args...` to construct an `event_attributes` object. * The attributes are then associated with the marker. For more detail, see * the `event_attributes` documentation. * * @tparam D Type containing `name` member used to identify the `domain` * to which the `unique_range_in` belongs. Else `domain::global` to * indicate that the global NVTX domain should be used. * @param[in] args Variadic parameter pack of arguments to construct an `event_attributes` * associated with this range. * */ template inline void mark_in(Args const&... args) noexcept { # ifndef NVTX_DISABLE mark_in(event_attributes{args...}); # endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * attributes specified by `attr`, in the global domain. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::event_attributes attr{"operation failed!", nvtx3::rgb{255,0,0}}; * nvtx3::mark(attr); * } * \endcode * * Note that nvtx3::mark is a function, not a class like scoped_range. * * @param[in] attr `event_attributes` that describes the desired attributes * of the mark. */ inline void mark(event_attributes const& attr) noexcept { # ifndef NVTX_DISABLE mark_in(attr); # endif } /** * @brief Annotates an instantaneous point in time with a "marker", using the * arguments to construct an `event_attributes`, in the global domain. * * Unlike a "range" which has a beginning and an end, a marker is a single event * in an application, such as detecting a problem: * * \code{.cpp} * bool success = do_operation(...); * if (!success) { * nvtx3::mark("operation failed!", nvtx3::rgb{255,0,0}); * } * \endcode * * Note that nvtx3::mark is a function, not a class like scoped_range. * * Forwards the arguments `args...` to construct an `event_attributes` object. * The attributes are then associated with the marker. For more detail, see * the `event_attributes` documentation. * * @param[in] args Variadic parameter pack of arguments to construct an * `event_attributes` associated with this range. * */ template inline void mark(Args const&... args) noexcept { # ifndef NVTX_DISABLE mark_in(args...); # endif } } // namespace NVTX3_VERSION_NAMESPACE } // namespace nvtx3 # ifndef NVTX_DISABLE /** * @brief Convenience macro for generating a range in the specified `domain` * from the lifetime of a function * * This macro is useful for generating an NVTX range in `domain` from * the entry point of a function to its exit. It is intended to be the first * line of the function. * * Constructs a static `registered_string_in` using the name of the immediately * enclosing function returned by `__func__` and constructs a * `nvtx3::scoped_range` using the registered function name as the range's * message. * * Example: * \code{.cpp} * struct my_domain{static constexpr char const* name{"my_domain"};}; * * void foo(...) { * NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo() * // do stuff * ... 
* } // Range ends on return from foo() * \endcode * * @param[in] D Type containing `name` member used to identify the * `domain` to which the `registered_string_in` belongs. Else, * `domain::global` to indicate that the global NVTX domain should be used. */ # define NVTX3_V1_FUNC_RANGE_IN(D) \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; /** * @brief Convenience macro for generating a range in the specified `domain` * from the lifetime of a function if the given boolean expression evaluates * to true. * * Similar to `NVTX3_V1_FUNC_RANGE_IN(D)`, the only difference being that * `NVTX3_V1_FUNC_RANGE_IF_IN(D, C)` only generates a range if the given boolean * expression evaluates to true. * * @param[in] D Type containing `name` member used to identify the * `domain` to which the `registered_string_in` belongs. Else, * `domain::global` to indicate that the global NVTX domain should be used. * * @param[in] C Boolean expression used to determine if a range should be * generated. */ # define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) \ ::nvtx3::v1::detail::optional_scoped_range_in optional_nvtx3_range__; \ if (C) \ { \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ optional_nvtx3_range__.begin(nvtx3_func_attr__); \ } # else # define NVTX3_V1_FUNC_RANGE_IN(D) # define NVTX3_V1_FUNC_RANGE_IF_IN(D, C) # endif // NVTX_DISABLE /** * @brief Convenience macro for generating a range in the global domain from the * lifetime of a function. * * This macro is useful for generating an NVTX range in the global domain from * the entry point of a function to its exit. It is intended to be the first * line of the function. * * Constructs a static `registered_string_in` using the name of the immediately * enclosing function returned by `__func__` and constructs a * `nvtx3::scoped_range` using the registered function name as the range's * message. * * Example: * \code{.cpp} * void foo(...) { * NVTX3_FUNC_RANGE(); // Range begins on entry to foo() * // do stuff * ... * } // Range ends on return from foo() * \endcode */ # define NVTX3_V1_FUNC_RANGE() NVTX3_V1_FUNC_RANGE_IN(::nvtx3::v1::domain::global) /** * @brief Convenience macro for generating a range in the global domain from the * lifetime of a function if the given boolean expression evaluates to true. * * Similar to `NVTX3_V1_FUNC_RANGE()`, the only difference being that * `NVTX3_V1_FUNC_RANGE_IF(C)` only generates a range if the given boolean * expression evaluates to true. * * @param[in] C Boolean expression used to determine if a range should be * generated. */ # define NVTX3_V1_FUNC_RANGE_IF(C) NVTX3_V1_FUNC_RANGE_IF_IN(::nvtx3::v1::domain::global, C) /* When inlining this version, versioned macros must have unversioned aliases. 
* For each NVTX3_Vx_ #define, make an NVTX3_ alias of it here.*/ # if defined(NVTX3_INLINE_THIS_VERSION) /* clang format off */ # define NVTX3_FUNC_RANGE NVTX3_V1_FUNC_RANGE # define NVTX3_FUNC_RANGE_IF NVTX3_V1_FUNC_RANGE_IF # define NVTX3_FUNC_RANGE_IN NVTX3_V1_FUNC_RANGE_IN # define NVTX3_FUNC_RANGE_IF_IN NVTX3_V1_FUNC_RANGE_IF_IN /* clang format on */ # endif #endif // NVTX3_CPP_DEFINITIONS_V1_0 /* Add functionality for new minor versions here, by copying the above section enclosed * in #ifndef NVTX3_CPP_DEFINITIONS_Vx_y, and incrementing the minor version. This code * is an example of how additions for version 1.2 would look, indented for clarity. Note * that the versioned symbols and macros are always provided, and the unversioned symbols * are only provided if NVTX3_INLINE_THIS_VERSION was defined at the top of this header. * * \code{.cpp} * #ifndef NVTX3_CPP_DEFINITIONS_V1_2 * #define NVTX3_CPP_DEFINITIONS_V1_2 * namespace nvtx3 { * NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE { * class new_class {}; * inline void new_function() {} * } * } * * // Macros must have the major version in their names: * #define NVTX3_V1_NEW_MACRO_A() ... * #define NVTX3_V1_NEW_MACRO_B() ... * * // If inlining, make aliases for the macros with the version number omitted * #if defined(NVTX3_INLINE_THIS_VERSION) * #define NVTX3_NEW_MACRO_A NVTX3_V1_NEW_MACRO_A * #define NVTX3_NEW_MACRO_B NVTX3_V1_NEW_MACRO_B * #endif * #endif // NVTX3_CPP_DEFINITIONS_V1_2 * \endcode */ /* Undefine all temporarily-defined unversioned macros, which would conflict with * subsequent includes of different versions of this header. */ #undef NVTX3_CPP_VERSION_MAJOR #undef NVTX3_CPP_VERSION_MINOR #undef NVTX3_CONCAT #undef NVTX3_NAMESPACE_FOR #undef NVTX3_VERSION_NAMESPACE #undef NVTX3_INLINE_IF_REQUESTED #undef NVTX3_CONSTEXPR_IF_CPP14 #if defined(NVTX3_INLINE_THIS_VERSION) # undef NVTX3_INLINE_THIS_VERSION #endif #if defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE) # undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET_DEFINED_HERE # undef NVTX3_USE_CHECKED_OVERLOADS_FOR_GET #endif #if defined(NVTX3_STATIC_ASSERT_DEFINED_HERE) # undef NVTX3_STATIC_ASSERT_DEFINED_HERE # undef NVTX3_STATIC_ASSERT #endif cccl-2.5.0/cub/cub/detail/strong_load.cuh000066400000000000000000000150241463375617100202410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file Utilities for strong memory operations. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { static _CCCL_DEVICE _CCCL_FORCEINLINE uint4 load_relaxed(uint4 const* ptr) { uint4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : _CUB_ASM_PTR_(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 load_relaxed(ulonglong2 const* ptr) { ulonglong2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : _CUB_ASM_PTR_(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE ushort4 load_relaxed(ushort4 const* ptr) { ushort4 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : _CUB_ASM_PTR_(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE uint2 load_relaxed(uint2 const* ptr) { uint2 retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : _CUB_ASM_PTR_(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long load_relaxed(unsigned long long const* ptr) { unsigned long long retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("ld.relaxed.gpu.u64 %0, [%1];" : "=l"(retval) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : _CUB_ASM_PTR_(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int load_relaxed(unsigned int const* ptr) { unsigned int retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("ld.relaxed.gpu.u32 %0, [%1];" : "=r"(retval) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile("ld.cg.u32 %0, [%1];" : "=r"(retval) : _CUB_ASM_PTR_(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned short load_relaxed(unsigned short const* ptr) { unsigned short 
retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("ld.relaxed.gpu.u16 %0, [%1];" : "=h"(retval) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile("ld.cg.u16 %0, [%1];" : "=h"(retval) : _CUB_ASM_PTR_(ptr) : "memory");)); return retval; } static _CCCL_DEVICE _CCCL_FORCEINLINE unsigned char load_relaxed(unsigned char const* ptr) { unsigned short retval; NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile( "{" " .reg .u8 datum;" " ld.relaxed.gpu.u8 datum, [%1];" " cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : _CUB_ASM_PTR_(ptr) : "memory");), (asm volatile( "{" " .reg .u8 datum;" " ld.cg.u8 datum, [%1];" " cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : _CUB_ASM_PTR_(ptr) : "memory");)); return (unsigned char) retval; } } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/detail/strong_store.cuh000066400000000000000000000244211463375617100204570ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file Utilities for strong memory operations. 
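 *
 * This header defines the internal helpers store_relaxed() and store_release().
 * On SM70 and newer they lower to st.relaxed.gpu and st.release.gpu PTX stores;
 * on older architectures they fall back to st.cg, with store_release()
 * additionally issuing __threadfence() before the store.
 *
 * A minimal sketch of how these helpers can be paired with the load_relaxed()
 * helpers from strong_load.cuh to publish a payload through a flag. This is an
 * illustration only, assuming one producer thread and one consumer thread that
 * polls the flag and uses __threadfence() for ordering:
 *
 * @code
 * __device__ void produce(unsigned int* data, unsigned int* flag, unsigned int value)
 * {
 *   cub::detail::store_relaxed(data, value); // write the payload
 *   cub::detail::store_release(flag, 1u);    // publish: payload ordered before the flag
 * }
 *
 * __device__ unsigned int consume(unsigned int* data, unsigned int* flag)
 * {
 *   while (cub::detail::load_relaxed(flag) != 1u) {} // spin until the flag is observed
 *   __threadfence();                                 // order the payload read after the flag read
 *   return cub::detail::load_relaxed(data);
 * }
 * @endcode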
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");), (asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ulonglong2* ptr, ulonglong2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) : "memory");), (asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");), (asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(uint2* ptr, uint2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) : "memory");), (asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned long long* ptr, unsigned long long val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.relaxed.gpu.u64 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "l"(val) : "memory");), (asm volatile("st.cg.u64 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "l"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned int* ptr, unsigned int val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.relaxed.gpu.u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val) : "memory");), (asm volatile("st.cg.u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned short* ptr, unsigned short val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.relaxed.gpu.u16 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "h"(val) : "memory");), (asm volatile("st.cg.u16 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "h"(val) : "memory");)); } static _CCCL_DEVICE _CCCL_FORCEINLINE void store_relaxed(unsigned char* ptr, unsigned char val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.relaxed.gpu.u8 [%0], datum;" "}" : : _CUB_ASM_PTR_(ptr), "h"((unsigned short) val) : "memory");), (asm volatile("{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" "}" : : _CUB_ASM_PTR_(ptr), "h"((unsigned short) val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint4* ptr, uint4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");), 
(__threadfence(); asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ulonglong2* ptr, ulonglong2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) : "memory");), (__threadfence(); asm volatile("st.cg.v2.u64 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(ushort4* ptr, ushort4 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");), (__threadfence(); asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" : : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(uint2* ptr, uint2 val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) : "memory");), (__threadfence(); asm volatile("st.cg.v2.u32 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned long long* ptr, unsigned long long val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.release.gpu.u64 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "l"(val) : "memory");), (__threadfence(); asm volatile("st.cg.u64 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "l"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned int* ptr, unsigned int val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.release.gpu.u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val) : "memory");), (__threadfence(); asm volatile("st.cg.u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned short* ptr, unsigned short val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("st.release.gpu.u16 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "h"(val) : "memory");), (__threadfence(); asm volatile("st.cg.u16 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "h"(val) : "memory");)); } _CCCL_DEVICE _CCCL_FORCEINLINE void store_release(unsigned char* ptr, unsigned char val) { NV_IF_TARGET( NV_PROVIDES_SM_70, (asm volatile("{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.release.gpu.u8 [%0], datum;" "}" : : _CUB_ASM_PTR_(ptr), "h"((unsigned short) val) : "memory");), (__threadfence(); asm volatile( "{" " .reg .u8 datum;" " cvt.u8.u16 datum, %1;" " st.cg.u8 [%0], datum;" "}" : : _CUB_ASM_PTR_(ptr), "h"((unsigned short) val) : "memory");)); } } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/detail/temporary_storage.cuh000066400000000000000000000210411463375617100214700ustar00rootroot00000000000000/* * Copyright 2021 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace temporary_storage { class slot; template class alias; template class layout; /** * @brief Temporary storage slot that can be considered a C++ union with an * arbitrary fields count. * * @warning slot lifetime is defined by the lifetime of the associated layout. * It's impossible to request new array if layout is already mapped. * * @par A Simple Example * @code * auto slot = temporary_storage.get_slot(0); * * // Add fields into the slot * // Create an int alias with 0 elements: * auto int_array = slot->create_alias(); * // Create a double alias with 2 elements: * auto double_array = slot->create_alias(2); * // Create a char alias with 0 elements: * auto empty_array = slot->create_alias(); * // Slot size is defined by double_array size (2 * sizeof(double)) * * if (condition) * { * int_array.grow(42); * // Now slot size is defined by int_array size (42 * sizeof(int)) * } * * // Temporary storage mapping * // ... * int *d_int_array = int_array.get(); * double *d_double_array = double_array.get(); * char *d_empty_array = empty_array.get(); // Guaranteed to return nullptr * @endcode */ class slot { std::size_t m_size{}; void* m_pointer{}; public: slot() = default; /** * @brief Returns an array of type @p T and length @p elements */ template _CCCL_HOST_DEVICE alias create_alias(std::size_t elements = 0); private: _CCCL_HOST_DEVICE void set_bytes_required(std::size_t new_size) { m_size = (max) (m_size, new_size); } _CCCL_HOST_DEVICE std::size_t get_bytes_required() const { return m_size; } _CCCL_HOST_DEVICE void set_storage(void* ptr) { m_pointer = ptr; } _CCCL_HOST_DEVICE void* get_storage() const { return m_pointer; } template friend class alias; template friend class layout; }; /** * @brief Named memory region of a temporary storage slot * * @par Overview * This class provides a typed wrapper of a temporary slot memory region. * It can be considered as a field in the C++ union. It's only possible to * increase the array size. * * @warning alias lifetime is defined by the lifetime of the associated slot * It's impossible to grow the array if the layout is already mapped. */ template class alias { slot& m_slot; std::size_t m_elements{}; _CCCL_HOST_DEVICE explicit alias(slot& slot, std::size_t elements = 0) : m_slot(slot) , m_elements(elements) { this->update_slot(); } _CCCL_HOST_DEVICE void update_slot() { m_slot.set_bytes_required(m_elements * sizeof(T)); } public: alias() = delete; /** * @brief Increases the number of elements * * @warning * This method should be called before temporary storage mapping stage. * * @param[in] new_elements Increases the memory region occupied in the * temporary slot to fit up to @p new_elements items * of type @p T. */ _CCCL_HOST_DEVICE void grow(std::size_t new_elements) { m_elements = new_elements; this->update_slot(); } /** * @brief Returns pointer to array * * If the @p elements number is equal to zero, or storage layout isn't mapped, * @p nullptr is returned. 
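 *
 * A minimal sketch of the unmapped/mapped behaviour (assuming a single-slot
 * layout holding one int field; the variable names are illustrative only):
 *
 * @code
 * cub::detail::temporary_storage::layout<1> storage;
 * auto* slot  = storage.get_slot(0);
 * auto  array = slot->create_alias<int>(4);
 *
 * int* before = array.get(); // nullptr: the layout has not been mapped yet
 *
 * void* d_temp_storage           = nullptr;
 * std::size_t temp_storage_bytes = storage.get_size();
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * storage.map_to_buffer(d_temp_storage, temp_storage_bytes);
 *
 * int* after = array.get(); // now points into d_temp_storage
 * @endcode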
*/ _CCCL_HOST_DEVICE T* get() const { if (m_elements == 0) { return nullptr; } return reinterpret_cast(m_slot.get_storage()); } friend class slot; }; template _CCCL_HOST_DEVICE alias slot::create_alias(std::size_t elements) { return alias(*this, elements); } /** * @brief Temporary storage layout represents a structure with * @p SlotsCount union-like fields * * The layout can be mapped to a temporary buffer only once. * * @par A Simple Example * @code * cub::detail::temporary_storage::layout<3> temporary_storage; * * auto slot_1 = temporary_storage.get_slot(0); * auto slot_2 = temporary_storage.get_slot(1); * * // Add fields into the first slot * auto int_array = slot_1->create_alias(1); * auto double_array = slot_1->create_alias(2); * * // Add fields into the second slot * auto char_array = slot_2->create_alias(); * * // The equivalent C++ structure could look like * // struct StorageLayout * // { * // union { * // } slot_0; * // std::byte padding_0[256 - sizeof (slot_0)]; * // * // union { * // int alias_0[1]; * // double alias_1[2]; * // } slot_1; * // std::byte padding_1[256 - sizeof (slot_1)]; * // * // union { * // char alias_0[0]; * // } slot_2; * // std::byte padding_2[256 - sizeof (slot_2)]; * // }; * * // The third slot is empty * * // Temporary storage mapping * if (d_temp_storage == nullptr) * { * temp_storage_bytes = temporary_storage.get_size(); * return; * } * else * { * temporary_storage.map_to_buffer(d_temp_storage, temp_storage_bytes); * } * * // Use pointers * int *d_int_array = int_array.get(); * double *d_double_array = double_array.get(); * char *d_char_array = char_array.get(); * @endcode */ template class layout { slot m_slots[SlotsCount]; std::size_t m_sizes[SlotsCount]; void* m_pointers[SlotsCount]; bool m_layout_was_mapped{}; public: layout() = default; _CCCL_HOST_DEVICE slot* get_slot(int slot_id) { if (slot_id < SlotsCount) { return &m_slots[slot_id]; } return nullptr; } /** * @brief Returns required temporary storage size in bytes */ _CCCL_HOST_DEVICE std::size_t get_size() { this->prepare_interface(); // AliasTemporaries can return error only in mapping stage, // so it's safe to ignore it here. std::size_t temp_storage_bytes{}; AliasTemporaries(nullptr, temp_storage_bytes, m_pointers, m_sizes); if (temp_storage_bytes == 0) { // The current CUB convention implies that there are two stages for each // device-scope function call. The first one returns the required storage // size. The second stage consumes temporary storage to perform some work. // The only way to distinguish between the two stages is by checking the // value of the temporary storage pointer. If zero bytes are requested, // `cudaMalloc` will return `nullptr`. This fact makes it impossible to // distinguish between the two stages, so we request some fixed amount of // bytes (even if we don't need it) to have a non-null temporary storage // pointer. return 1; } return temp_storage_bytes; } /** * @brief Maps the layout to the temporary storage buffer. 
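 *
 * A layout can be mapped at most once; a second call returns
 * cudaErrorAlreadyMapped. A minimal two-phase usage sketch (assuming `storage`
 * is a layout whose slots and aliases have already been described):
 *
 * @code
 * std::size_t temp_storage_bytes = storage.get_size();
 *
 * void* d_temp_storage = nullptr;
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 * cudaError_t error = storage.map_to_buffer(d_temp_storage, temp_storage_bytes);
 * // After mapping, alias::get() returns pointers into d_temp_storage.
 * @endcode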
*/ _CCCL_HOST_DEVICE cudaError_t map_to_buffer(void* d_temp_storage, std::size_t temp_storage_bytes) { if (m_layout_was_mapped) { return cudaErrorAlreadyMapped; } this->prepare_interface(); cudaError_t error = cudaSuccess; if ((error = AliasTemporaries(d_temp_storage, temp_storage_bytes, m_pointers, m_sizes))) { return error; } for (std::size_t slot_id = 0; slot_id < SlotsCount; slot_id++) { m_slots[slot_id].set_storage(m_pointers[slot_id]); } m_layout_was_mapped = true; return error; } private: _CCCL_HOST_DEVICE void prepare_interface() { if (m_layout_was_mapped) { return; } for (std::size_t slot_id = 0; slot_id < SlotsCount; slot_id++) { const std::size_t slot_size = m_slots[slot_id].get_bytes_required(); m_sizes[slot_id] = slot_size; m_pointers[slot_id] = nullptr; } } }; } // namespace temporary_storage } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/detail/type_traits.cuh000066400000000000000000000054451463375617100203030ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Wrappers and extensions around utilities. 
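 *
 * In particular, this header provides invoke_result_t (a spelling of the result
 * type of invoking a callable that works in both pre-C++17 and C++17 modes) and
 * accumulator_t, the intermediate accumulator type of a reduction as specified
 * by P2322R6.
 *
 * A small sketch of the intent, assuming the accumulator is formed from the
 * invocable, the initial value type, and the input value type, in that order:
 *
 * @code
 * #include <cub/thread/thread_operators.cuh> // cub::Sum
 *
 * // int + float promotes to float, so float is the intermediate accumulator:
 * static_assert(::cuda::std::is_same<cub::detail::accumulator_t<cub::Sum, int, float>, float>::value, "");
 * @endcode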
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include _CCCL_SUPPRESS_DEPRECATED_PUSH #include _CCCL_SUPPRESS_DEPRECATED_POP #include CUB_NAMESPACE_BEGIN namespace detail { template using invoke_result_t = #if _CCCL_STD_VER < 2017 typename ::cuda::std::result_of::type; #else // 2017+ ::cuda::std::invoke_result_t; #endif /// The type of intermediate accumulator (according to P2322R6) template using accumulator_t = typename ::cuda::std::decay>::type; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/detail/uninitialized_copy.cuh000066400000000000000000000055171463375617100216360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN namespace detail { #if defined(_NVHPC_CUDA) template _CCCL_HOST_DEVICE void uninitialized_copy(T* ptr, U&& val) { // NVBug 3384810 new (ptr) T(::cuda::std::forward(val)); } #else template ::value, int>::type = 0> _CCCL_HOST_DEVICE void uninitialized_copy(T* ptr, U&& val) { *ptr = ::cuda::std::forward(val); } template ::value, int>::type = 0> _CCCL_HOST_DEVICE void uninitialized_copy(T* ptr, U&& val) { new (ptr) T(::cuda::std::forward(val)); } #endif } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/000077500000000000000000000000001463375617100152205ustar00rootroot00000000000000cccl-2.5.0/cub/cub/device/device_adjacent_difference.cuh000066400000000000000000000607641463375617100232000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceAdjacentDifference provides device-wide, parallel operations for //! computing the differences of adjacent elements residing within //! device-accessible memory. //! //! Overview //! ++++++++++++++++++++++++++ //! //! - DeviceAdjacentDifference calculates the differences of adjacent elements in //! d_input. 
Because the binary operation could be noncommutative, there //! are two sets of methods. Methods named SubtractLeft subtract left element //! ``*(i - 1)`` of input sequence from current element ``*i``. //! Methods named ``SubtractRight`` subtract current element ``*i`` from the //! right one ``*(i + 1)``: //! //! .. code-block:: c++ //! //! int *d_values; // [1, 2, 3, 4] //! //... //! int *d_subtract_left_result <-- [ 1, 1, 1, 1 ] //! int *d_subtract_right_result <-- [ -1, -1, -1, 4 ] //! //! - For SubtractLeft, if the left element is out of bounds, the iterator is //! assigned to ``*(result + (i - first))`` without modification. //! - For SubtractRight, if the right element is out of bounds, the iterator is //! assigned to ``*(result + (i - first))`` without modification. //! //! Snippet //! ++++++++++++++++++++++++++ //! //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` to //! compute the left difference between adjacent elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! int num_items; // e.g., 8 //! int *d_values; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] //! //... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! //! cub::DeviceAdjacentDifference::SubtractLeft( //! d_temp_storage, temp_storage_bytes, d_values, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run operation //! cub::DeviceAdjacentDifference::SubtractLeft( //! d_temp_storage, temp_storage_bytes, d_values, num_items); //! //! // d_values <-- [1, 1, -1, 1, -1, 1, -1, 1] //! //! @endrst struct DeviceAdjacentDifference { private: template static CUB_RUNTIME_FUNCTION cudaError_t AdjacentDifference( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream) { using OffsetT = detail::choose_offset_t; using DispatchT = DispatchAdjacentDifference; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast(num_items), difference_op, stream); } public: //! @rst //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory //! //! Overview //! ++++++++++++++++++++++++++ //! //! - Calculates the differences of adjacent elements in ``d_input``. //! That is, ``*d_input`` is assigned to ``*d_output``, and, for each iterator ``i`` in the //! range ``[d_input + 1, d_input + num_items)``, the result of //! ``difference_op(*i, *(i - 1))`` is assigned to ``*(d_output + (i - d_input))``. //! - Note that the behavior is undefined if the input and output ranges //! overlap in any way. //! //! Snippet //! ++++++++++++++++++++++++++ //! //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` //! to compute the difference between adjacent elements. //! //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers //! int num_items; // e.g., 8 //! int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] //! int *d_output; //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! 
size_t temp_storage_bytes = 0; //! //! cub::DeviceAdjacentDifference::SubtractLeftCopy( //! d_temp_storage, temp_storage_bytes, //! d_input, d_output, //! num_items, CustomDifference()); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run operation //! cub::DeviceAdjacentDifference::SubtractLeftCopy( //! d_temp_storage, temp_storage_bytes, //! d_input, d_output, //! num_items, CustomDifference()); //! //! // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] //! // d_output <-- [1, 1, -1, 1, -1, 1, -1, 1] //! //! @endrst //! //! @tparam InputIteratorT //! @rst //! is a model of `Input Iterator `_, //! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then //! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to //! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type //! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of //! ``value_types``. //! @endrst //! //! @tparam OutputIteratorT //! @rst //! is a model of `Output Iterator `_. //! @endrst //! //! @tparam DifferenceOpT //! Its `result_type` is convertible to a type in `OutputIteratorT`'s set of `value_types`. //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_input //! Pointer to the input sequence //! //! @param[out] d_output //! Pointer to the output sequence //! //! @param[in] num_items //! Number of items in the input sequence //! //! @param[in] difference_op //! The binary function used to compute differences //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0` //! @endrst template static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeftCopy"); constexpr bool may_alias = false; constexpr bool read_left = true; return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractLeftCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } //! @rst //! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Calculates the differences of adjacent elements in ``d_input``. That is, for //! each iterator ``i`` in the range ``[d_input + 1, d_input + num_items)``, the //! result of ``difference_op(*i, *(i - 1))`` is assigned to //! ``*(d_input + (i - d_input))``. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! 
The code snippet below illustrates how to use ``DeviceAdjacentDifference`` //! to compute the difference between adjacent elements. //! //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers //! int num_items; // e.g., 8 //! int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceAdjacentDifference::SubtractLeft( //! d_temp_storage, temp_storage_bytes, //! d_data, num_items, CustomDifference()); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run operation //! cub::DeviceAdjacentDifference::SubtractLeft( //! d_temp_storage, temp_storage_bytes, //! d_data, num_items, CustomDifference()); //! //! // d_data <-- [1, 1, -1, 1, -1, 1, -1, 1] //! //! @endrst //! //! @tparam RandomAccessIteratorT //! @rst //! is a model of `Random Access Iterator `_, //! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of //! ``RandomAccessIteratorT``'s ``value_type``, and ``x - y`` is defined, then the //! return type of ``x - y`` should be convertible to a type in //! ``RandomAccessIteratorT``'s set of ``value_types``. //! @endrst //! //! @tparam DifferenceOpT //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s //! set of `value_types`. //! //! @tparam NumItemsT //! **[inferred]** Type of `num_items` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_input //! Pointer to the input sequence and the result //! //! @param[in] num_items //! Number of items in the input sequence //! //! @param[in] difference_op //! The binary function used to compute differences //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( void* d_temp_storage, std::size_t& temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeft"); constexpr bool may_alias = true; constexpr bool read_left = true; return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft( void* d_temp_storage, std::size_t& temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractLeft(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. //! //! Overview //! ++++++++++++++++++++++++++ //! //! - Calculates the right differences of adjacent elements in ``d_input``. //! 
That is, ``*(d_input + num_items - 1)`` is assigned to //! ``*(d_output + num_items - 1)``, and, for each iterator ``i`` in the range //! ``[d_input, d_input + num_items - 1)``, the result of //! ``difference_op(*i, *(i + 1))`` is assigned to //! ``*(d_output + (i - d_input))``. //! - Note that the behavior is undefined if the input and output ranges //! overlap in any way. //! //! Snippet //! ++++++++++++++++++++++++++ //! //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` //! to compute the difference between adjacent elements. //! //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! struct CustomDifference //! { //! template //! __host__ DataType operator()(DataType &lhs, DataType &rhs) //! { //! return lhs - rhs; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers //! int num_items; // e.g., 8 //! int *d_input; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] //! int *d_output; //! .. //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceAdjacentDifference::SubtractRightCopy( //! d_temp_storage, temp_storage_bytes, //! d_input, d_output, num_items, CustomDifference()); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run operation //! cub::DeviceAdjacentDifference::SubtractRightCopy( //! d_temp_storage, temp_storage_bytes, //! d_input, d_output, num_items, CustomDifference()); //! //! // d_input <-- [1, 2, 1, 2, 1, 2, 1, 2] //! // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] //! //! @endrst //! //! @tparam InputIteratorT //! @rst //! is a model of `Input Iterator `_, //! and ``x`` and ``y`` are objects of ``InputIteratorT``'s ``value_type``, then //! ``x - y`` is defined, and ``InputIteratorT``'s ``value_type`` is convertible to //! a type in ``OutputIteratorT``'s set of ``value_types``, and the return type //! of ``x - y`` is convertible to a type in ``OutputIteratorT``'s set of //! ``value_types``. //! @endrst //! //! @tparam OutputIteratorT //! @rst //! is a model of `Output Iterator `_. //! @endrst //! //! @tparam DifferenceOpT //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s //! set of `value_types`. //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_input //! Pointer to the input sequence //! //! @param[out] d_output //! Pointer to the output sequence //! //! @param[in] num_items //! Number of items in the input sequence //! //! @param[in] difference_op //! The binary function used to compute differences. //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRightCopy"); constexpr bool may_alias = false; constexpr bool read_left = false; return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractRightCopy(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } //! @rst //! Subtracts the right element of each adjacent pair of elements residing within device-accessible memory. //! //! Overview //! ++++++++++++++++++++++++++ //! //! Calculates the right differences of adjacent elements in ``d_input``. //! That is, for each iterator ``i`` in the range //! ``[d_input, d_input + num_items - 1)``, the result of //! ``difference_op(*i, *(i + 1))`` is assigned to ``*(d_input + (i - d_input))``. //! //! Snippet //! ++++++++++++++++++++++++++ //! //! The code snippet below illustrates how to use ``DeviceAdjacentDifference`` //! to compute the difference between adjacent elements. //! //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! int num_items; // e.g., 8 //! int *d_data; // e.g., [1, 2, 1, 2, 1, 2, 1, 2] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceAdjacentDifference::SubtractRight( //! d_temp_storage, temp_storage_bytes, d_data, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run operation //! cub::DeviceAdjacentDifference::SubtractRight( //! d_temp_storage, temp_storage_bytes, d_data, num_items); //! //! // d_data <-- [-1, 1, -1, 1, -1, 1, -1, 2] //! //! @endrst //! //! @tparam RandomAccessIteratorT //! @rst //! is a model of `Random Access Iterator `_, //! ``RandomAccessIteratorT`` is mutable. If ``x`` and ``y`` are objects of //! ``RandomAccessIteratorT``'s `value_type`, and ``x - y`` is defined, then the //! return type of ``x - y`` should be convertible to a type in //! ``RandomAccessIteratorT``'s set of ``value_types``. //! @endrst //! //! @tparam DifferenceOpT //! Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s //! set of `value_types`. //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_input //! Pointer to the input sequence //! //! @param[in] num_items //! Number of items in the input sequence //! //! @param[in] difference_op //! The binary function used to compute differences //! //! @param[in] stream //! @rst //! 
**[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight( void* d_temp_storage, std::size_t& temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRight"); constexpr bool may_alias = true; constexpr bool read_left = false; return AdjacentDifference( d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight( void* d_temp_storage, std::size_t& temp_storage_bytes, RandomAccessIteratorT d_input, NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SubtractRight(d_temp_storage, temp_storage_bytes, d_input, num_items, difference_op, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_copy.cuh000066400000000000000000000170731463375617100202220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceCopy provides device-wide, parallel operations for copying data. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @brief cub::DeviceCopy provides device-wide, parallel operations for copying data. struct DeviceCopy { //! @rst //! Copies data from a batch of given source ranges to their corresponding destination ranges. //! //! .. note:: //! //! 
If any input range aliases any output range the behavior is undefined. //! If any output range aliases another output range the behavior is undefined. //! Input ranges can alias one another. //! //! Snippet //! +++++++ //! //! The code snippet below illustrates usage of DeviceCopy::Batched to perform a DeviceRunLength Decode operation. //! //! .. code-block:: c++ //! //! struct GetIteratorToRange //! { //! __host__ __device__ __forceinline__ auto operator()(uint32_t index) //! { //! return thrust::make_constant_iterator(d_data_in[index]); //! } //! int32_t *d_data_in; //! }; //! //! struct GetPtrToRange //! { //! __host__ __device__ __forceinline__ auto operator()(uint32_t index) //! { //! return d_data_out + d_offsets[index]; //! } //! int32_t *d_data_out; //! uint32_t *d_offsets; //! }; //! //! struct GetRunLength //! { //! __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index) //! { //! return d_offsets[index + 1] - d_offsets[index]; //! } //! uint32_t *d_offsets; //! }; //! //! uint32_t num_ranges = 5; //! int32_t *d_data_in; // e.g., [4, 2, 7, 3, 1] //! int32_t *d_data_out; // e.g., [0, ... ] //! uint32_t *d_offsets; // e.g., [0, 2, 5, 6, 9, 14] //! //! // Returns a constant iterator to the element of the i-th run //! thrust::counting_iterator iota(0); //! auto iterators_in = thrust::make_transform_iterator(iota, GetIteratorToRange{d_data_in}); //! //! // Returns the run length of the i-th run //! auto sizes = thrust::make_transform_iterator(iota, GetRunLength{d_offsets}); //! //! // Returns pointers to the output range for each run //! auto ptrs_out = thrust::make_transform_iterator(iota, GetPtrToRange{d_data_out, d_offsets}); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, //! num_ranges); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run batched copy algorithm (used to perform runlength decoding) //! cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, iterators_in, ptrs_out, sizes, //! num_ranges); //! //! // d_data_out <-- [4, 4, 2, 2, 2, 7, 3, 3, 3, 1, 1, 1, 1, 1] //! //! @endrst //! //! @tparam InputIt //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges //! //! @tparam OutputIt //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to //! the destination ranges //! //! @tparam SizeIteratorT //! **[inferred]** Device-accessible random-access input iterator type providing the number of items to be //! copied for each pair of ranges //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] input_it //! Device-accessible iterator providing the iterators to the source ranges //! //! @param[in] output_it //! Device-accessible iterator providing the iterators to the destination ranges //! //! @param[in] sizes //! Device-accessible iterator providing the number of elements to be copied for each pair of ranges //! //! @param[in] num_ranges //! The total number of range pairs //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. 
Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Batched( void* d_temp_storage, size_t& temp_storage_bytes, InputIt input_it, OutputIt output_it, SizeIteratorT sizes, uint32_t num_ranges, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceCopy::Batched"); // Integer type large enough to hold any offset in [0, num_ranges) using RangeOffsetT = uint32_t; // Integer type large enough to hold any offset in [0, num_thread_blocks_launched), where a safe // uppper bound on num_thread_blocks_launched can be assumed to be given by // IDIV_CEIL(num_ranges, 64) using BlockOffsetT = uint32_t; return detail::DispatchBatchMemcpy< InputIt, OutputIt, SizeIteratorT, RangeOffsetT, BlockOffsetT, detail::DeviceBatchMemcpyPolicy, false>::Dispatch(d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_for.cuh000066400000000000000000000740701463375617100200360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace for_each { /** * `op_wrapper_t` turns bulk into a for-each operation by wrapping the user-provided unary operator. */ template struct op_wrapper_t { RandomAccessIteratorT input; OpT op; _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i) { // Dereferencing `thrust::device_vector` iterators returns a `thrust::device_reference` // instead of `T`. Since user-provided operator expects `T` as an argument, we need to unwrap. 
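    // The (void) cast explicitly discards whatever the user-provided operator returns:
    // the for-each algorithms ignore results, and discarding avoids warnings for
    // operators declared [[nodiscard]].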
(void) op(THRUST_NS_QUALIFIER::raw_reference_cast(*(input + i))); } }; /** * `op_wrapper_vectorized_t` turns bulk into a for-each-copy operation. * `op_wrapper_vectorized_t` is similar to `op_wrapper_t` but does not provide any guarantees about * address of the input parameter. `OpT` might be given a copy of the value or an actual reference * to the input iterator value (depending on the alignment of input iterator) */ template struct op_wrapper_vectorized_t { const T* input; // Raw pointer to the input data OpT op; // User-provided operator OffsetT partially_filled_vector_id; // Index of the vector that doesn't have all elements OffsetT num_items; // Total number of non-vectorized items // TODO Can be extracted into tuning constexpr static int vec_size = 4; // Type of the vector that is used to load the input data using vector_t = typename CubVector::Type; _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i) { // Surrounding `Bulk` call doesn't invoke this operator on invalid indices, so we don't need to // check for out-of-bounds access here. if (i != partially_filled_vector_id) { // Case of fully filled vector const vector_t vec = *reinterpret_cast(input + vec_size * i); #pragma unroll for (int j = 0; j < vec_size; j++) { (void) op(*(reinterpret_cast(&vec) + j)); } } else { // Case of partially filled vector for (OffsetT j = i * vec_size; j < num_items; j++) { (void) op(input[j]); } } } }; } // namespace for_each } // namespace detail struct DeviceFor { private: /** * Checks if the pointer is aligned to the given vector type */ template CUB_RUNTIME_FUNCTION static bool is_aligned(const T* ptr) { return (reinterpret_cast(ptr) & (sizeof(VectorT) - 1)) == 0; } template CUB_RUNTIME_FUNCTION static cudaError_t for_each_n( RandomAccessIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::false_type /* do_not_vectorize */) { using wrapped_op_t = detail::for_each::op_wrapper_t; return detail::for_each::dispatch_t::dispatch(num_items, wrapped_op_t{first, op}, stream); } template CUB_RUNTIME_FUNCTION static cudaError_t for_each_n( RandomAccessIteratorT first, OffsetT num_items, OpT op, cudaStream_t stream, ::cuda::std::true_type /* vectorize */) { auto unwrapped_first = THRUST_NS_QUALIFIER::raw_pointer_cast(&*first); using wrapped_op_t = detail::for_each::op_wrapper_vectorized_t>; if (is_aligned(unwrapped_first)) { // Vectorize loads const OffsetT num_vec_items = cub::DivideAndRoundUp(num_items, wrapped_op_t::vec_size); return detail::for_each::dispatch_t::dispatch( num_vec_items, wrapped_op_t{ unwrapped_first, op, num_items % wrapped_op_t::vec_size ? num_vec_items - 1 : num_vec_items, num_items}, stream); } // Fallback to non-vectorized version return for_each_n(first, num_items, op, stream, ::cuda::std::false_type{}); } public: //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each index in the provided shape //! The algorithm is similar to //! `bulk `_ //! from P2300. //! //! - The return value of ``op``, if any, is ignored. //! - @devicestorage //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use Bulk to square each element in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-square-t //! :end-before: example-end bulk-square-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! 
:language: c++ //! :dedent: //! :start-after: example-begin bulk-temp-storage //! :end-before: example-end bulk-temp-storage //! //! @endrst //! //! @tparam ShapeT //! is an integral type //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, //! the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] shape //! Shape of the index space to iterate over //! //! @param[in] op //! Function object to apply to each index in the index space //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t Bulk(void* d_temp_storage, size_t& temp_storage_bytes, ShapeT shape, OpT op, cudaStream_t stream = {}) { static_assert(::cuda::std::is_integral::value, "ShapeT must be an integral type"); if (d_temp_storage == nullptr) { temp_storage_bytes = 1; return cudaSuccess; } return Bulk(shape, op, stream); } //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)`` //! //! - The return value of ``op``, if any, is ignored. //! - @devicestorage //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEachN` to square each element in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-square-ref-t //! :end-before: example-end bulk-square-ref-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-n-temp-storage //! :end-before: example-end for-each-n-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam NumItemsT //! is an integral type representing the number of elements to iterate over //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, //! the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] num_items //! Number of elements to iterate over //! //! @param[in] op //! Function object to apply to each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEachN( void* d_temp_storage, size_t& temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; return cudaSuccess; } return ForEachN(first, num_items, op, stream); } //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each element in the range ``[first, last)`` //! //! 
- The return value of ``op``, if any, is ignored. //! - @devicestorage //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEach` to square each element in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-square-ref-t //! :end-before: example-end bulk-square-ref-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-temp-storage //! :end-before: example-end for-each-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, //! the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] last //! The end of the sequence //! //! @param[in] op //! Function object to apply to each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEach( void* d_temp_storage, size_t& temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; return cudaSuccess; } return ForEach(first, last, op, stream); } //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``. //! Unlike the ``ForEachN`` algorithm, ``ForEachCopyN`` is allowed to invoke ``op`` on copies of the elements. //! This relaxation allows ``ForEachCopyN`` to vectorize loads. //! //! - Allowed to invoke ``op`` on copies of the elements //! - The return value of ``op``, if any, is ignored. //! - @devicestorage //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEachCopyN` to count odd elements in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-odd-count-t //! :end-before: example-end bulk-odd-count-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-copy-n-temp-storage //! :end-before: example-end for-each-copy-n-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam NumItemsT //! is an integral type representing the number of elements to iterate over //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, //! the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! 
Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] num_items //! Number of elements to iterate over //! //! @param[in] op //! Function object to apply to a copy of each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyN( void* d_temp_storage, size_t& temp_storage_bytes, RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; return cudaSuccess; } return ForEachCopyN(first, num_items, op, stream); } //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each element in the range ``[first, last)``. //! Unlike the ``ForEach`` algorithm, ``ForEachCopy`` is allowed to invoke ``op`` on copies of the elements. //! This relaxation allows ``ForEachCopy`` to vectorize loads. //! //! - Allowed to invoke ``op`` on copies of the elements //! - The return value of ``op``, if any, is ignored. //! - @devicestorage //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEachCopy` to count odd elements in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-odd-count-t //! :end-before: example-end bulk-odd-count-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-copy-temp-storage //! :end-before: example-end for-each-copy-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, //! the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] last //! The end of the sequence //! //! @param[in] op //! Function object to apply to a copy of each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopy( void* d_temp_storage, size_t& temp_storage_bytes, RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; return cudaSuccess; } return ForEachCopy(first, last, op, stream); } //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each index in the provided shape //! The algorithm is similar to //! `bulk `_ //! from P2300. //! //! - The return value of ``op``, if any, is ignored. //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use Bulk to square each element in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! 
:start-after: example-begin bulk-square-t //! :end-before: example-end bulk-square-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-wo-temp-storage //! :end-before: example-end bulk-wo-temp-storage //! //! @endrst //! //! @tparam ShapeT //! is an integral type //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] shape //! Shape of the index space to iterate over //! //! @param[in] op //! Function object to apply to each index in the index space //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t Bulk(ShapeT shape, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk"); static_assert(::cuda::std::is_integral::value, "ShapeT must be an integral type"); using offset_t = ShapeT; return detail::for_each::dispatch_t::dispatch(static_cast(shape), op, stream); } private: // Internal version without NVTX raNGE template CUB_RUNTIME_FUNCTION static cudaError_t ForEachNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { using offset_t = NumItemsT; using use_vectorization_t = ::cuda::std::integral_constant; // Disable auto-vectorization for now: // constexpr bool use_vectorization = // detail::for_each::can_regain_copy_freedom, OpT>::value // && THRUST_NS_QUALIFIER::is_contiguous_iterator::value; return for_each_n(first, num_items, op, stream, use_vectorization_t{}); } public: //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)`` //! //! - The return value of ``op``, if any, is ignored. //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEachN` to square each element in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-square-ref-t //! :end-before: example-end bulk-square-ref-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-n-wo-temp-storage //! :end-before: example-end for-each-n-wo-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam NumItemsT //! is an integral type representing the number of elements to iterate over //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] num_items //! Number of elements to iterate over //! //! @param[in] op //! Function object to apply to each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEachN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachN"); return ForEachNNoNVTX(first, num_items, op, stream); } //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! 
Applies the function object ``op`` to each element in the range ``[first, last)`` //! //! - The return value of ``op``, if any, is ignored. //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEach` to square each element in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-square-ref-t //! :end-before: example-end bulk-square-ref-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-wo-temp-storage //! :end-before: example-end for-each-wo-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] last //! The end of the sequence //! //! @param[in] op //! Function object to apply to each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEach(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEach"); using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); return ForEachNNoNVTX(first, num_items, op, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyNNoNVTX(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, "Iterator must be contiguous"); using offset_t = NumItemsT; using use_vectorization_t = ::cuda::std::integral_constant; return for_each_n(first, num_items, op, stream, use_vectorization_t{}); } public: //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each element in the range ``[first, first + num_items)``. //! Unlike the ``ForEachN`` algorithm, ``ForEachCopyN`` is allowed to invoke ``op`` on copies of the elements. //! This relaxation allows ``ForEachCopyN`` to vectorize loads. //! //! - Allowed to invoke ``op`` on copies of the elements //! - The return value of ``op``, if any, is ignored. //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEachCopyN` to count odd elements in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-odd-count-t //! :end-before: example-end bulk-odd-count-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-copy-n-wo-temp-storage //! :end-before: example-end for-each-copy-n-wo-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam NumItemsT //! is an integral type representing the number of elements to iterate over //! //! @tparam OpT //! 
is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] num_items //! Number of elements to iterate over //! //! @param[in] op //! Function object to apply to a copy of each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopyN(RandomAccessIteratorT first, NumItemsT num_items, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachCopyN"); return ForEachCopyNNoNVTX(first, num_items, op, stream); } //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Applies the function object ``op`` to each element in the range ``[first, last)``. //! Unlike the ``ForEach`` algorithm, ``ForEachCopy`` is allowed to invoke ``op`` on copies of the elements. //! This relaxation allows ``ForEachCopy`` to vectorize loads. //! //! - Allowed to invoke ``op`` on copies of the elements //! - The return value of ``op``, if any, is ignored. //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The following code snippet demonstrates how to use `ForEachCopy` to count odd elements in a device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin bulk-odd-count-t //! :end-before: example-end bulk-odd-count-t //! //! .. literalinclude:: ../../test/catch2_test_device_for_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin for-each-copy-wo-temp-storage //! :end-before: example-end for-each-copy-wo-temp-storage //! //! @endrst //! //! @tparam RandomAccessIteratorT //! is a model of Random Access Iterator whose value type is convertible to `op`'s argument type. //! //! @tparam OpT //! is a model of [Unary Function](https://en.cppreference.com/w/cpp/utility/functional/unary_function) //! //! @param[in] first //! The beginning of the sequence //! //! @param[in] last //! The end of the sequence //! //! @param[in] op //! Function object to apply to a copy of each element in the range //! //! @param[in] stream //! CUDA stream to launch kernels within. Default stream is `0`. template CUB_RUNTIME_FUNCTION static cudaError_t ForEachCopy(RandomAccessIteratorT first, RandomAccessIteratorT last, OpT op, cudaStream_t stream = {}) { CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachCopy"); static_assert(THRUST_NS_QUALIFIER::is_contiguous_iterator::value, "Iterator must be contiguous"); using offset_t = typename THRUST_NS_QUALIFIER::iterator_traits::difference_type; const auto num_items = static_cast(THRUST_NS_QUALIFIER::distance(first, last)); return ForEachCopyNNoNVTX(first, num_items, op, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_histogram.cuh000066400000000000000000001750141463375617100212450ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceHistogram provides device-wide parallel operations for //! constructing histogram(s) from a sequence of samples data residing //! within device-accessible memory. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of //! samples data residing within device-accessible memory. //! //! Overview //! ++++++++++++++++++++++++++ //! //! A `histogram `_ counts the number of observations that fall into each //! of the disjoint categories (known as *bins*). //! //! Usage Considerations //! ++++++++++++++++++++++++++ //! //! @cdp_class{DeviceHistogram} //! //! @endrst struct DeviceHistogram { //! @name Evenly-segmented bin ranges //! @{ //! @rst //! Computes an intensity histogram from a sequence of data samples using equal-width bins. //! //! - The number of histogram bins is (``num_levels - 1``) //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)``. //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is //! computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round //! down to the nearest whole number. To protect against potential overflows, if the product //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only //! be returned if bin computation would overflow for 128-bit arithmetic. //! - The ranges ``[d_samples, d_samples + num_samples)`` and //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap in any way. //! - ``cuda::std::common_type`` must be valid, and both LevelT and SampleT must be valid //! arithmetic types. 
The common type must be convertible to ``int`` and trivially copyable. //! - @devicestorage //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the computation of a six-bin histogram //! from a sequence of float samples //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input samples and output histogram //! int num_samples; // e.g., 10 //! float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5] //! int* d_histogram; // e.g., [ -, -, -, -, -, -] //! int num_levels; // e.g., 7 (seven level boundaries for six bins) //! float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) //! float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::HistogramEven( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, //! lower_level, upper_level, num_samples); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::HistogramEven( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, //! lower_level, upper_level, num_samples); //! //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; //! //! @endrst //! //! @tparam SampleIteratorT //! **[inferred]** Random-access input iterator type for reading input samples @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no //! work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the input sequence of data samples. //! //! @param[out] d_histogram //! The pointer to the histogram counter output array of length //! `num_levels - 1`. //! //! @param[in] num_levels //! The number of boundaries (levels) for delineating histogram samples. //! Implies that the number of bins is `num_levels - 1`. //! //! @param[in] lower_level //! The lower sample value bound (inclusive) for the lowest histogram bin. //! //! @param[in] upper_level //! The upper sample value bound (exclusive) for the highest histogram bin. //! //! @param[in] num_samples //! The number of input samples (i.e., the length of `d_samples`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
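@endrst
//!
//! @rst
//! For reference, the equal-width bin-selection rule described above can be written as the
//! following host-side sketch (illustrative only; ``sample`` is assumed to lie within
//! ``[lower_level, upper_level)``):
//!
//! .. code-block:: c++
//!
//!    // bin of a sample under equal-width binning; for in-range (non-negative) numerators
//!    // the integer division rounds the quotient down, as described above
//!    int bin = static_cast<int>((sample - lower_level) * (num_levels - 1)
//!                               / (upper_level - lower_level));
//!
//!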
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_samples, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; return MultiHistogramEven<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, &num_levels, &lower_level, &upper_level, num_samples, static_cast(1), sizeof(SampleT) * num_samples, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_samples, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples, stream); } //! @rst //! Computes an intensity histogram from a sequence of data samples using equal-width bins. //! //! - A two-dimensional *region of interest* within ``d_samples`` can be specified using //! the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. //! - The row stride must be a whole multiple of the sample data type //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. //! - The number of histogram bins is (``num_levels - 1``) //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)`` //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is //! computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round //! down to the nearest whole number. To protect against potential overflows, if the product //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only //! be returned if bin computation would overflow for 128-bit arithmetic. //! - For a given row ``r`` in ``[0, num_rows)``, let //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and //! ``row_end = row_begin + num_row_samples``. The ranges //! ``[row_begin, row_end)`` and ``[d_histogram, d_histogram + num_levels - 1)`` //! shall not overlap in any way. //! - ``cuda::std::common_type`` must be valid, and both LevelT //! and SampleT must be valid arithmetic types. The common type must be //! convertible to ``int`` and trivially copyable. //! - @devicestorage //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the computation of a six-bin histogram //! from a 2x5 region of interest within a flattened 2x7 array of float samples. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input samples and output histogram //! int num_row_samples; // e.g., 5 //! int num_rows; // e.g., 2; //! size_t row_stride_bytes; // e.g., 7 * sizeof(float) //! float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, -, -, //! // 0.3, 2.9, 2.1, 6.1, 999.5, -, -] //! int* d_histogram; // e.g., [ -, -, -, -, -, -] //! int num_levels; // e.g., 7 (seven level boundaries for six bins) //! 
float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) //! float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::HistogramEven( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, lower_level, upper_level, //! num_row_samples, num_rows, row_stride_bytes); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::HistogramEven( //! d_temp_storage, temp_storage_bytes, d_samples, d_histogram, //! d_samples, d_histogram, num_levels, lower_level, upper_level, //! num_row_samples, num_rows, row_stride_bytes); //! //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; //! //! @endrst //! //! @tparam SampleIteratorT //! **[inferred]** Random-access input iterator type for reading //! input samples. @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no //! work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the input sequence of data samples. //! //! @param[out] d_histogram //! The pointer to the histogram counter output array of //! length `num_levels - 1`. //! //! @param[in] num_levels //! The number of boundaries (levels) for delineating histogram samples. //! Implies that the number of bins is `num_levels - 1`. //! //! @param[in] lower_level //! The lower sample value bound (inclusive) for the lowest histogram bin. //! //! @param[in] upper_level //! The upper sample value bound (exclusive) for the highest histogram bin. //! //! @param[in] num_row_samples //! The number of data samples per row in the region of interest //! //! @param[in] num_rows //! The number of rows in the region of interest //! //! @param[in] row_stride_bytes //! The number of bytes between starts of consecutive rows in //! the region of interest //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
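@endrst
//!
//! @rst
//! The two-dimensional region of interest traversed by this overload can be pictured with the
//! following sequential sketch (illustrative only; ``bin_and_count`` is a hypothetical
//! placeholder for the binning step, and ``float`` samples are assumed to match the snippet
//! above):
//!
//! .. code-block:: c++
//!
//!    for (int r = 0; r < num_rows; ++r)
//!    {
//!      // row r covers [row_begin, row_begin + num_row_samples)
//!      const float* row_begin = d_samples + r * row_stride_bytes / sizeof(float);
//!      for (int s = 0; s < num_row_samples; ++s)
//!      {
//!        bin_and_count(row_begin[s]);
//!      }
//!    }
//!
//!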
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { return MultiHistogramEven<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, &num_levels, &lower_level, &upper_level, num_row_samples, num_rows, row_stride_bytes, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, LevelT lower_level, LevelT upper_level, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_samples, num_rows, row_stride_bytes, stream); } //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using //! equal-width bins. //! //! - The input is a sequence of *pixel* structures, where each pixel comprises //! a record of ``NUM_CHANNELS`` consecutive data samples //! (e.g., an *RGBA* pixel). //! - Of the ``NUM_CHANNELS`` specified, the function will only compute //! histograms for the first ``NUM_ACTIVE_CHANNELS`` //! (e.g., only *RGB* histograms from *RGBA* pixel samples). //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` //! - If the common type of sample and level is of integral type, the bin for a sample is //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, round down //! to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, the product //! ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by an ``uint64_t``, //! the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 bits wide, bin computation //! will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only be returned if bin //! computation would overflow for 128-bit arithmetic. //! - For a given channel ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` and //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. //! - ``cuda::std::common_type`` must be valid, and both LevelT //! and SampleT must be valid arithmetic types. //! The common type must be convertible to ``int`` and trivially copyable. //! - @devicestorage //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the computation of three 256-bin *RGB* histograms //! from a quad-channel sequence of *RGBA* pixels (8 bits per channel per pixel) //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input samples and output histograms //! int num_pixels; // e.g., 5 //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), //! // (0, 6, 7, 5), (3, 0, 2, 6)] //! 
int* d_histogram[3]; // e.g., three device pointers to three device buffers, //! // each allocated with 256 integer counters //! int num_levels[3]; // e.g., {257, 257, 257}; //! unsigned int lower_level[3]; // e.g., {0, 0, 0}; //! unsigned int upper_level[3]; // e.g., {256, 256, 256}; //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, //! lower_level, upper_level, num_pixels); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, //! lower_level, upper_level, num_pixels); //! //! // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], //! // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], //! // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] //! //! @endrst //! //! @tparam NUM_CHANNELS //! Number of channels interleaved in the input data (may be greater than //! the number of channels being actively histogrammed) //! //! @tparam NUM_ACTIVE_CHANNELS //! **[inferred]** Number of channels actively being histogrammed //! //! @tparam SampleIteratorT //! **[inferred]** Random-access input iterator type for reading //! input samples. @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no //! work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the multi-channel input sequence of data samples. //! The samples from different channels are assumed to be interleaved //! (e.g., an array of 32-bit pixels where each pixel consists of four //! *RGBA* 8-bit samples). //! //! @param[out] d_histogram //! @rst //! The pointers to the histogram counter output arrays, one for each active //! channel. For channel\ :sub:`i`, the allocation length of //! ``d_histogram[i]`` should be `num_levels[i] - 1``. //! @endrst //! //! @param[in] num_levels //! @rst //! The number of boundaries (levels) for delineating histogram samples in each active channel. //! Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``. //! @endrst //! //! @param[in] lower_level //! The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. //! //! @param[in] upper_level //! The upper sample value bound (exclusive) for the highest histogram bin in each active channel. //! //! @param[in] num_pixels //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
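@endrst
//!
//! @rst
//! The interleaved sample layout described above amounts to the following sequential sketch
//! (illustrative only; ``count_in_channel_histogram`` is a hypothetical placeholder for the
//! per-channel binning step):
//!
//! .. code-block:: c++
//!
//!    for (int p = 0; p < num_pixels; ++p)
//!    {
//!      // only the first NUM_ACTIVE_CHANNELS samples of each pixel are histogrammed
//!      for (int c = 0; c < NUM_ACTIVE_CHANNELS; ++c)
//!      {
//!        count_in_channel_histogram(c, d_samples[p * NUM_CHANNELS + c]);
//!      }
//!    }
//!
//!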
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; return MultiHistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels, static_cast(1), sizeof(SampleT) * NUM_CHANNELS * num_pixels, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels, stream); } //! @rst //! Computes per-channel intensity histograms from a sequence of //! multi-channel "pixel" data samples using equal-width bins. //! //! - The input is a sequence of *pixel* structures, where each pixel //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). //! - Of the ``NUM_CHANNELS`` specified, the function will only compute //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., only *RGB* //! histograms from *RGBA* pixel samples). //! - A two-dimensional *region of interest* within ``d_samples`` can be //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. //! - The row stride must be a whole multiple of the sample data type //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` //! - If the common type of sample and level is of integral type, the bin for a sample is //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, //! round down to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, //! the product ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by //! an ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. //! If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` //! will only be returned if bin computation would overflow for 128-bit arithmetic. //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in //! ``[0, num_row_pixels)``, let //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``, //! ``sample_begin = row_begin + s * NUM_CHANNELS``, and //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For a given channel //! ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges //! ``[sample_begin, sample_end)`` and //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. //! 
- ``cuda::std::common_type`` must be valid, and both LevelT //! and SampleT must be valid arithmetic types. The common type must be //! convertible to ``int`` and trivially copyable. //! - @devicestorage //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the computation of three 256-bin //! *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 //! array of quad-channel *RGBA* pixels (8 bits per channel per pixel). //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for input //! // samples and output histograms //! int num_row_pixels; // e.g., 3 //! int num_rows; // e.g., 2 //! size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), //! // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] //! int* d_histogram[3]; // e.g., three device pointers to three device buffers, //! // each allocated with 256 integer counters //! int num_levels[3]; // e.g., {257, 257, 257}; //! unsigned int lower_level[3]; // e.g., {0, 0, 0}; //! unsigned int upper_level[3]; // e.g., {256, 256, 256}; //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, lower_level, upper_level, //! num_row_pixels, num_rows, row_stride_bytes); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::MultiHistogramEven<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, lower_level, upper_level, //! num_row_pixels, num_rows, row_stride_bytes); //! //! // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], //! // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], //! // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] //! //! @endrst //! //! @tparam NUM_CHANNELS //! Number of channels interleaved in the input data (may be greater than //! the number of channels being actively histogrammed) //! //! @tparam NUM_ACTIVE_CHANNELS //! **[inferred]** Number of channels actively being histogrammed //! //! @tparam SampleIteratorT //! **[inferred]** Random-access input iterator type for reading input //! samples. @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no //! work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the multi-channel input sequence of data samples. The //! samples from different channels are assumed to be interleaved (e.g., //! an array of 32-bit pixels where each pixel consists of four //! *RGBA* 8-bit samples). //! //! @param[out] d_histogram //! @rst //! The pointers to the histogram counter output arrays, one for each //! active channel. For channel\ :sub:`i`, the allocation length //! 
of ``d_histogram[i]`` should be ``num_levels[i] - 1``. //! @endrst //! //! @param[in] num_levels //! @rst //! The number of boundaries (levels) for delineating histogram samples in each active channel. //! Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``. //! @endrst //! //! @param[in] lower_level //! The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. //! //! @param[in] upper_level //! The upper sample value bound (exclusive) for the highest histogram bin in each active channel. //! //! @param[in] num_row_pixels //! The number of multi-channel pixels per row in the region of interest //! //! @param[in] num_rows //! The number of rows in the region of interest //! //! @param[in] row_stride_bytes //! The number of bytes between starts of consecutive rows in the region of //! interest //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramEven"); /// The sample value type of the input iterator using SampleT = cub::detail::value_t; Int2Type is_byte_sample; _CCCL_IF_CONSTEXPR (sizeof(OffsetT) > sizeof(int)) { if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX) { // Down-convert OffsetT data type return DispatchHistogram::DispatchEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } } return DispatchHistogram::DispatchEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes, stream); } //! @} end member group //! @name Custom bin ranges //! @{ //! @rst //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. //! //! - The number of histogram bins is (``num_levels - 1``) //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` //! - The range ``[d_histogram, d_histogram + num_levels - 1)`` shall not //! overlap ``[d_samples, d_samples + num_samples)`` nor //! ``[d_levels, d_levels + num_levels)`` in any way. The ranges //! ``[d_levels, d_levels + num_levels)`` and //! 
``[d_samples, d_samples + num_samples)`` may overlap. //! - @devicestorage //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the computation of an six-bin histogram //! from a sequence of float samples //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for input //! // samples and output histogram //! int num_samples; // e.g., 10 //! float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] //! int* d_histogram; // e.g., [ -, -, -, -, -, -] //! int num_levels // e.g., 7 (seven level boundaries for six bins) //! float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::HistogramRange( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, d_levels, num_samples); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::HistogramRange( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, d_levels, num_samples); //! //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; //! //! @endrst //! //! @tparam SampleIteratorT //! **[inferred]** Random-access input iterator type for reading //! input samples. @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the input sequence of data samples. //! //! @param[out] d_histogram //! The pointer to the histogram counter output array of length //! `num_levels - 1`. //! //! @param[in] num_levels //! The number of boundaries (levels) for delineating histogram samples. //! Implies that the number of bins is `num_levels - 1`. //! //! @param[in] d_levels //! The pointer to the array of boundaries (levels). Bin ranges are defined //! by consecutive boundary pairings: lower sample value boundaries are //! inclusive and upper sample value boundaries are exclusive. //! //! @param[in] num_samples //! The number of data samples per row in the region of interest //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
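@endrst
//!
//! @rst
//! A sequential sketch of the custom-boundary bin rule stated above (``bin_of`` is an
//! illustrative helper, not part of the API; the linear scan merely stands in for whatever
//! search the implementation performs):
//!
//! .. code-block:: c++
//!
//!    // bin i covers [d_levels[i], d_levels[i + 1]); samples outside
//!    // [d_levels[0], d_levels[num_levels - 1]) are not counted
//!    int bin_of(float sample, const float* d_levels, int num_levels)
//!    {
//!      for (int i = 0; i < num_levels - 1; ++i)
//!      {
//!        if (d_levels[i] <= sample && sample < d_levels[i + 1])
//!        {
//!          return i;
//!        }
//!      }
//!      return -1; // not counted
//!    }
//!
//!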
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, const LevelT* d_levels, OffsetT num_samples, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; return MultiHistogramRange<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, &num_levels, &d_levels, num_samples, (OffsetT) 1, (size_t) (sizeof(SampleT) * num_samples), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, const LevelT* d_levels, OffsetT num_samples, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples, stream); } //! @rst //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. //! //! - A two-dimensional *region of interest* within ``d_samples`` can be //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. //! - The row stride must be a whole multiple of the sample data type //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. //! - The number of histogram bins is (``num_levels - 1``) //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` //! - For a given row ``r`` in ``[0, num_rows)``, let //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and //! ``row_end = row_begin + num_row_samples``. The range //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap //! ``[row_begin, row_end)`` nor ``[d_levels, d_levels + num_levels)``. //! The ranges ``[d_levels, d_levels + num_levels)`` and ``[row_begin, row_end)`` may overlap. //! - @devicestorage //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the computation of a six-bin histogram //! from a 2x5 region of interest within a flattened 2x7 array of float samples. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for input samples and //! // output histogram //! int num_row_samples; // e.g., 5 //! int num_rows; // e.g., 2; //! int row_stride_bytes; // e.g., 7 * sizeof(float) //! float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, //! // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] //! int* d_histogram; // e.g., [ -, -, -, -, -, -] //! int num_levels // e.g., 7 (seven level boundaries for six bins) //! float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::HistogramRange( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, d_levels, //! num_row_samples, num_rows, row_stride_bytes); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::HistogramRange( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, d_levels, //! num_row_samples, num_rows, row_stride_bytes); //! //! // d_histogram <-- [1, 5, 0, 3, 0, 0]; //! //! @endrst //! //! @tparam SampleIteratorT //! 
**[inferred]** Random-access input iterator type for reading //! input samples. @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no //! work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the input sequence of data samples. //! //! @param[out] d_histogram //! The pointer to the histogram counter output array of length //! `num_levels - 1`. //! //! @param[in] num_levels //! The number of boundaries (levels) for delineating histogram samples. //! Implies that the number of bins is `num_levels - 1`. //! //! @param[in] d_levels //! The pointer to the array of boundaries (levels). Bin ranges are defined //! by consecutive boundary pairings: lower sample value boundaries are //! inclusive and upper sample value boundaries are exclusive. //! //! @param[in] num_row_samples //! The number of data samples per row in the region of interest //! //! @param[in] num_rows //! The number of rows in the region of interest //! //! @param[in] row_stride_bytes //! The number of bytes between starts of consecutive rows in the region //! of interest //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, const LevelT* d_levels, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { return MultiHistogramRange<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, &num_levels, &d_levels, num_row_samples, num_rows, row_stride_bytes, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram, int num_levels, const LevelT* d_levels, OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_samples, num_rows, row_stride_bytes, stream); } //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples //! using the specified bin boundary levels. //! //! - The input is a sequence of *pixel* structures, where each pixel //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). //! - Of the ``NUM_CHANNELS`` specified, the function will only compute //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples). //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` //! 
- For given channels ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the //! range ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall //! not overlap ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` nor //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. //! The ranges ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` may overlap. //! - @devicestorage //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the computation of three 4-bin *RGB* //! histograms from a quad-channel sequence of *RGBA* pixels //! (8 bits per channel per pixel) //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input samples and output histograms //! int num_pixels; // e.g., 5 //! unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), //! // (0, 6, 7, 5),(3, 0, 2, 6)] //! unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; //! int num_levels[3]; // e.g., {5, 5, 5}; //! unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], //! // [0, 2, 4, 6, 8], //! // [0, 2, 4, 6, 8] ]; //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, d_levels, num_pixels); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, d_levels, num_pixels); //! //! // d_histogram <-- [ [1, 3, 0, 1], //! // [3, 0, 0, 2], //! // [0, 2, 0, 3] ] //! //! @endrst //! //! @tparam NUM_CHANNELS //! Number of channels interleaved in the input data (may be greater than //! the number of channels being actively histogrammed) //! //! @tparam NUM_ACTIVE_CHANNELS //! **[inferred]** Number of channels actively being histogrammed //! //! @tparam SampleIteratorT //! **[inferred]** Random-access input iterator type for reading //! input samples. @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no //! work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the multi-channel input sequence of data samples. //! The samples from different channels are assumed to be interleaved (e.g., //! an array of 32-bit pixels where each pixel consists of four *RGBA* //! 8-bit samples). //! //! @param[out] d_histogram //! @rst //! The pointers to the histogram counter output arrays, one for each active //! channel. For channel\ :sub:`i`, the allocation length of //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. //! @endrst //! //! @param[in] num_levels //! @rst //! The number of boundaries (levels) for delineating histogram samples in //! each active channel. 
Implies that the number of bins for //! channel\ :sub:`i` is ``num_levels[i] - 1``. //! @endrst //! //! @param[in] d_levels //! The pointers to the arrays of boundaries (levels), one for each active //! channel. Bin ranges are defined by consecutive boundary pairings: lower //! sample value boundaries are inclusive and upper sample value boundaries //! are exclusive. //! //! @param[in] num_pixels //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream = 0) { /// The sample value type of the input iterator using SampleT = cub::detail::value_t; return MultiHistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, (OffsetT) 1, (size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_pixels, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, stream); } //! @rst //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using //! the specified bin boundary levels. //! //! - The input is a sequence of *pixel* structures, where each pixel comprises //! a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). //! - Of the ``NUM_CHANNELS`` specified, the function will only compute //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples). //! - A two-dimensional *region of interest* within ``d_samples`` can be //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. //! - The row stride must be a whole multiple of the sample data type //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in ``[0, num_row_pixels)``, let //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``, //! ``sample_begin = row_begin + s * NUM_CHANNELS``, and //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For given channels //! ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the range //! ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall not overlap //! ``[sample_begin, sample_end)`` nor //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. The ranges //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and //! ``[sample_begin, sample_end)`` may overlap. //! - @devicestorage //! //! 
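//! A sketch of the boundary-pairing rule used by the ``HistogramRange`` and
//! ``MultiHistogramRange`` overloads (illustration only, not CUB's internal implementation):
//! a sample ``s`` of channel ``c`` is counted in bin ``i`` exactly when
//! ``d_levels[c][i] <= s && s < d_levels[c][i + 1]``.
//!
//! .. code-block:: c++
//!
//!    // Lower boundaries are inclusive, upper boundaries are exclusive
//!    int bin_of(float s, const float* levels, int num_levels)
//!    {
//!      for (int i = 0; i < num_levels - 1; ++i)
//!      {
//!        if (levels[i] <= s && s < levels[i + 1])
//!        {
//!          return i;
//!        }
//!      }
//!      return -1; // samples outside [levels[0], levels[num_levels - 1]) land in no bin
//!    }
//!
//!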
Snippet //! +++++++ //! //! The code snippet below illustrates the computation of three 4-bin *RGB* //! histograms from a 2x3 region of interest of within a flattened 2x4 array //! of quad-channel *RGBA* pixels (8 bits per channel per pixel). //! //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for input //! // samples and output histograms //! int num_row_pixels; // e.g., 3 //! int num_rows; // e.g., 2 //! size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS //! unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), //! // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] //! int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; //! int num_levels[3]; // e.g., {5, 5, 5}; //! unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], //! // [0, 2, 4, 6, 8], //! // [0, 2, 4, 6, 8] ]; //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, d_levels, //! num_row_pixels, num_rows, row_stride_bytes); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Compute histograms //! cub::DeviceHistogram::MultiHistogramRange<4, 3>( //! d_temp_storage, temp_storage_bytes, //! d_samples, d_histogram, num_levels, //! d_levels, num_row_pixels, num_rows, row_stride_bytes); //! //! // d_histogram <-- [ [2, 3, 0, 1], //! // [3, 0, 0, 2], //! // [1, 2, 0, 3] ] //! //! @endrst //! //! @tparam NUM_CHANNELS //! Number of channels interleaved in the input data (may be greater than //! the number of channels being actively histogrammed) //! //! @tparam NUM_ACTIVE_CHANNELS //! **[inferred]** Number of channels actively being histogrammed //! //! @tparam SampleIteratorT //! **[inferred]** Random-access input iterator type for reading input //! samples. @iterator //! //! @tparam CounterT //! **[inferred]** Integer type for histogram bin counters //! //! @tparam LevelT //! **[inferred]** Type for specifying boundaries (levels) //! //! @tparam OffsetT //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_samples //! The pointer to the multi-channel input sequence of data samples. The //! samples from different channels are assumed to be interleaved (e.g., an //! array of 32-bit pixels where each pixel consists of four //! *RGBA* 8-bit samples). //! //! @param[out] d_histogram //! @rst //! The pointers to the histogram counter output arrays, one for each active //! channel. For channel\ :sub:`i`, the allocation length of //! ``d_histogram[i]`` should be ``num_levels[i] - 1``. //! @endrst //! //! @param[in] num_levels //! @rst //! The number of boundaries (levels) for delineating histogram samples in //! each active channel. Implies that the number of bins for //! channel\ :sub:`i` is ``num_levels[i] - 1``. //! @endrst //! //! @param[in] d_levels //! The pointers to the arrays of boundaries (levels), one for each active //! 
channel. Bin ranges are defined by consecutive boundary pairings: lower //! sample value boundaries are inclusive and upper sample value boundaries //! are exclusive. //! //! @param[in] num_row_pixels //! The number of multi-channel pixels per row in the region of interest //! //! @param[in] num_rows //! The number of rows in the region of interest //! //! @param[in] row_stride_bytes //! The number of bytes between starts of consecutive rows in the //! region of interest //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange"); /// The sample value type of the input iterator using SampleT = cub::detail::value_t; Int2Type is_byte_sample; _CCCL_IF_CONSTEXPR (sizeof(OffsetT) > sizeof(int)) { if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX) { // Down-convert OffsetT data type return DispatchHistogram::DispatchRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } } return DispatchHistogram::DispatchRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_histogram[NUM_ACTIVE_CHANNELS], const int num_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return MultiHistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes, stream); } //@} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_memcpy.cuh000066400000000000000000000213671463375617100205430ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceMemcpy provides device-wide, parallel operations for copying data. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @brief cub::DeviceMemcpy provides device-wide, parallel operations for copying data. struct DeviceMemcpy { //! @rst //! Copies data from a batch of given source buffers to their corresponding destination buffer. //! //! .. note:: //! //! If any input buffer aliases memory from any output buffer the behavior is undefined. //! If any output buffer aliases memory of another output buffer the behavior is undefined. //! Input buffers can alias one another. //! //! Snippet //! +++++++ //! //! The code snippet below illustrates usage of DeviceMemcpy::Batched for mutating strings withing //! a single string buffer. //! //! .. code-block:: c++ //! //! struct GetPtrToStringItem //! { //! __host__ __device__ __forceinline__ void *operator()(uint32_t index) //! { //! return &d_string_data_in[d_string_offsets[index]]; //! } //! char *d_string_data_in; //! uint32_t *d_string_offsets; //! }; //! //! struct GetStringItemSize //! { //! __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index) //! { //! return d_string_offsets[index + 1] - d_string_offsets[index]; //! } //! uint32_t *d_string_offsets; //! }; //! //! uint32_t num_strings = 5; //! char *d_string_data_in; // e.g., "TomatoesBananasApplesOrangesGrapes" //! char *d_string_data_out; // e.g., " ... " //! uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34] //! uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34] //! uint32_t *d_gather_index; // e.g., [2, 1, 4, 3, 0] //! //! // Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced //! auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0), //! d_gather_index); //! //! // Returns pointers to the input buffer for each string //! auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator, //! GetPtrToStringItem{d_string_data_in, //! d_string_offsets_old}); //! //! // Returns the string size of the i-th string //! auto str_sizes = thrust::make_transform_iterator(gather_iterator, //! GetStringItemSize{d_string_offsets_old}); //! //! // Returns pointers to the output buffer for each string //! auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0), //! GetPtrToStringItem{d_string_data_out, //! d_string_offsets_new}); //! 
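//!    // For this example the iterators dereference to:
//!    //   str_sizes   -> [6, 7, 6, 7, 8]
//!    //   str_ptrs_in -> pointers to "Apples", "Bananas", "Grapes", "Oranges", "Tomatoes"
//!    // d_string_offsets_new is assumed to have been computed beforehand, e.g. (sketch) as an
//!    // exclusive sum of the gathered sizes with the total appended as the final entry:
//!    //   cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
//!    //                                 str_sizes, d_string_offsets_new, num_strings);
//!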
//! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out, //! str_sizes, num_strings); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run batched copy algorithm (used to permute strings) //! cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out, //! str_sizes, num_strings); //! //! // d_string_data_out <-- "ApplesBananasGrapesOrangesTomatoe" //! //! @endrst //! //! @tparam InputBufferIt //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to //! the source memory buffers //! //! @tparam OutputBufferIt //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to //! the destination memory buffers //! //! @tparam BufferSizeIteratorT //! **[inferred]** Device-accessible random-access input iterator type providing the number of bytes //! to be copied for each pair of buffers //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] input_buffer_it //! Device-accessible iterator providing the pointers to the source memory buffers //! //! @param[in] output_buffer_it //! Device-accessible iterator providing the pointers to the destination memory buffers //! //! @param[in] buffer_sizes //! Device-accessible iterator providing the number of bytes to be copied for each pair of buffers //! //! @param[in] num_buffers //! The total number of buffer pairs //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Batched( void* d_temp_storage, size_t& temp_storage_bytes, InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, uint32_t num_buffers, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMemcpy::Batched"); static_assert(std::is_pointer>::value, "DeviceMemcpy::Batched only supports copying of memory buffers." "Please consider using DeviceCopy::Batched instead."); static_assert(std::is_pointer>::value, "DeviceMemcpy::Batched only supports copying of memory buffers." "Please consider using DeviceCopy::Batched instead."); // Integer type large enough to hold any offset in [0, num_buffers) using BufferOffsetT = uint32_t; // Integer type large enough to hold any offset in [0, num_thread_blocks_launched), where a safe // uppper bound on num_thread_blocks_launched can be assumed to be given by // IDIV_CEIL(num_buffers, 64) using BlockOffsetT = uint32_t; return detail::DispatchBatchMemcpy< InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BufferOffsetT, BlockOffsetT, detail::DeviceBatchMemcpyPolicy, true>::Dispatch(d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_merge_sort.cuh000066400000000000000000001157061463375617100214200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief DeviceMergeSort provides device-wide, parallel operations for * computing a merge sort across a sequence of data items residing within * device-accessible memory. * * @par Overview * - DeviceMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types (as * long as a value of these types is a model of [LessThan Comparable]) and * comparison functors, but is slower than DeviceRadixSort when sorting * arithmetic types into ascending/descending order. * - Another difference from RadixSort is the fact that DeviceMergeSort can * handle arbitrary random-access iterators, as shown below. * * @par A Simple Example * @par * The code snippet below illustrates a thrust reverse iterator usage. * @par * @code * #include // or equivalently * * struct CustomLess * { * template * __device__ bool operator()(const DataType &lhs, const DataType &rhs) * { * return lhs < rhs; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * thrust::device_vector d_keys(num_items); * thrust::device_vector d_values(num_items); * // ... 
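*
* // Temporary-storage pointer for the two-phase calls below
* void *d_temp_storage = nullptr;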
* * // Initialize iterator * using KeyIterator = typename thrust::device_vector::iterator; * thrust::reverse_iterator reverse_iter(d_keys.end()); * * // Determine temporary device storage requirements * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairs( * nullptr, * temp_storage_bytes, * reverse_iter, * thrust::raw_pointer_cast(d_values.data()), * num_items, * CustomLess()); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairs( * d_temp_storage, * temp_storage_bytes, * reverse_iter, * thrust::raw_pointer_cast(d_values.data()), * num_items, * CustomLess()); * @endcode * * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ struct DeviceMergeSort { private: // Name reported for NVTX ranges _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* { return "cub::DeviceMergeSort"; } // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { using PromotedOffsetT = detail::promote_small_offset_t; using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream); } public: /** * @brief Sorts items using a merge sorting method. * * @par * SortPairs is not guaranteed to be stable. That is, suppose that i and j are * equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 2, 1, 0, 6] * * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. 
When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in,out] d_items * Pointer to the input sequence of unsorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * - SortPairsCopy is not guaranteed to be stable. That is, suppose * that `i` and `j` are equivalent: neither one is less than the * other. It is not guaranteed that the relative order of these * two elements will be preserved by sort. * - Input arrays `d_input_keys` and `d_input_items` are not modified. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairsCopy( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairsCopy( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 2, 1, 0, 6] * * @endcode * * @tparam KeyInputIteratorT * is a model of [Random Access Iterator]. Its `value_type` is a model of * [LessThan Comparable]. This `value_type`'s ordering relation is a * *strict weak ordering* as defined in the [LessThan Comparable] * requirements. 
* * @tparam ValueInputIteratorT * is a model of [Random Access Iterator]. * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input_keys * Pointer to the input sequence of unsorted input keys * * @param[in] d_input_items * Pointer to the input sequence of unsorted input values * * @param[out] d_output_keys * Pointer to the output sequence of sorted input keys * * @param[out] d_output_items * Pointer to the output sequence of sorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns `true` if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch( d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsCopy( d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { using PromotedOffsetT = detail::promote_small_offset_t; using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, static_cast(nullptr), d_keys, 
static_cast(nullptr), num_items, compare_op, stream); } public: /** * @brief Sorts items using a merge sorting method. * * @par * SortKeys is not guaranteed to be stable. That is, suppose that `i` and `j` * are equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. 
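*
* @par
* The snippet above uses `CustomOpT` without defining it; any functor with
* less-than semantics works. For illustration (an assumption mirroring the
* `CustomLess` functor in the class-level example above):
* @code
* struct CustomOpT
* {
*   __device__ bool operator()(int lhs, int rhs) const
*   {
*     return lhs < rhs;
*   }
* };
* @endcode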
* * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopyNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, KeyIteratorT d_output_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { using PromotedOffsetT = detail::promote_small_offset_t; using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch( d_temp_storage, temp_storage_bytes, d_input_keys, static_cast(nullptr), d_output_keys, static_cast(nullptr), num_items, compare_op, stream); } public: /** * @brief Sorts items using a merge sorting method. * * @par * - SortKeysCopy is not guaranteed to be stable. That is, suppose that `i` * and `j` are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. * - Input array d_input_keys is not modified. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyInputIteratorT * is a model of [Random Access Iterator]. Its `value_type` is a model of * [LessThan Comparable]. This `value_type`'s ordering relation is a * *strict weak ordering* as defined in the [LessThan Comparable] * requirements. * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. 
* * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input_keys * Pointer to the input sequence of unsorted input keys * * @param[out] d_output_keys * Pointer to the output sequence of sorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, KeyIteratorT d_output_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysCopyNoNVTX( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, KeyIteratorT d_output_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysCopy( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * StableSortPairs is stable: it preserves the relative ordering of equivalent * elements. That is, if x and y are elements such that x precedes y, * and if the two elements are equivalent (neither x < y nor y < x) then * a postcondition of stable_sort is that x still precedes y. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 1, 2, 0, 6] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. 
`KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in,out] d_items * Pointer to the input sequence of unsorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * StableSortKeys is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of stable_sort is that `x` still precedes `y`. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys. * \par * \code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... 
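* // CustomOpT is assumed to be a less-than functor, e.g. the CustomLess functor
* // from the class-level example at the top of this file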
* * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream); } /** * @brief Sorts items using a merge sorting method. * * @par * - StableSortKeysCopy is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of stable_sort is that `x` still precedes `y`. * - Input array d_input_keys is not modified * - Note that the behavior is undefined if the input and output ranges overlap * in any way. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys. 
* \par * \code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_input_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_output_keys; // must hold at least num_items elements * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::StableSortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_input_keys, d_output_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::StableSortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_input_keys, d_output_keys, num_items, custom_op); * * // d_output_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyInputIteratorT * is a model of [Random Access Iterator]. Its `value_type` is a model of * [LessThan Comparable]. This `value_type`'s ordering relation is a * *strict weak ordering* as defined in the [LessThan Comparable] * requirements. * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input_keys * Pointer to the input sequence of unsorted input keys * * @param[out] d_output_keys * Pointer to the output sequence of sorted input keys * * @param[in] num_items * Number of elements in d_input_keys to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysCopy( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, KeyIteratorT d_output_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); using PromotedOffsetT = detail::promote_small_offset_t; return SortKeysCopyNoNVTX( d_temp_storage, temp_storage_bytes, d_input_keys, d_output_keys, num_items, compare_op, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_partition.cuh000066400000000000000000000645151463375617100212640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DevicePartition provides device-wide, parallel operations for //! partitioning sequences of data items residing within device-accessible memory. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DevicePartition provides device-wide, parallel operations for //! partitioning sequences of data items residing within device-accessible memory. //! //! Overview //! ++++++++++++++++++++++++++ //! //! These operations apply a selection criterion to construct a partitioned //! output sequence from items selected/unselected from a specified input //! sequence. //! //! Usage Considerations //! ++++++++++++++++++++++++++ //! //! @cdp_class{DevicePartition} //! //! Performance //! ++++++++++++++++++++++++++ //! //! @linear_performance{partition} //! //! @endrst struct DevicePartition { //! @rst //! Uses the ``d_flags`` sequence to split the corresponding items from //! ``d_in`` into a partitioned sequence ``d_out``. //! The total number of items copied into the first partition is written to ``d_num_selected_out``. //! //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). //! - Copies of the selected items are compacted into ``d_out`` and maintain //! their original relative ordering, however copies of the unselected //! items are compacted into the rear of ``d_out`` in reverse order. //! - The range ``[d_out, d_out + num_items)`` shall not overlap //! ``[d_in, d_in + num_items)`` nor ``[d_flags, d_flags + num_items)`` in any way. //! The range ``[d_in, d_in + num_items)`` may overlap ``[d_flags, d_flags + num_items)``. //! - @devicestorage //! //! Snippet //! ++++++++++++++++++++++++++ //! //! 
The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input, flags, and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] //! int *d_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! std::size_t temp_storage_bytes = 0; //! cub::DevicePartition::Flagged( //! d_temp_storage, temp_storage_bytes, //! d_in, d_flags, d_out, d_num_selected_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DevicePartition::Flagged( //! d_temp_storage, temp_storage_bytes, //! d_in, d_flags, d_out, d_num_selected_out, num_items); //! //! // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] //! // d_num_selected_out <-- [4] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam FlagIterator //! **[inferred]** Random-access input iterator type for reading selection flags @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing output items @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[in] d_flags //! Pointer to the input sequence of selection flags //! //! @param[out] d_out //! Pointer to the output sequence of partitioned data items //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected (i.e., the //! offset of the unselected partition) //! //! @param[in] num_items //! Total number of items to select from //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::Flagged"); using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) using DispatchSelectIfT = DispatchSelectIf; return DispatchSelectIfT::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, SelectOp{}, EqualityOp{}, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } //! @rst //! Uses the ``select_op`` functor to split the corresponding items from ``d_in`` into //! a partitioned sequence ``d_out``. The total number of items copied into the first partition is written //! to ``d_num_selected_out``. //! //! - Copies of the selected items are compacted into ``d_out`` and maintain //! their original relative ordering, however copies of the unselected //! items are compacted into the rear of ``d_out`` in reverse order. //! - The range ``[d_out, d_out + num_items)`` shall not overlap //! ``[d_in, d_in + num_items)`` in any way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Functor type for selecting values less than some criteria //! struct LessThan //! { //! int compare; //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! explicit LessThan(int compare) : compare(compare) {} //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! bool operator()(const int &a) const //! { //! return (a < compare); //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] //! int *d_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ ] //! LessThan select_op(7); //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! std::size_t temp_storage_bytes = 0; //! cub::DevicePartition::If( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, d_num_selected_out, num_items, select_op); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DevicePartition::If( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, d_num_selected_out, num_items, select_op); //! //! // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] //! // d_num_selected_out <-- [5] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! 
**[inferred]** Random-access output iterator type for writing output items @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @tparam SelectOp //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of ``d_temp_storage`` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output sequence of partitioned data items //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected (i.e., the offset of the unselected partition) //! //! @param[in] num_items //! Total number of items to select from //! //! @param[in] select_op //! Unary selection operator //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If"); using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) using DispatchSelectIfT = DispatchSelectIf; return DispatchSelectIfT::Dispatch( d_temp_storage, temp_storage_bytes, d_in, nullptr, d_out, d_num_selected_out, select_op, EqualityOp{}, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } private: template friend class DispatchSegmentedSort; // Internal version without NVTX range template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t IfNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { using OffsetT = int; using DispatchThreeWayPartitionIfT = DispatchThreeWayPartitionIf< InputIteratorT, FirstOutputIteratorT, SecondOutputIteratorT, UnselectedOutputIteratorT, NumSelectedIteratorT, SelectFirstPartOp, SelectSecondPartOp, OffsetT>; return DispatchThreeWayPartitionIfT::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, select_first_part_op, select_second_part_op, num_items, stream); } public: //! @rst //! Uses two functors to split the corresponding items from ``d_in`` into a three partitioned sequences //! 
``d_first_part_out``, ``d_second_part_out``, and ``d_unselected_out``. //! The total number of items copied into the first partition is written //! to ``d_num_selected_out[0]``, while the total number of items copied into the second partition is written //! to ``d_num_selected_out[1]``. //! //! - Copies of the items selected by ``select_first_part_op`` are compacted //! into ``d_first_part_out`` and maintain their original relative ordering. //! - Copies of the items selected by ``select_second_part_op`` are compacted //! into ``d_second_part_out`` and maintain their original relative ordering. //! - Copies of the unselected items are compacted into the ``d_unselected_out`` in reverse order. //! - The ranges ``[d_out, d_out + num_items)``, //! ``[d_first_part_out, d_first_part_out + d_num_selected_out[0])``, //! ``[d_second_part_out, d_second_part_out + d_num_selected_out[1])``, //! ``[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])``, //! shall not overlap in any way. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates how this algorithm can partition an //! input vector into small, medium, and large items so that the relative //! order of items remain deterministic. //! //! Let's consider any value that doesn't exceed six a small one. On the //! other hand, any value that exceeds 50 will be considered a large one. //! Since the value used to define a small part doesn't match one that //! defines the large part, the intermediate segment is implied. //! //! These definitions partition a value space into three categories. We want //! to preserve the order of items in which they appear in the input vector. //! Since the algorithm provides stable partitioning, this is possible. //! //! Since the number of items in each category is unknown beforehand, we need //! three output arrays of num_items elements each. To reduce the memory //! requirements, we can combine the output storage for two categories. //! //! Since each value falls precisely in one category, it's safe to add //! "large" values into the head of the shared output vector and the "middle" //! values into its tail. To add items into the tail of the output array, we //! can use ``thrust::reverse_iterator``. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Functor type for selecting values less than some criteria //! struct LessThan //! { //! int compare; //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! explicit LessThan(int compare) : compare(compare) {} //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! bool operator()(const int &a) const //! { //! return a < compare; //! } //! }; //! //! // Functor type for selecting values greater than some criteria //! struct GreaterThan //! { //! int compare; //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! explicit GreaterThan(int compare) : compare(compare) {} //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! bool operator()(const int &a) const //! { //! return a > compare; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] //! int *d_large_and_unselected_out; // e.g., [ , , , , , , , ] //! int *d_small_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ , ] //! thrust::reverse_iterator unselected_out(d_large_and_unselected_out + num_items); //! LessThan small_items_selector(7); //! 
GreaterThan large_items_selector(50); //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! std::size_t temp_storage_bytes = 0; //! cub::DevicePartition::If( //! d_temp_storage, temp_storage_bytes, //! d_in, d_large_and_medium_out, d_small_out, unselected_out, //! d_num_selected_out, num_items, //! large_items_selector, small_items_selector); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DevicePartition::If( //! d_temp_storage, temp_storage_bytes, //! d_in, d_large_and_medium_out, d_small_out, unselected_out, //! d_num_selected_out, num_items, //! large_items_selector, small_items_selector); //! //! // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ] //! // d_small_out <-- [ 0, 2, 3, 5, 2, , , ] //! // d_num_selected_out <-- [ 1, 5 ] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam FirstOutputIteratorT //! **[inferred]** Random-access output iterator type for writing output //! items selected by first operator @iterator //! //! @tparam SecondOutputIteratorT //! **[inferred]** Random-access output iterator type for writing output //! items selected by second operator @iterator //! //! @tparam UnselectedOutputIteratorT //! **[inferred]** Random-access output iterator type for writing //! unselected items @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items //! selected @iterator //! //! @tparam SelectFirstPartOp //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` //! //! @tparam SelectSecondPartOp //! **[inferred]** Selection functor type having member `bool operator()(const T &a)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_first_part_out //! Pointer to the output sequence of data items selected by `select_first_part_op` //! //! @param[out] d_second_part_out //! Pointer to the output sequence of data items selected by `select_second_part_op` //! //! @param[out] d_unselected_out //! Pointer to the output sequence of unselected data items //! //! @param[out] d_num_selected_out //! Pointer to the output array with two elements, where total number of //! items selected by `select_first_part_op` is stored as //! `d_num_selected_out[0]` and total number of items selected by //! `select_second_part_op` is stored as `d_num_selected_out[1]`, //! respectively //! //! @param[in] num_items //! Total number of items to select from //! //! @param[in] select_first_part_op //! Unary selection operator to select `d_first_part_out` //! //! @param[in] select_second_part_op //! Unary selection operator to select `d_second_part_out` //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DevicePartition::If"); return IfNoNVTX( d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, select_first_part_op, select_second_part_op, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If( d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, num_items, select_first_part_op, select_second_part_op, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_radix_sort.cuh000066400000000000000000004260741463375617100214330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceRadixSort provides device-wide, parallel operations for //! computing a radix sort across a sequence of data items residing within //! device-accessible memory. 
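// A minimal keys-only usage sketch, assuming d_keys_in and d_keys_out are
// caller-provided, device-accessible arrays of num_items keys. As with the
// other device-scope algorithms in CUB, the sort is invoked twice: first with
// a null d_temp_storage pointer to query the required temporary storage size,
// then again to perform the actual sort:
//
//   void*  d_temp_storage     = nullptr;
//   size_t temp_storage_bytes = 0;
//   cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
//                                  d_keys_in, d_keys_out, num_items);
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//   cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
//                                  d_keys_in, d_keys_out, num_items);
//   cudaFree(d_temp_storage);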
#pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @brief DeviceRadixSort provides device-wide, parallel operations for //! computing a radix sort across a sequence of data items residing //! within device-accessible memory. ![](sorting_logo.png) //! //! @par Overview //! The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) //! arranges items into ascending (or descending) order. The algorithm relies //! upon a positional representation for keys, i.e., each key is comprised of an //! ordered sequence of symbols (e.g., digits, characters, etc.) specified from //! least-significant to most-significant. For a given input sequence of keys //! and a set of rules specifying a total ordering of the symbolic alphabet, the //! radix sorting method produces a lexicographic ordering of those keys. //! //! @par Supported Types //! DeviceRadixSort can sort all of the built-in C++ numeric primitive types //! (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` //! and `__nv_bfloat16` 16-bit floating-point types. User-defined types are //! supported as long as decomposer object is provided. //! //! @par Floating-Point Special Cases //! //! - Positive and negative zeros are considered equivalent, and will be treated //! as such in the output. //! - No special handling is implemented for NaN values; these are sorted //! according to their bit representations after any transformations. //! //! @par Transformations //! Although the direct radix sorting method can only be applied to unsigned //! integral types, DeviceRadixSort is able to sort signed and floating-point //! types via simple bit-wise transformations that ensure lexicographic key //! ordering. Additional transformations occur for descending sorts. These //! transformations must be considered when restricting the //! `[begin_bit, end_bit)` range, as the bitwise transformations will occur //! before the bit-range truncation. //! //! Any transformations applied to the keys prior to sorting are reversed //! while writing to the final output buffer. //! //! @par Type Specific Bitwise Transformations //! To convert the input values into a radix-sortable bitwise representation, //! the following transformations take place prior to sorting: //! //! - For unsigned integral values, the keys are used directly. //! - For signed integral values, the sign bit is inverted. //! - For positive floating point values, the sign bit is inverted. //! - For negative floating point values, the full key is inverted. //! //! For floating point types, positive and negative zero are a special case and //! will be considered equivalent during sorting. //! //! @par Descending Sort Bitwise Transformations //! If descending sort is used, the keys are inverted after performing any //! type-specific transformations, and the resulting keys are sorted in ascending //! order. //! //! @par Stability //! DeviceRadixSort is stable. For floating-point types, `-0.0` and `+0.0` are //! considered equal and appear in the result in the same order as they appear in //! the input. //! //! @par Usage Considerations //! @cdp_class{DeviceRadixSort} //! //! @par Performance //! @linear_performance{radix sort} The following chart illustrates //! 
DeviceRadixSort::SortKeys performance across different CUDA architectures //! for uniform-random `uint32` keys. //! @plots_below //! //! @image html lsb_radix_sort_int32_keys.png struct DeviceRadixSort { private: template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::false_type, void* d_temp_storage, size_t& temp_storage_bytes, bool is_overwrite_okay, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream); template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::true_type, void* d_temp_storage, size_t& temp_storage_bytes, bool is_overwrite_okay, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream) { return DispatchRadixSort, DecomposerT>:: Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast(num_items), begin_bit, end_bit, is_overwrite_okay, stream, decomposer); } template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::false_type, void* d_temp_storage, size_t& temp_storage_bytes, bool is_overwrite_okay, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream); template CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort( ::cuda::std::true_type, void* d_temp_storage, size_t& temp_storage_bytes, bool is_overwrite_okay, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, DecomposerT decomposer, cudaStream_t stream) { constexpr int begin_bit = 0; const int end_bit = detail::radix::traits_t::default_end_bit(decomposer); return DeviceRadixSort::custom_radix_sort( ::cuda::std::true_type{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, num_items, decomposer, begin_bit, end_bit, stream); } // Name reported for NVTX ranges _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* { return "cub::DeviceRadixSort"; } public: //! @name KeyT-value pairs //@{ //! @brief Sorts key-value pairs into ascending order. //! (`~2N` auxiliary storage required) //! //! @par //! - The contents of the input data are not altered by the sorting operation. //! - Pointers to contiguous memory must be used; iterators are not currently //! supported. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys_in, d_keys_in + num_items)` //! - `[d_keys_out, d_keys_out + num_items)` //! - `[d_values_in, d_values_in + num_items)` //! - `[d_values_out, d_values_out + num_items)` //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! //! @par Performance //! The following charts illustrate saturated sorting performance across //! different CUDA architectures for uniform-random `uint32, uint32` and //! `uint64, uint64` pairs, respectively. //! //! @image html lsb_radix_sort_int32_pairs.png //! @image html lsb_radix_sort_int64_pairs.png //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of `int` //! keys with associated vector of `int` values. //! @par //! @code //! #include //! // or equivalently //! //! 
// Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [ ... ] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [ ... ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); //! //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] //! // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! //! @param[out] d_values_out //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! //! @param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., sizeof(unsigned int) * 8) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; // TODO API that doesn't accept decomposer should also contain a static // assert that the key type is fundamental. // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
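    // Keeping is_overwrite_okay == false therefore trades extra temporary
    // storage (an internal alternate buffer of roughly num_items keys plus
    // num_items values, hence the ~2N figure quoted above) for leaving the
    // caller's input arrays untouched; the DoubleBuffer overloads set the
    // flag and only need ~N auxiliary storage.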
constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast(num_items), begin_bit, end_bit, is_overwrite_okay, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream); } #endif //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-bits //! :end-before: example-end pairs-bits //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! 
Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! //! @param[out] d_values_out //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairs(void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! 
``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs //! :end-before: example-end pairs //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! //! @param[out] d_values_out //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairs(void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
constexpr bool is_overwrite_okay = false; constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } //! @brief Sorts key-value pairs into ascending order. //! (`~N` auxiliary storage required) //! //! @par //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the //! sorting operation. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys.Current(), d_keys.Current() + num_items)` //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - `[d_values.Current(), d_values.Current() + num_items)` //! - `[d_values.Alternate(), d_values.Alternate() + num_items)` //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! @par Performance //! The following charts illustrate saturated sorting performance across //! different CUDA architectures for uniform-random `uint32, uint32` and //! `uint64, uint64` pairs, respectively. //! //! @image html lsb_radix_sort_int32_pairs.png //! @image html lsb_radix_sort_int64_pairs.png //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of `int` //! keys with associated vector of `int` values. //! @par //! @code //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [ ... ] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [ ... ] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceRadixSort::SortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); //! //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] //! //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! 
@param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; constexpr bool is_overwrite_okay = true; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream); } #endif //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage. //! //! * The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! - ``[d_keys.Current(), d_keys.Current() + num_items)`` //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - @devicestorageP //! - @devicestorage //! //! Snippet //! ========================================================================== //! //! 
Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-db //! :end-before: example-end pairs-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairs(void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = false; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage. //! //! 
* The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! - ``[d_keys.Current(), d_keys.Current() + num_items)`` //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-bits-db //! :end-before: example-end pairs-bits-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! 
Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairs(void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = false; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //! @brief Sorts key-value pairs into descending order. //! (`~2N` auxiliary storage required). //! //! @par //! - The contents of the input data are not altered by the sorting operation. //! - Pointers to contiguous memory must be used; iterators are not currently //! supported. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys_in, d_keys_in + num_items)` //! - `[d_keys_out, d_keys_out + num_items)` //! - `[d_values_in, d_values_in + num_items)` //! - `[d_values_out, d_values_out + num_items)` //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! //! @par Performance //! Performance is similar to DeviceRadixSort::SortPairs. //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of `int` //! keys with associated vector of `int` values. //! @par //! @code //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [ ... ] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [ ... ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! 
// Run sorting operation //! cub::DeviceRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); //! //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] //! // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! //! @param[out] d_values_out //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! //! @param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream); } #endif //! @rst //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! 
any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-descending-bits //! :end-before: example-end pairs-descending-bits //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! //! @param[out] d_values_out //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. 
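//!
//! @par
//! For orientation, the following is a minimal usage sketch of this overload. It is
//! not taken from the CUB test suite: it assumes a ``custom_t`` with ``float f`` and
//! ``long long int lli`` members (as in the custom-type example referenced above) and
//! a hypothetical ``decomposer_t`` callable; adapt the names to your own types.
//! @code
//! // Hypothetical decomposer returning a tuple of references to the key's arithmetic
//! // members, most significant member first
//! struct decomposer_t
//! {
//!   __host__ __device__ ::cuda::std::tuple<long long int&, float&> operator()(custom_t& key) const
//!   {
//!     return {key.lli, key.f};
//!   }
//! };
//!
//! // Declare, allocate, and initialize device-accessible pointers for sorting data
//! int       num_items;    // number of key-value pairs
//! custom_t *d_keys_in;    // unsorted input keys
//! custom_t *d_keys_out;   // sorted output keys
//! int      *d_values_in;  // associated input values
//! int      *d_values_out; // correspondingly-reordered output values
//! ...
//!
//! // Only the bits covered by the decomposed members participate in the comparison
//! int begin_bit = 0;
//! int end_bit   = (sizeof(long long int) + sizeof(float)) * 8;
//!
//! // First call with d_temp_storage == nullptr queries the required temporary storage size
//! void  *d_temp_storage     = nullptr;
//! size_t temp_storage_bytes = 0;
//! cub::DeviceRadixSort::SortPairsDescending(
//!   d_temp_storage, temp_storage_bytes,
//!   d_keys_in, d_keys_out, d_values_in, d_values_out,
//!   num_items, decomposer_t{}, begin_bit, end_bit);
//!
//! // Allocate temporary storage and run the sort
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
//! cub::DeviceRadixSort::SortPairsDescending(
//!   d_temp_storage, temp_storage_bytes,
//!   d_keys_in, d_keys_out, d_values_in, d_values_out,
//!   num_items, decomposer_t{}, begin_bit, end_bit);
//! @endcode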
template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //! @rst //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-descending //! :end-before: example-end pairs-descending //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! 
The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! //! @param[out] d_values_out //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } //! @brief Sorts key-value pairs into descending order. //! (`~N` auxiliary storage required). //! //! @par //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the //! sorting operation. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys.Current(), d_keys.Current() + num_items)` //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - `[d_values.Current(), d_values.Current() + num_items)` //! - `[d_values.Alternate(), d_values.Alternate() + num_items)` //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! 
buffers now contains the sorted output sequence (a function of the number //! of key bits specified and the targeted device architecture). //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! @par Performance //! Performance is similar to DeviceRadixSort::SortPairs. //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of `int` //! keys with associated vector of `int` values. //! @par //! @code //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [ ... ] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [ ... ] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); //! //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] //! // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. 
using OffsetT = detail::choose_offset_t; constexpr bool is_overwrite_okay = true; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream); } #endif //! @rst //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage. //! //! * The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! - ``[d_keys.Current(), d_keys.Current() + num_items)`` //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - @devicestorageP //! - @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-descending-db //! :end-before: example-end pairs-descending-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! 
required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = true; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } //! @rst //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage. //! //! * The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! - ``[d_keys.Current(), d_keys.Current() + num_items)`` //! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. 
To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin pairs-descending-bits-db //! :end-before: example-end pairs-descending-bits-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam ValueT //! **[inferred]** ValueT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. 
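//!
//! @par
//! For orientation, a minimal usage sketch of this double-buffer overload follows. It
//! is not taken from the CUB test suite and reuses the hypothetical ``custom_t`` /
//! ``decomposer_t`` pair sketched earlier; the buffer names are illustrative only.
//! @code
//! // Declare, allocate, and initialize device-accessible pointers for sorting data
//! int       num_items;       // number of key-value pairs
//! custom_t *d_key_buf;       // unsorted input keys
//! custom_t *d_key_alt_buf;   // scratch key buffer
//! int      *d_value_buf;     // associated input values
//! int      *d_value_alt_buf; // scratch value buffer
//! ...
//!
//! // Wrap the pointer pairs; the "current" buffer of each DoubleBuffer holds the input
//! cub::DoubleBuffer<custom_t> d_keys(d_key_buf, d_key_alt_buf);
//! cub::DoubleBuffer<int>      d_values(d_value_buf, d_value_alt_buf);
//!
//! int begin_bit = 0;
//! int end_bit   = (sizeof(long long int) + sizeof(float)) * 8;
//!
//! // Query temporary storage size, allocate it, then sort
//! void  *d_temp_storage     = nullptr;
//! size_t temp_storage_bytes = 0;
//! cub::DeviceRadixSort::SortPairsDescending(
//!   d_temp_storage, temp_storage_bytes, d_keys, d_values,
//!   num_items, decomposer_t{}, begin_bit, end_bit);
//!
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
//! cub::DeviceRadixSort::SortPairsDescending(
//!   d_temp_storage, temp_storage_bytes, d_keys, d_values,
//!   num_items, decomposer_t{}, begin_bit, end_bit);
//!
//! // d_keys.Current() and d_values.Current() now reference the sorted output buffers
//! @endcode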
template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = true; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //@} end member group /******************************************************************/ /** * @name Keys-only *********************************************************************/ //@{ //! @brief Sorts keys into ascending order. //! (`~2N` auxiliary storage required) //! //! @par //! - The contents of the input data are not altered by the sorting operation. //! - Pointers to contiguous memory must be used; iterators are not currently //! supported. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys_in, d_keys_in + num_items)` //! - `[d_keys_out, d_keys_out + num_items)` //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! //! @par Performance //! The following charts illustrate saturated sorting performance across //! different CUDA architectures for uniform-random `uint32` and `uint64` //! keys, respectively. //! //! @image html lsb_radix_sort_int32_keys.png //! @image html lsb_radix_sort_int64_keys.png //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of //! `int` keys. //! @par //! @code //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [ ... ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); //! //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! 
required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); // Null value type DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, static_cast(num_items), begin_bit, end_bit, is_overwrite_okay, stream); } //! @rst //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeys``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-bits //! :end-before: example-end keys-bits //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! 
**[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //! @rst //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! 
* @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeys``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys //! :end-before: example-end keys //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
constexpr bool is_overwrite_okay = false; constexpr bool is_descending = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream); } #endif //! @brief Sorts keys into ascending order. (`~N` auxiliary storage required). //! //! @par //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys.Current(), d_keys.Current() + num_items)` //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! @par Performance //! The following charts illustrate saturated sorting performance across //! different CUDA architectures for uniform-random `uint32` and `uint64` //! keys, respectively. //! //! @image html lsb_radix_sort_int32_keys.png //! @image html lsb_radix_sort_int64_keys.png //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of //! `int` keys. //! @par //! @code //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [ ... ] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, num_items); //! //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! 
required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; constexpr bool is_overwrite_okay = true; // Null value type DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream); } #endif //! @rst //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage. //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! * The contents of both buffers may be altered by the sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! 
using ``cub::DeviceRadixSort::SortKeys``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-db //! :end-before: example-end keys-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = false; DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } //! @rst //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage. //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! * The contents of both buffers may be altered by the sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! * Upon completion, the sorting operation will update the "current" //! 
indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeys``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-bits-db //! :end-before: example-end keys-bits-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. 
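//!
//! @par
//! For orientation, a minimal usage sketch of this overload follows. It is not taken
//! from the CUB test suite and reuses the hypothetical ``custom_t`` / ``decomposer_t``
//! pair sketched earlier; the buffer names are illustrative only.
//! @code
//! // Declare, allocate, and initialize device-accessible pointers for sorting data
//! int       num_items;     // number of keys
//! custom_t *d_key_buf;     // unsorted input keys
//! custom_t *d_key_alt_buf; // scratch key buffer
//! ...
//!
//! // Wrap the pointer pair; the "current" buffer holds the input
//! cub::DoubleBuffer<custom_t> d_keys(d_key_buf, d_key_alt_buf);
//!
//! int begin_bit = 0;
//! int end_bit   = (sizeof(long long int) + sizeof(float)) * 8;
//!
//! // Query temporary storage size, allocate it, then sort
//! void  *d_temp_storage     = nullptr;
//! size_t temp_storage_bytes = 0;
//! cub::DeviceRadixSort::SortKeys(
//!   d_temp_storage, temp_storage_bytes, d_keys, num_items,
//!   decomposer_t{}, begin_bit, end_bit);
//!
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
//! cub::DeviceRadixSort::SortKeys(
//!   d_temp_storage, temp_storage_bytes, d_keys, num_items,
//!   decomposer_t{}, begin_bit, end_bit);
//!
//! // d_keys.Current() now references the buffer holding the keys in ascending order
//! @endcode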
template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeys(void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = false; DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //! @brief Sorts keys into descending order. //! (`~2N` auxiliary storage required). //! //! @par //! - The contents of the input data are not altered by the sorting operation. //! - Pointers to contiguous memory must be used; iterators are not currently //! supported. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys_in, d_keys_in + num_items)` //! - `[d_keys_out, d_keys_out + num_items)` //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! //! @par Performance //! Performance is similar to DeviceRadixSort::SortKeys. //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of //! `int` keys. //! @par //! @code //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [ ... ] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); //! //! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s //! //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! 
@param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream); } #endif //! @rst //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! 
:start-after: example-begin keys-descending-bits //! :end-before: example-end keys-descending-bits //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. constexpr bool is_overwrite_okay = false; constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //! @rst //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage. //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! 
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-descending //! :end-before: example-end keys-descending //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); // We cast away const-ness, but will *not* write to these arrays. // `DispatchRadixSort::Dispatch` will allocate temporary storage and // create a new double-buffer internally when the `is_overwrite_ok` flag // is not set. 
constexpr bool is_overwrite_okay = false; constexpr bool is_descending = true; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } //! @brief Sorts keys into descending order. //! (`~N` auxiliary storage required). //! //! @par //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! - `[d_keys.Current(), d_keys.Current() + num_items)` //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! @par Performance //! Performance is similar to DeviceRadixSort::SortKeys. //! //! @par Snippet //! The code snippet below illustrates the sorting of a device vector of `i`nt keys. //! @par //! @code //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [ ... ] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, num_items); //! //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] //! @endcode //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! Number of items to sort //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! 
comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Unsigned integer type for global offsets. using OffsetT = detail::choose_offset_t; constexpr bool is_overwrite_okay = true; // Null value type DoubleBuffer d_values; return DispatchRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream); } #endif //! @rst //! Sorts keys into descending order using :math:`\approx N` auxiliary storage. //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! * The contents of both buffers may be altered by the sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-descending-db //! :end-before: example-end keys-descending-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! 
The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, DecomposerT decomposer, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = true; DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, stream); } //! @rst //! Sorts keys into descending order using :math:`\approx N` auxiliary storage. //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! * The contents of both buffers may be altered by the sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)`` //! //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! * Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage //! //! Snippet //! ========================================================================== //! //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a //! tuple of references to relevant members of the key. 
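//! For readers of this header, where the ``literalinclude`` below is not
//! expanded, such a decomposer has roughly the following shape (an
//! illustrative sketch; the canonical definition lives in the referenced
//! test file):
//!
//! .. code-block:: c++
//!
//!    struct custom_t
//!    {
//!      float f;
//!      long long int lli;
//!    };
//!
//!    struct decomposer_t
//!    {
//!      __host__ __device__ ::cuda::std::tuple<float&, long long int&>
//!      operator()(custom_t& key) const
//!      {
//!        return {key.f, key.lli};
//!      }
//!    };
//!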
//! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin custom-type //! :end-before: example-end custom-type //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: //! :start-after: example-begin keys-descending-bits-db //! :end-before: example-end keys-descending-bits-db //! //! @endrst //! //! @tparam KeyT //! **[inferred]** KeyT type //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: //! ``::cuda::std::tuple operator()(KeyT &key)``. //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of //! references to its constituent arithmetic types. The leftmost element of //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! //! @param[in] stream //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, NumItemsT num_items, DecomposerT decomposer, int begin_bit, int end_bit, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // unsigned integer type for global offsets using offset_t = detail::choose_offset_t; using decomposer_check_t = detail::radix::decomposer_check_t; static_assert(decomposer_check_t::value, "DecomposerT must be a callable object returning a tuple of references to " "arithmetic types"); constexpr bool is_overwrite_okay = true; constexpr bool is_descending = true; DoubleBuffer d_values; return DeviceRadixSort::custom_radix_sort( decomposer_check_t{}, d_temp_storage, temp_storage_bytes, is_overwrite_okay, d_keys, d_values, static_cast(num_items), decomposer, begin_bit, end_bit, stream); } //! 
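//
// Note for all DoubleBuffer-based overloads above: the sorted output must be
// read through the buffer that is "current" on return. A minimal sketch
// (illustrative only, assuming the `custom_t`/`decomposer_t` pair from the
// snippets above):
//
//   cub::DoubleBuffer<custom_t> d_keys(d_key_buf, d_key_alt_buf);
//   cub::DeviceRadixSort::SortKeysDescending(
//     d_temp_storage, temp_storage_bytes, d_keys, num_items, decomposer_t{});
//   // ... after the second (sorting) call completes ...
//   custom_t *d_sorted  = d_keys.Current();   // sorted sequence
//   custom_t *d_scratch = d_keys.Alternate(); // scratch buffer
//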
@} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_reduce.cuh000066400000000000000000001304231463375617100205120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceReduce provides device-wide, parallel operations for //! computing a reduction across a sequence of data items residing within //! device-accessible memory. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceReduce provides device-wide, parallel operations for computing //! a reduction across a sequence of data items residing within //! device-accessible memory. //! //! .. image:: ../img/reduce_logo.png //! :align: center //! //! Overview //! ==================================== //! //! A `reduction `_ //! (or *fold*) uses a binary combining operator to compute a single aggregate //! from a sequence of input elements. //! //! Usage Considerations //! ==================================== //! //! @cdp_class{DeviceReduce} //! //! Performance //! ==================================== //! //! @linear_performance{reduction, reduce-by-key, and run-length encode} //! //! @endrst struct DeviceReduce { //! @rst //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``. //! //! - Does not support binary reduction operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! 
However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates a user-defined min-reduction of a //! device vector of ``int`` data elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // CustomMin functor //! struct CustomMin //! { //! template //! __host__ __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [-] //! CustomMin min_op; //! int init; // e.g., INT_MAX //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::Reduce( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, num_items, min_op, init); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run reduction //! cub::DeviceReduce::Reduce( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, num_items, min_op, init); //! //! // d_out <-- [0] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam ReductionOpT //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam T //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] reduction_op //! Binary reduction functor //! //! @param[in] init //! Initial value of the reduction //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Reduce"); // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), reduction_op, init, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, T init, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Reduce( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } //! @rst //! Computes a device-wide sum using the addition (``+``) operator. //! //! - Uses ``0`` as the initial value of the reduction. //! - Does not support ``+`` operators that are non-commutative.. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the sum-reduction of a device vector //! of ``int`` data elements. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [-] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::Sum( //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sum-reduction //! cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); //! //! // d_out <-- [38] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Sum"); // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; // The output value type using OutputT = cub::detail::non_void_value_t>; using InitT = OutputT; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), cub::Sum(), InitT{}, // zero-initialize stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } //! @rst //! Computes a device-wide minimum using the less-than (``<``) operator. //! //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction. //! - Does not support ``<`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [-] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::Min( //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run min-reduction //! cub::DeviceReduce::Min( //! d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); //! //! // d_out <-- [0] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min"); // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; // The input value type using InputT = cub::detail::value_t; using InitT = InputT; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), cub::Min(), // replace with // std::numeric_limits::max() when // C++11 support is more prevalent Traits::Max(), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } //! @rst //! Finds the first device-wide minimum using the less-than (``<``) operator, also returning the index of that item. //! //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` //! (assuming the value type of ``d_in`` is ``T``) //! //! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``. //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs //! //! - Does not support ``<`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - The range ``[d_in, d_in + num_items)`` shall not overlap `d_out`. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the argmin-reduction of a device vector //! of ``int`` data elements. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! KeyValuePair *d_argmin; // e.g., [{-,-}] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run argmin-reduction //! cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); //! //! // d_argmin <-- [{5, 0}] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items //! (of some type `T`) @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate //! (having value type `cub::KeyValuePair`) @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! 
Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMin"); // Signed integer type for global offsets using OffsetT = int; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; using AccumT = OutputTupleT; using InitT = detail::reduce::empty_problem_init_t; // The output value type using OutputValueT = typename OutputTupleT::Value; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 InitT initial_value{AccumT(1, Traits::Max())}; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } //! @rst //! Computes a device-wide maximum using the greater-than (``>``) operator. //! //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. //! - Does not support ``>`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_max; // e.g., [-] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run max-reduction //! cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); //! //! // d_max <-- [9] //! //! @endrst //! //! @tparam InputIteratorT //! 
**[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max"); // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; // The input value type using InputT = cub::detail::value_t; using InitT = InputT; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), cub::Max(), // replace with // std::numeric_limits::lowest() // when C++11 support is more // prevalent Traits::Lowest(), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } //! @rst //! Finds the first device-wide maximum using the greater-than (``>``) //! operator, also returning the index of that item //! //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` //! (assuming the value type of ``d_in`` is ``T``) //! //! - The maximum is written to ``d_out.value`` and its offset in the input //! array is written to ``d_out.key``. //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs //! //! - Does not support ``>`` operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the argmax-reduction of a device vector //! of `int` data elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! KeyValuePair *d_argmax; // e.g., [{-,-}] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! 
size_t temp_storage_bytes = 0; //! cub::DeviceReduce::ArgMax( //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run argmax-reduction //! cub::DeviceReduce::ArgMax( //! d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); //! //! // d_argmax <-- [{6, 9}] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate //! (having value type `cub::KeyValuePair`) @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ArgMax"); // Signed integer type for global offsets using OffsetT = int; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; using AccumT = OutputTupleT; // The output value type using OutputValueT = typename OutputTupleT::Value; using InitT = detail::reduce::empty_problem_init_t; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 InitT initial_value{AccumT(1, Traits::Lowest())}; return DispatchReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } //! @rst //! Fuses transform and reduce operations //! //! - Does not support binary reduction operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates a user-defined min-reduction of a //! 
device vector of `int` data elements. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! thrust::device_vector in = { 1, 2, 3, 4 }; //! thrust::device_vector out(1); //! //! std::size_t temp_storage_bytes = 0; //! std::uint8_t *d_temp_storage = nullptr; //! //! const int init = 42; //! //! cub::DeviceReduce::TransformReduce( //! d_temp_storage, //! temp_storage_bytes, //! in.begin(), //! out.begin(), //! in.size(), //! cub::Sum{}, //! square_t{}, //! init); //! //! thrust::device_vector temp_storage(temp_storage_bytes); //! d_temp_storage = temp_storage.data().get(); //! //! cub::DeviceReduce::TransformReduce( //! d_temp_storage, //! temp_storage_bytes, //! in.begin(), //! out.begin(), //! in.size(), //! cub::Sum{}, //! square_t{}, //! init); //! //! // out[0] <-- 72 //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam ReductionOpT //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam TransformOpT //! **[inferred]** Unary reduction functor type having member `auto operator()(const T &a)` //! //! @tparam T //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] reduction_op //! Binary reduction functor //! //! @param[in] transform_op //! Unary transform functor //! //! @param[in] init //! Initial value of the reduction //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t TransformReduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, TransformOpT transform_op, T init, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::TransformReduce"); using OffsetT = detail::choose_offset_t; return DispatchTransformReduce::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), reduction_op, init, stream, transform_op); } //! @rst //! Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. //! //! This operation computes segmented reductions within ``d_values_in`` using the specified binary ``reduction_op`` //! functor. The segments are identified by "runs" of corresponding keys in `d_keys_in`, where runs are maximal //! ranges of consecutive, identical keys. For the *i*\ :sup:`th` run encountered, the first key of the run and //! the corresponding value aggregate of that run are written to ``d_unique_out[i]`` and ``d_aggregates_out[i]``, //! respectively. The total number of runs encountered is written to ``d_num_runs_out``. //! //! 
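//! For example, summing the values within each run with ``cub::Sum`` behaves
//! as follows (a minimal sketch; allocation, temporary-storage query, and
//! error handling omitted):
//!
//! .. code-block:: c++
//!
//!    // num_items == 6
//!    // d_keys_in:   [0, 0, 1, 1, 1, 2]
//!    // d_values_in: [1, 2, 3, 4, 5, 6]
//!    cub::DeviceReduce::ReduceByKey(
//!      d_temp_storage, temp_storage_bytes,
//!      d_keys_in, d_unique_out, d_values_in,
//!      d_aggregates_out, d_num_runs_out, cub::Sum{}, num_items);
//!    // d_unique_out     <-- [0, 1, 2]
//!    // d_aggregates_out <-- [3, 12, 6]
//!    // d_num_runs_out   <-- [3]
//!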
- The ``==`` equality operator is used to determine whether keys are equivalent //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - Let ``out`` be any of //! ``[d_unique_out, d_unique_out + *d_num_runs_out)`` //! ``[d_aggregates_out, d_aggregates_out + *d_num_runs_out)`` //! ``d_num_runs_out``. The ranges represented by ``out`` shall not overlap //! ``[d_keys_in, d_keys_in + num_items)``, //! ``[d_values_in, d_values_in + num_items)`` nor ``out`` in any way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the segmented reduction of ``int`` values grouped by runs of //! associated ``int`` keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 8 //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] //! int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] //! int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] //! int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] //! int *d_num_runs_out; // e.g., [-] //! CustomMin reduction_op; //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceReduce::ReduceByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_unique_out, d_values_in, //! d_aggregates_out, d_num_runs_out, reduction_op, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run reduce-by-key //! cub::DeviceReduce::ReduceByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_unique_out, d_values_in, //! d_aggregates_out, d_num_runs_out, reduction_op, num_items); //! //! // d_unique_out <-- [0, 2, 9, 5, 8] //! // d_aggregates_out <-- [0, 1, 6, 2, 4] //! // d_num_runs_out <-- [5] //! //! @endrst //! //! @tparam KeysInputIteratorT //! **[inferred]** Random-access input iterator type for reading input keys @iterator //! //! @tparam UniqueOutputIteratorT //! **[inferred]** Random-access output iterator type for writing unique output keys @iterator //! //! @tparam ValuesInputIteratorT //! **[inferred]** Random-access input iterator type for reading input values @iterator //! //! @tparam AggregatesOutputIterator //! **[inferred]** Random-access output iterator type for writing output value aggregates @iterator //! //! @tparam NumRunsOutputIteratorT //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator //! //! @tparam ReductionOpT //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! 
Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input sequence of keys //! //! @param[out] d_unique_out //! Pointer to the output sequence of unique keys (one key per run) //! //! @param[in] d_values_in //! Pointer to the input sequence of corresponding values //! //! @param[out] d_aggregates_out //! Pointer to the output sequence of value aggregates //! (one aggregate per run) //! //! @param[out] d_num_runs_out //! Pointer to total number of runs encountered //! (i.e., the length of `d_unique_out`) //! //! @param[in] reduction_op //! Binary reduction functor //! //! @param[in] num_items //! Total number of associated key+value pairs //! (i.e., the length of `d_in_keys` and `d_in_values`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::ReduceByKey"); // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; // FlagT iterator type (not used) // Selection op (not used) // Default == operator typedef Equality EqualityOp; return DispatchReduceByKey< KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, EqualityOp(), reduction_op, static_cast(num_items), stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t ReduceByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ReductionOpT reduction_op, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ReduceByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_run_length_encode.cuh000066400000000000000000000377651463375617100227440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceRunLengthEncode provides device-wide, parallel operations //! for computing a run-length encoding across a sequence of data items //! residing within device-accessible memory. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceRunLengthEncode provides device-wide, parallel operations for //! demarcating "runs" of same-valued items within a sequence residing //! within device-accessible memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! A `run-length encoding `_ //! computes a simple compressed representation of a sequence of input elements //! such that each maximal "run" of consecutive same-valued data items is //! encoded as a single data value along with a count of the elements in that //! run. //! //! Usage Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @cdp_class{DeviceRunLengthEncode} //! //! Performance //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @linear_performance{run-length encode} //! //! @endrst struct DeviceRunLengthEncode { //! @rst //! Computes a run-length encoding of the sequence ``d_in``. //! //! - For the *i*\ :sup:`th` run encountered, the first key of the run and //! its length are written to ``d_unique_out[i]`` and ``d_counts_out[i]``, respectively. //! - The total number of runs encountered is written to ``d_num_runs_out``. //! - The ``==`` equality operator is used to determine whether values are equivalent //! - In-place operations are not supported. There must be no overlap between any of the provided ranges: //! //! - ``[d_unique_out, d_unique_out + *d_num_runs_out)`` //! - ``[d_counts_out, d_counts_out + *d_num_runs_out)`` //! - ``[d_num_runs_out, d_num_runs_out + 1)`` //! - ``[d_in, d_in + num_items)`` //! //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the run-length encoding of a sequence of ``int`` values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] //! 
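//!    // (runs in d_in: [0], [2, 2], [9], [5, 5, 5], [8] -- five runs in total)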
int *d_unique_out; // e.g., [ , , , , , , , ] //! int *d_counts_out; // e.g., [ , , , , , , , ] //! int *d_num_runs_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRunLengthEncode::Encode( //! d_temp_storage, temp_storage_bytes, //! d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run encoding //! cub::DeviceRunLengthEncode::Encode( //! d_temp_storage, temp_storage_bytes, //! d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); //! //! // d_unique_out <-- [0, 2, 9, 5, 8] //! // d_counts_out <-- [1, 2, 1, 3, 1] //! // d_num_runs_out <-- [5] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam UniqueOutputIteratorT //! **[inferred]** Random-access output iterator type for writing unique output items @iterator //! //! @tparam LengthsOutputIteratorT //! **[inferred]** Random-access output iterator type for writing output counts @iterator //! //! @tparam NumRunsOutputIteratorT //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of keys //! //! @param[out] d_unique_out //! Pointer to the output sequence of unique keys (one key per run) //! //! @param[out] d_counts_out //! Pointer to the output sequence of run-lengths (one count per run) //! //! @param[out] d_num_runs_out //! Pointer to total number of runs //! //! @param[in] num_items //! Total number of associated key+value pairs (i.e., the length of `d_in_keys` and `d_in_values`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Encode( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, LengthsOutputIteratorT d_counts_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::Encode"); using offset_t = int; // Signed integer type for global offsets using equality_op = Equality; // Default == operator using reduction_op = cub::Sum; // Value reduction operator // The lengths output value type using length_t = cub::detail::non_void_value_t; // Generator type for providing 1s values for run-length reduction using lengths_input_iterator_t = ConstantInputIterator; using accum_t = detail::accumulator_t; using key_t = cub::detail::non_void_value_t>; using policy_t = detail::device_run_length_encode_policy_hub; return DispatchReduceByKey< InputIteratorT, UniqueOutputIteratorT, lengths_input_iterator_t, LengthsOutputIteratorT, NumRunsOutputIteratorT, equality_op, reduction_op, offset_t, accum_t, policy_t>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, lengths_input_iterator_t((length_t) 1), d_counts_out, d_num_runs_out, equality_op(), reduction_op(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Encode( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, UniqueOutputIteratorT d_unique_out, LengthsOutputIteratorT d_counts_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Encode( d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream); } //! @rst //! Enumerates the starting offsets and lengths of all non-trivial runs //! (of ``length > 1``) of same-valued keys in the sequence ``d_in``. //! //! - For the *i*\ :sup:`th` non-trivial run, the run's starting offset and //! its length are written to ``d_offsets_out[i]`` and ``d_lengths_out[i]``, respectively. //! - The total number of runs encountered is written to ``d_num_runs_out``. //! - The ``==`` equality operator is used to determine whether values are equivalent //! - In-place operations are not supported. There must be no overlap between any of the provided ranges: //! //! - ``[d_offsets_out, d_offsets_out + *d_num_runs_out)`` //! - ``[d_lengths_out, d_lengths_out + *d_num_runs_out)`` //! - ``[d_num_runs_out, d_num_runs_out + 1)`` //! - ``[d_in, d_in + num_items)`` //! //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the identification of non-trivial runs //! within a sequence of ``int`` values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] //! int *d_offsets_out; // e.g., [ , , , , , , , ] //! int *d_lengths_out; // e.g., [ , , , , , , , ] //! int *d_num_runs_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceRunLengthEncode::NonTrivialRuns( //! d_temp_storage, temp_storage_bytes, //! d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); //! //! 
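//!    // (the call above performs no work; it only writes the required size into
//!    //  temp_storage_bytes, and a production code path would also check the
//!    //  returned cudaError_t)
//!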
// Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run encoding //! cub::DeviceRunLengthEncode::NonTrivialRuns( //! d_temp_storage, temp_storage_bytes, //! d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); //! //! // d_offsets_out <-- [1, 4] //! // d_lengths_out <-- [2, 3] //! // d_num_runs_out <-- [2] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OffsetsOutputIteratorT //! **[inferred]** Random-access output iterator type for writing run-offset values @iterator //! //! @tparam LengthsOutputIteratorT //! **[inferred]** Random-access output iterator type for writing run-length values @iterator //! //! @tparam NumRunsOutputIteratorT //! **[inferred]** Output iterator type for recording the number of runs encountered @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to input sequence of data items //! //! @param[out] d_offsets_out //! Pointer to output sequence of run-offsets //! (one offset per non-trivial run) //! //! @param[out] d_lengths_out //! Pointer to output sequence of run-lengths (one count per non-trivial run) //! //! @param[out] d_num_runs_out //! Pointer to total number of runs (i.e., length of `d_offsets_out`) //! //! @param[in] num_items //! Total number of associated key+value pairs (i.e., the length of `d_in_keys` and `d_in_values`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t NonTrivialRuns( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceRunLengthEncode::NonTrivialRuns"); using OffsetT = int; // Signed integer type for global offsets using EqualityOp = Equality; // Default == operator return DeviceRleDispatch< InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t NonTrivialRuns( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return NonTrivialRuns( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_scan.cuh000066400000000000000000002236311463375617100201730ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across //! a sequence of data items residing within device-accessible memory. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceScan provides device-wide, parallel operations for computing a //! prefix scan across a sequence of data items residing within //! device-accessible memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! Given a sequence of input elements and a binary reduction operator, a //! `prefix scan `_ produces an output //! sequence where each element is computed to be the reduction of the elements //! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan //! with the addition operator. The term *inclusive* indicates that the //! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input. //! The term *exclusive* indicates the *i*\ :sup:`th` input is not //! incorporated into the *i*\ :sup:`th` output reduction. When the input and //! output sequences are the same, the scan is performed in-place. //! //! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our //! *"decoupled look-back"* algorithm for performing global prefix scan with //! only a single pass through the input data, as described in our 2016 technical //! report [1]_. The central idea is to leverage a small, constant factor of //! redundant work in order to overlap the latencies of global prefix //! propagation with local computation. As such, our algorithm requires only //! ``~2*n*`` data movement (``n`` inputs are read, ``n`` outputs are written), and //! typically proceeds at "memcpy" speeds. 
Our algorithm supports inplace operations. //! //! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back //! `_, //! *NVIDIA Technical Report NVR-2016-002*, 2016. //! //! Usage Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @cdp_class{DeviceScan} //! //! Performance //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @linear_performance{prefix scan} //! //! @endrst struct DeviceScan { //! @name Exclusive scans //! @{ //! @rst //! Computes a device-wide exclusive prefix sum. //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``. //! //! - Supports non-commutative sum operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` //! shall not overlap in any other way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix sum of an ``int`` //! device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [ , , , , , , ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix sum //! cub::DeviceScan::ExclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, num_items); //! //! // d_out <-- [0, 8, 14, 21, 26, 29, 29] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Random-access iterator to the input sequence of data items //! //! @param[out] d_out //! Random-access iterator to the output sequence of data items //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); // Signed integer type for global offsets using OffsetT = int; using InitT = cub::detail::value_t; // Initial value InitT init_value{}; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), detail::InputValue(init_value), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } //! @rst //! Computes a device-wide exclusive prefix sum in-place. //! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``. //! //! - Supports non-commutative sum operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix sum of an ``int`` //! device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_data, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix sum //! cub::DeviceScan::ExclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_data, num_items); //! //! // d_data <-- [0, 8, 14, 21, 26, 29, 29] //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access iterator type for reading scan inputs and wrigin scan outputs //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_data //! Random-access iterator to the sequence of data items //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) { return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } //! @rst //! Computes a device-wide exclusive prefix scan using the specified //! binary ``scan_op`` functor. The ``init_value`` value is applied as //! the initial value, and is assigned to ``*d_out``. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` //! shall not overlap in any other way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [ , , , , , , ] //! CustomMin min_op; //! ... //! //! // Determine temporary device storage requirements for exclusive //! // prefix scan //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, min_op, (int) INT_MAX, num_items); //! //! // Allocate temporary storage for exclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix min-scan //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, min_op, (int) INT_MAX, num_items); //! //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam InitValueT //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! 
Random-access iterator to the input sequence of data items //! //! @param[out] d_out //! Random-access iterator to the output sequence of data items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] init_value //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // Signed integer type for global offsets using OffsetT = int; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } //! @rst //! Computes a device-wide exclusive prefix scan using the specified //! binary ``scan_op`` functor. The ``init_value`` value is applied as //! the initial value, and is assigned to ``*d_data``. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix min-scan of an //! ``int`` device vector: //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] //! CustomMin min_op; //! ... //! //! // Determine temporary device storage requirements for exclusive //! // prefix scan //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_data, min_op, (int) INT_MAX, num_items); //! //! // Allocate temporary storage for exclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix min-scan //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_data, min_op, (int) INT_MAX, num_items); //! //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! //! @tparam ScanOp //! 
**[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam InitValueT //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_data //! Random-access iterator to the sequence of data items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] init_value //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } //! @rst //! Computes a device-wide exclusive prefix scan using the specified //! binary ``scan_op`` functor. The ``init_value`` value is provided as a future value. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. //! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` //! shall not overlap in any other way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [ , , , , , , ] //! int *d_init_iter; // e.g., INT_MAX //! CustomMin min_op; //! //! auto future_init_value = //! cub::FutureValue(d_init_iter); //! //! ... //! //! // Determine temporary device storage requirements for exclusive //! // prefix scan //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! 
d_in, d_out, min_op, future_init_value, num_items); //! //! // Allocate temporary storage for exclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix min-scan //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, min_op, future_init_value, num_items); //! //! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam InitValueT //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output sequence of data items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] init_value //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); // Signed integer type for global offsets using OffsetT = int; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, detail::InputValue(init_value), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } //! @rst //! Computes a device-wide exclusive prefix scan using the specified binary ``scan_op`` functor. //! The ``init_value`` value is provided as a future value. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! 
{ //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_init_iter; // e.g., INT_MAX //! CustomMin min_op; //! //! auto future_init_value = //! cub::FutureValue(d_init_iter); //! //! ... //! //! // Determine temporary device storage requirements for exclusive //! // prefix scan //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_data, min_op, future_init_value, num_items); //! //! // Allocate temporary storage for exclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix min-scan //! cub::DeviceScan::ExclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_data, min_op, future_init_value, num_items); //! //! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam InitValueT //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_data //! Pointer to the sequence of data items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] init_value //! Initial value to seed the exclusive scan (and is assigned to `*d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScan( d_temp_storage, temp_storage_bytes, d_data, scan_op, init_value, num_items, stream); } //! @} end member group //! @name Inclusive scans //! @{ //! @rst //! Computes a device-wide inclusive prefix sum. //! //! - Supports non-commutative sum operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The //! 
range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` //! shall not overlap in any other way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [ , , , , , , ] //! ... //! //! // Determine temporary device storage requirements for inclusive //! // prefix sum //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::InclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, num_items); //! //! // Allocate temporary storage for inclusive prefix sum //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run inclusive prefix sum //! cub::DeviceScan::InclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, num_items); //! //! // d_out <-- [8, 14, 21, 26, 29, 29, 38] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Random-access iterator to the input sequence of data items //! //! @param[out] d_out //! Random-access iterator to the output sequence of data items //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); // Signed integer type for global offsets using OffsetT = int; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveSum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream); } //! @rst //! Computes a device-wide inclusive prefix sum in-place. //! //! - Supports non-commutative sum operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector. //! //! .. code-block:: c++ //! 
//! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] //! ... //! //! // Determine temporary device storage requirements for inclusive //! // prefix sum //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::InclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_data, num_items); //! //! // Allocate temporary storage for inclusive prefix sum //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run inclusive prefix sum //! cub::DeviceScan::InclusiveSum( //! d_temp_storage, temp_storage_bytes, //! d_data, num_items); //! //! // d_data <-- [8, 14, 21, 26, 29, 29, 38] //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_data //! Random-access iterator to the sequence of data items //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) { return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, num_items, stream); } //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The //! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)`` //! shall not overlap in any other way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_out; // e.g., [ , , , , , , ] //! CustomMin min_op; //! ... //! //! 
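//!    // (the "..." above stands for allocating d_in/d_out and copying the input
//!    //  values to the device, which is omitted here)
//!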
// Determine temporary device storage requirements for inclusive //! // prefix scan //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::InclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, min_op, num_items); //! //! // Allocate temporary storage for inclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run inclusive prefix min-scan //! cub::DeviceScan::InclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, min_op, num_items); //! //! // d_out <-- [8, 6, 6, 5, 3, 0, 0] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to //! `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Random-access iterator to the input sequence of data items //! //! @param[out] d_out //! Random-access iterator to the output sequence of data items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); // Signed integer type for global offsets using OffsetT = int; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveScan( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream); } //! @rst //! Computes a device-wide inclusive prefix scan using the specified binary ``scan_op`` functor. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! 
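//!    // (note: DeviceScan invokes the scan operator from device code, so a
//!    //  user-defined functor such as CustomMin must be invocable on the device)
//!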
// Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] //! CustomMin min_op; //! ... //! //! // Determine temporary device storage requirements for inclusive //! // prefix scan //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::InclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_data, min_op, num_items); //! //! // Allocate temporary storage for inclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run inclusive prefix min-scan //! cub::DeviceScan::InclusiveScan( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, min_op, num_items); //! //! // d_data <-- [8, 6, 6, 5, 3, 0, 0] //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to //! `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_data //! Random-access iterator to the sequence of data items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, int num_items, cudaStream_t stream = 0) { return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, scan_op, num_items, stream); } //! @rst //! Computes a device-wide exclusive prefix sum-by-key with key equality //! defined by ``equality_op``. The value of ``0`` is applied as the initial //! value, and is assigned to the beginning of each segment in ``d_values_out``. //! //! - Supports non-commutative sum operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - ``d_keys_in`` may equal ``d_values_out`` but the range //! ``[d_keys_in, d_keys_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - ``d_values_in`` may equal ``d_values_out`` but the range //! ``[d_values_in, d_values_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector. //! //! .. 
code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_values_out; // e.g., [ , , , , , , ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = nullptr; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveSumByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix sum //! cub::DeviceScan::ExclusiveSumByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, num_items); //! //! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] //! //! @endrst //! //! @tparam KeysInputIteratorT //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator //! //! @tparam ValuesInputIteratorT //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator //! //! @tparam ValuesOutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator //! //! @tparam EqualityOpT //! **[inferred]** Functor type having member //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Random-access input iterator to the input sequence of key items //! //! @param[in] d_values_in //! Random-access input iterator to the input sequence of value items //! //! @param[out] d_values_out //! Random-access output iterator to the output sequence of value items //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) //! //! @param[in] equality_op //! Binary functor that defines the equality of keys. //! Default is cub::Equality(). //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
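//!
//! For reference, a fuller host-side sketch of the snippet above, including the
//! device allocations and copies that the snippet elides. The example data and
//! the omission of CUDA error checking are illustrative simplifications, not
//! requirements of the API:
//!
//! .. code-block:: c++
//!
//!    #include <cub/device/device_scan.cuh>   // or <cub/cub.cuh>
//!
//!    int main()
//!    {
//!      // Example problem: 7 values grouped into 3 key-delimited segments
//!      int h_keys[]   = {0, 0, 1, 1, 1, 2, 2};
//!      int h_values[] = {8, 6, 7, 5, 3, 0, 9};
//!      int num_items  = 7;
//!
//!      // Device allocations for keys, values, and the result
//!      int *d_keys_in, *d_values_in, *d_values_out;
//!      cudaMalloc(&d_keys_in,    num_items * sizeof(int));
//!      cudaMalloc(&d_values_in,  num_items * sizeof(int));
//!      cudaMalloc(&d_values_out, num_items * sizeof(int));
//!      cudaMemcpy(d_keys_in,   h_keys,   num_items * sizeof(int), cudaMemcpyHostToDevice);
//!      cudaMemcpy(d_values_in, h_values, num_items * sizeof(int), cudaMemcpyHostToDevice);
//!
//!      // First call sizes the temporary storage; second call performs the scan
//!      void  *d_temp_storage     = nullptr;
//!      size_t temp_storage_bytes = 0;
//!      cub::DeviceScan::ExclusiveSumByKey(
//!        d_temp_storage, temp_storage_bytes,
//!        d_keys_in, d_values_in, d_values_out, num_items);
//!      cudaMalloc(&d_temp_storage, temp_storage_bytes);
//!      cub::DeviceScan::ExclusiveSumByKey(
//!        d_temp_storage, temp_storage_bytes,
//!        d_keys_in, d_values_in, d_values_out, num_items);
//!
//!      // Retrieve the result: d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
//!      int h_result[7];
//!      cudaMemcpy(h_result, d_values_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
//!
//!      cudaFree(d_keys_in);    cudaFree(d_values_in);
//!      cudaFree(d_values_out); cudaFree(d_temp_storage);
//!      return 0;
//!    }
//!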
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey"); // Signed integer type for global offsets using OffsetT = int; using InitT = cub::detail::value_t; // Initial value InitT init_value{}; return DispatchScanByKey< KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, Sum, InitT, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, Sum(), init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveSumByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } //! @rst //! Computes a device-wide exclusive prefix scan-by-key using the //! specified binary ``scan_op`` functor. The key equality is defined by //! ``equality_op``. The ``init_value`` value is applied as the initial //! value, and is assigned to the beginning of each segment in ``d_values_out``. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - ``d_keys_in`` may equal ``d_values_out`` but the range //! ``[d_keys_in, d_keys_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - ``d_values_in`` may equal ``d_values_out`` but the range //! ``[d_values_in, d_values_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // CustomEqual functor //! struct CustomEqual //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return a == b; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_values_out; // e.g., [ , , , , , , ] //! CustomMin min_op; //! CustomEqual equality_op; //! ... //! //! // Determine temporary device storage requirements for exclusive //! // prefix scan //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::ExclusiveScanByKey( //! 
d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, min_op, //! (int) INT_MAX, num_items, equality_op); //! //! // Allocate temporary storage for exclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run exclusive prefix min-scan //! cub::DeviceScan::ExclusiveScanByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, min_op, //! (int) INT_MAX, num_items, equality_op); //! //! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0] //! //! @endrst //! //! @tparam KeysInputIteratorT //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator //! //! @tparam ValuesInputIteratorT //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator //! //! @tparam ValuesOutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam InitValueT //! **[inferred]** Type of the `init_value` value used in Binary scan //! functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam EqualityOpT //! **[inferred]** Functor type having member //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Random-access input iterator to the input sequence of key items //! //! @param[in] d_values_in //! Random-access input iterator to the input sequence of value items //! //! @param[out] d_values_out //! Random-access output iterator to the output sequence of value items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] init_value //! Initial value to seed the exclusive scan (and is assigned to the //! beginning of each segment in `d_values_out`) //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_keys_in` and //! `d_values_in`) //! //! @param[in] equality_op //! Binary functor that defines the equality of keys. //! Default is cub::Equality(). //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
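//!
//! The custom functors above are shown for generality. A minimal sketch of the
//! same call using CUB's built-in operators follows; treating ``cub::Min`` and
//! ``cub::Equality`` as drop-in replacements for the custom functors is an
//! assumption that holds for plain ``int`` data, and the device pointers are
//! assumed to be allocated and initialized as in the snippet above:
//!
//! .. code-block:: c++
//!
//!    #include <cub/cub.cuh>   // provides cub::Min and cub::Equality
//!    #include <limits.h>      // for INT_MAX
//!
//!    // d_keys_in, d_values_in, d_values_out: device pointers; num_items = 7
//!    void  *d_temp_storage     = nullptr;
//!    size_t temp_storage_bytes = 0;
//!
//!    // Size the temporary storage
//!    cub::DeviceScan::ExclusiveScanByKey(
//!      d_temp_storage, temp_storage_bytes,
//!      d_keys_in, d_values_in, d_values_out,
//!      cub::Min(), INT_MAX, num_items, cub::Equality());
//!
//!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
//!
//!    // Each segment starts from the init value INT_MAX and then carries the
//!    // running minimum of the preceding items within that segment
//!    cub::DeviceScan::ExclusiveScanByKey(
//!      d_temp_storage, temp_storage_bytes,
//!      d_keys_in, d_values_in, d_values_out,
//!      cub::Min(), INT_MAX, num_items, cub::Equality());
//!
//!    // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
//!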
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, InitValueT init_value, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey"); // Signed integer type for global offsets using OffsetT = int; return DispatchScanByKey< KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, ScanOpT, InitValueT, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, InitValueT init_value, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ExclusiveScanByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, init_value, num_items, equality_op, stream); } //! @rst //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``. //! //! - Supports non-commutative sum operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - ``d_keys_in`` may equal ``d_values_out`` but the range //! ``[d_keys_in, d_keys_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - ``d_values_in`` may equal ``d_values_out`` but the range //! ``[d_values_in, d_values_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_values_out; // e.g., [ , , , , , , ] //! ... //! //! // Determine temporary device storage requirements for inclusive prefix sum //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::InclusiveSumByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, num_items); //! //! // Allocate temporary storage for inclusive prefix sum //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run inclusive prefix sum //! cub::DeviceScan::InclusiveSumByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, num_items); //! //! // d_out <-- [8, 14, 7, 12, 15, 0, 9] //! //! @endrst //! //! @tparam KeysInputIteratorT //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator //! //! @tparam ValuesInputIteratorT //! 
**[inferred]** Random-access input iterator type for reading scan values inputs @iterator //! //! @tparam ValuesOutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator //! //! @tparam EqualityOpT //! **[inferred]** Functor type having member //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Random-access input iterator to the input sequence of key items //! //! @param[in] d_values_in //! Random-access input iterator to the input sequence of value items //! //! @param[out] d_values_out //! Random-access output iterator to the output sequence of value items //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) //! //! @param[in] equality_op //! Binary functor that defines the equality of keys. //! Default is cub::Equality(). //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey"); // Signed integer type for global offsets using OffsetT = int; return DispatchScanByKey< KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, Sum, NullType, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, Sum(), NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveSumByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, num_items, equality_op, stream); } //! @rst //! Computes a device-wide inclusive prefix scan-by-key using the //! specified binary ``scan_op`` functor. The key equality is defined by ``equality_op``. //! //! - Supports non-commutative scan operators. //! - Results are not deterministic for pseudo-associative operators (e.g., //! addition of floating-point types). Results for pseudo-associative //! operators may vary from run to run. Additional details can be found in //! the @lookback description. //! - ``d_keys_in`` may equal ``d_values_out`` but the range //! ``[d_keys_in, d_keys_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - ``d_values_in`` may equal ``d_values_out`` but the range //! ``[d_values_in, d_values_in + num_items)`` and the range //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. //! - @devicestorage //! //! Snippet //! 
+++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! #include // for INT_MAX //! //! // CustomMin functor //! struct CustomMin //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return (b < a) ? b : a; //! } //! }; //! //! // CustomEqual functor //! struct CustomEqual //! { //! template //! CUB_RUNTIME_FUNCTION __forceinline__ //! T operator()(const T &a, const T &b) const { //! return a == b; //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers for //! // input and output //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] //! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_values_out; // e.g., [ , , , , , , ] //! CustomMin min_op; //! CustomEqual equality_op; //! ... //! //! // Determine temporary device storage requirements for inclusive prefix scan //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceScan::InclusiveScanByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); //! //! // Allocate temporary storage for inclusive prefix scan //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run inclusive prefix min-scan //! cub::DeviceScan::InclusiveScanByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op); //! //! // d_out <-- [8, 6, 7, 5, 3, 0, 0] //! //! @endrst //! //! @tparam KeysInputIteratorT //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator //! //! @tparam ValuesInputIteratorT //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator //! //! @tparam ValuesOutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam EqualityOpT //! **[inferred]** Functor type having member //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Random-access input iterator to the input sequence of key items //! //! @param[in] d_values_in //! Random-access input iterator to the input sequence of value items //! //! @param[out] d_values_out //! Random-access output iterator to the output sequence of value items //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in] num_items //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) //! //! @param[in] equality_op //! Binary functor that defines the equality of keys. //! Default is cub::Equality(). //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
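//!
//! A compact end-to-end sketch of the min-scan-by-key above, using Thrust
//! containers for the device allocations. Pairing CUB with
//! ``thrust::device_vector`` is an illustrative choice (any device-accessible
//! allocation works), and CUDA error checking is omitted for brevity:
//!
//! .. code-block:: c++
//!
//!    #include <cub/cub.cuh>
//!    #include <thrust/device_vector.h>
//!    #include <vector>
//!
//!    // CustomMin / CustomEqual as defined in the snippet above
//!    std::vector<int> h_keys   {0, 0, 1, 1, 1, 2, 2};
//!    std::vector<int> h_values {8, 6, 7, 5, 3, 0, 9};
//!
//!    thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
//!    thrust::device_vector<int> values(h_values.begin(), h_values.end());
//!    thrust::device_vector<int> result(h_values.size());
//!    int num_items = static_cast<int>(h_values.size());
//!
//!    // First call computes temp_storage_bytes; second call runs the scan
//!    void  *d_temp_storage     = nullptr;
//!    size_t temp_storage_bytes = 0;
//!    cub::DeviceScan::InclusiveScanByKey(
//!      d_temp_storage, temp_storage_bytes,
//!      thrust::raw_pointer_cast(keys.data()),
//!      thrust::raw_pointer_cast(values.data()),
//!      thrust::raw_pointer_cast(result.data()),
//!      CustomMin{}, num_items, CustomEqual{});
//!
//!    thrust::device_vector<char> temp(temp_storage_bytes);
//!    d_temp_storage = thrust::raw_pointer_cast(temp.data());
//!
//!    cub::DeviceScan::InclusiveScanByKey(
//!      d_temp_storage, temp_storage_bytes,
//!      thrust::raw_pointer_cast(keys.data()),
//!      thrust::raw_pointer_cast(values.data()),
//!      thrust::raw_pointer_cast(result.data()),
//!      CustomMin{}, num_items, CustomEqual{});
//!
//!    // result <-- [8, 6, 7, 5, 3, 0, 0]
//!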
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, int num_items, EqualityOpT equality_op = EqualityOpT(), cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey"); // Signed integer type for global offsets using OffsetT = int; return DispatchScanByKey< KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, ScanOpT, NullType, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, NullType(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanOpT scan_op, int num_items, EqualityOpT equality_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return InclusiveScanByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, scan_op, num_items, equality_op, stream); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_segmented_radix_sort.cuh000066400000000000000000002037631463375617100234640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort //! across multiple, non-overlapping sequences of data items residing within device-accessible memory. 
#pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceSegmentedRadixSort provides device-wide, parallel operations //! for computing a batched radix sort across multiple, non-overlapping //! sequences of data items residing within device-accessible memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The `radix sorting method `_ //! arranges items into ascending (or descending) order. The algorithm relies //! upon a positional representation for keys, i.e., each key is comprised of an //! ordered sequence of symbols (e.g., digits, characters, etc.) specified from //! least-significant to most-significant. For a given input sequence of keys //! and a set of rules specifying a total ordering of the symbolic alphabet, the //! radix sorting method produces a lexicographic ordering of those keys. //! //! See Also //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See //! that algorithm's documentation for more information. //! //! Segments are not required to be contiguous. Any element of input(s) or //! output(s) outside the specified segments will not be accessed nor modified. //! //! Usage Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @cdp_class{DeviceSegmentedRadixSort} //! //! @endrst struct DeviceSegmentedRadixSort { private: // Name reported for NVTX ranges _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* { return "cub::DeviceSegmentedRadixSort"; } public: //! @name Key-value pairs //! @{ //! @rst //! Sorts segments of key-value pairs into ascending order. (``~2N`` auxiliary storage required) //! //! - The contents of the input data are not altered by the sorting operation //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall //! not overlap ``[in, in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. //! //! .. code-block:: c++ //! //! 
#include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Device-accessible pointer to the corresponding input sequence of //! associated value items //! //! @param[out] d_values_out //! Device-accessible pointer to the correspondingly-reordered output //! sequence of associated value items //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! Random-access input iterator to the sequence of beginning offsets of //! length `num_segments`, such that `d_begin_offsets[i]` is the first //! element of the *i*th data segment in `d_keys_*` and `d_values_*` //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. If //! ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! 
comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @rst //! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required) //! //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits specified and the targeted device architecture). //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is //! specified as ``segment_offsets + 1``). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and yield //! a corresponding performance improvement. //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range //! ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, //! 
``d_values.Alternate()[i]`` will not be accessed nor modified. //! - @devicestorageP //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of `int` keys with associated vector of ``int`` values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! 
Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @rst //! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required). //! //! - The contents of the input data are not altered by the sorting operation //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is //! specified as ``segment_offsets + 1``). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and `out` be any of //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall //! not overlap ``[in, in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! 
(with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Device-accessible pointer to the corresponding input sequence of //! associated value items //! //! @param[out] d_values_out //! Device-accessible pointer to the correspondingly-reordered output //! sequence of associated value items //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! 
**[optional]** The least-significant bit index (inclusive) needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @rst //! Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required). //! //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the //! sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits specified and the targeted device architecture). //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is //! specified as ``segment_offsets + 1``). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range //! ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! 
- Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, //! ``d_values.Alternate()[i]`` will not be accessed nor modified. //! not to be modified. //! - @devicestorageP //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! 
length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @} end member group //! @name Keys-only //! @{ //! @rst //! Sorts segments of keys into ascending order. (``~2N`` auxiliary storage required) //! //! - The contents of the input data are not altered by the sorting operation //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter //! is specified as ``segment_offsets + 1``). //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap //! ``[d_keys_in, d_keys_in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not //! be accessed nor modified. //! - @devicestorage //! //! Snippet //! 
+++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of `int` keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
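//!
//! For reference, a fuller host-side sketch of the snippet above, including the
//! device allocations and copies that the snippet elides. The example data and
//! the omission of CUDA error checking are illustrative simplifications, not
//! requirements of the API:
//!
//! .. code-block:: c++
//!
//!    #include <cub/device/device_segmented_radix_sort.cuh>   // or <cub/cub.cuh>
//!
//!    int main()
//!    {
//!      int h_offsets[] = {0, 3, 3, 7};            // 3 segments, one of them empty
//!      int h_keys[]    = {8, 6, 7, 5, 3, 0, 9};
//!      int num_items    = 7;
//!      int num_segments = 3;
//!
//!      int *d_offsets, *d_keys_in, *d_keys_out;
//!      cudaMalloc(&d_offsets,  (num_segments + 1) * sizeof(int));
//!      cudaMalloc(&d_keys_in,  num_items * sizeof(int));
//!      cudaMalloc(&d_keys_out, num_items * sizeof(int));
//!      cudaMemcpy(d_offsets, h_offsets, (num_segments + 1) * sizeof(int), cudaMemcpyHostToDevice);
//!      cudaMemcpy(d_keys_in, h_keys,    num_items * sizeof(int),          cudaMemcpyHostToDevice);
//!
//!      // Size the temporary storage, then sort each segment independently.
//!      // The single offsets array is aliased for both begin and end offsets.
//!      void  *d_temp_storage     = nullptr;
//!      size_t temp_storage_bytes = 0;
//!      cub::DeviceSegmentedRadixSort::SortKeys(
//!        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
//!        num_items, num_segments, d_offsets, d_offsets + 1);
//!      cudaMalloc(&d_temp_storage, temp_storage_bytes);
//!      cub::DeviceSegmentedRadixSort::SortKeys(
//!        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
//!        num_items, num_segments, d_offsets, d_offsets + 1);
//!
//!      // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
//!
//!      cudaFree(d_offsets);  cudaFree(d_keys_in);
//!      cudaFree(d_keys_out); cudaFree(d_temp_storage);
//!      return 0;
//!    }
//!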
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; // Null value type DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @rst //! Sorts segments of keys into ascending order. (``~N`` auxiliary storage required). //! //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter //! is specified as ``segment_offsets + 1``). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. //! The range ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. //! - @devicestorageP //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``int`` keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! 
int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) //! needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; // Null value type DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @rst //! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required). //! //! - The contents of the input data are not altered by the sorting operation //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter //! is specified as ``segment_offsets + 1``). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap //! ``[d_keys_in, d_keys_in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not //! be accessed nor modified. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``int`` keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! 
cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., sizeof(unsigned int) * 8) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, false, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @rst //! Sorts segments of keys into descending order. (``~N`` auxiliary storage required). //! //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. //! The range ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. //! - @devicestorageP //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of `int` keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! 
int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! The total number of items within the segmented array, including items not //! covered by segments. `num_items` should match the largest element within //! the range `[d_end_offsets, d_end_offsets + num_segments)`. //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] begin_bit //! **[optional]** The least-significant bit index (inclusive) needed for key comparison //! //! @param[in] end_bit //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // Signed integer type for global offsets using OffsetT = int; // Null value type DoubleBuffer d_values; return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, true, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_segmented_reduce.cuh000066400000000000000000001221711463375617100225460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceSegmentedReduce provides device-wide, parallel operations //! for computing a batched reduction across multiple sequences of data //! items residing within device-accessible memory. 
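// The sketch below illustrates the two-phase invocation pattern shared by the
// DeviceSegmentedReduce entry points documented in this header: a first call with a
// null ``d_temp_storage`` that only computes ``temp_storage_bytes``, followed by the
// actual reduction. The device buffers ``d_in`` and ``d_out`` and the offset array
// ``d_offsets`` (length ``num_segments + 1``, aliased for both begin and end offsets)
// are assumed to have been allocated and initialized by the caller.
//
//   // Declare, allocate, and initialize device-accessible pointers
//   // for input, output, and segment offsets
//   int num_segments;   // e.g., 3
//   int *d_offsets;     // e.g., [0, 3, 3, 7]
//   int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
//   int *d_out;         // e.g., [-, -, -]
//   ...
//
//   // Determine temporary device storage requirements
//   void *d_temp_storage = nullptr;
//   size_t temp_storage_bytes = 0;
//   cub::DeviceSegmentedReduce::Sum(
//     d_temp_storage, temp_storage_bytes, d_in, d_out,
//     num_segments, d_offsets, d_offsets + 1);
//
//   // Allocate temporary storage
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//
//   // Run the segmented sum; d_out receives one aggregate per segment
//   // (empty segments produce the initial value, 0 for Sum)
//   cub::DeviceSegmentedReduce::Sum(
//     d_temp_storage, temp_storage_bytes, d_in, d_out,
//     num_segments, d_offsets, d_offsets + 1);
//
//   // d_out <-- [21, 0, 17]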
#pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceSegmentedReduce provides device-wide, parallel operations for //! computing a reduction across multiple sequences of data items //! residing within device-accessible memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! A `reduction `_ //! (or *fold*) uses a binary combining operator to compute a single aggregate //! from a sequence of input elements. //! //! Usage Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @cdp_class{DeviceSegmentedReduce} //! //! @endrst struct DeviceSegmentedReduce { private: template CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce( ::cuda::std::false_type, void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT initial_value, cudaStream_t stream); template CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce( ::cuda::std::true_type, void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT initial_value, cudaStream_t stream) { return DispatchSegmentedReduce< InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, ReductionOpT, Ts...>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream); } public: //! @rst //! Computes a device-wide segmented reduction using the specified //! binary ``reduction_op`` functor. //! //! - Does not support binary reduction operators that are non-commutative. //! - Provides "run-to-run" determinism for pseudo-associative reduction //! (e.g., addition of floating point types) on the same GPU device. //! However, results for pseudo-associative reduction may be inconsistent //! from one device to a another device of a different compute-capability //! because CUB can employ different tile-sizing for different architectures. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - Let ``s`` be in ``[0, num_segments)``. The range //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates a custom min-reduction of a device vector of ``int`` data elements. //! //! .. literalinclude:: ../../test/catch2_test_device_segmented_reduce_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-reduce-reduce //! :end-before: example-end segmented-reduce-reduce //! //! @endrst //! //! 
@tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @tparam ReductionOpT //! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam T //! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] reduction_op //! Binary reduction functor //! //! @param[in] initial_value //! Initial value of the reduction for each segment //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, T initial_value, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce"); // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); return segmented_reduce( integral_offset_check{}, d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, initial_value, // zero-initialize stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, T initial_value, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Reduce( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream); } //! @rst //! Computes a device-wide segmented sum using the addition (``+``) operator. //! //! - Uses ``0`` as the initial value of the reduction for each segment. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - Does not support ``+`` operators that are non-commutative. //! - Let ``s`` be in ``[0, num_segments)``. The range //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements. //! //! .. literalinclude:: ../../test/catch2_test_device_segmented_reduce_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-reduce-sum //! :end-before: example-end segmented-reduce-sum //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! 
@param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments`, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and //! ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum"); // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; // The output value type using OutputT = cub::detail::non_void_value_t>; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); return segmented_reduce( integral_offset_check{}, d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::Sum(), OutputT(), // zero-initialize stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Sum( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Computes a device-wide segmented minimum using the less-than (``<``) operator. //! //! - Uses ``std::numeric_limits::max()`` as the initial value of the reduction for each segment. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is //! specified as ``segment_offsets + 1``). //! - Does not support ``<`` operators that are non-commutative. //! - Let ``s`` be in ``[0, num_segments)``. The range //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements. //! //! .. literalinclude:: ../../test/catch2_test_device_segmented_reduce_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-reduce-custommin //! 
:end-before: example-end segmented-reduce-custommin //! //! .. literalinclude:: ../../test/catch2_test_device_segmented_reduce_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-reduce-min //! :end-before: example-end segmented-reduce-min //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min"); // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; // The input value type using InputT = cub::detail::value_t; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); return segmented_reduce( integral_offset_check{}, d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::Min(), Traits::Max(), // replace with // std::numeric_limits::max() // when C++11 support is // more prevalent stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Min( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! 
Finds the first device-wide minimum in each segment using the //! less-than (``<``) operator, also returning the in-segment index of that item. //! //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` //! (assuming the value type of ``d_in`` is ``T``) //! //! - The minimum of the *i*\ :sup:`th` segment is written to //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. //! - The ``{1, std::numeric_limits::max()}`` tuple is produced for zero-length inputs //! //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both //! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter //! is specified as ``segment_offsets + 1``). //! - Does not support ``<`` operators that are non-commutative. //! - Let ``s`` be in ``[0, num_segments)``. The range //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements. //! //! .. literalinclude:: ../../test/catch2_test_device_segmented_reduce_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-reduce-argmin //! :end-before: example-end segmented-reduce-argmin //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate //! (having value type `KeyValuePair`) @iterator //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin"); // Integer type for global offsets // Using common iterator value type is a breaking change, see: // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615 using OffsetT = int; // detail::common_iterator_value_t; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; // The output value type using OutputValueT = typename OutputTupleT::Value; using AccumT = OutputTupleT; using InitT = detail::reduce::empty_problem_init_t; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 InitT initial_value{AccumT(1, Traits::Max())}; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); return segmented_reduce( integral_offset_check{}, d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::ArgMin(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMin( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Computes a device-wide segmented maximum using the greater-than (``>``) operator. //! //! - Uses ``std::numeric_limits::lowest()`` as the initial value of the reduction. //! - When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - Does not support ``>`` operators that are non-commutative. //! - Let ``s`` be in ``[0, num_segments)``. The range //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements. //! //! .. literalinclude:: ../../test/catch2_test_device_segmented_reduce_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-reduce-max //! :end-before: example-end segmented-reduce-max //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate @iterator //! //! @tparam BeginOffsetIteratorT //! 
**[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max"); // Integer type for global offsets using OffsetT = detail::common_iterator_value_t; // The input value type using InputT = cub::detail::value_t; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); return segmented_reduce( integral_offset_check{}, d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::Max(), Traits::Lowest(), // replace with // std::numeric_limits::lowest() // when C++11 support is // more prevalent stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Max( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Finds the first device-wide maximum in each segment using the //! greater-than (``>``) operator, also returning the in-segment index of that item //! //! - The output value type of ``d_out`` is ``cub::KeyValuePair`` //! (assuming the value type of ``d_in`` is ``T``) //! //! - The maximum of the *i*\ :sup:`th` segment is written to //! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``. //! - The ``{1, std::numeric_limits::lowest()}`` tuple is produced for zero-length inputs //! //! 
- When input a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - Does not support ``>`` operators that are non-commutative. //! - Let ``s`` be in ``[0, num_segments)``. The range //! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not //! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)``. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the argmax-reduction of a device vector //! of `int` data elements. //! //! .. literalinclude:: ../../test/catch2_test_device_segmented_reduce_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-reduce-argmax //! :end-before: example-end segmented-reduce-argmax //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items //! (of some type `T`) @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Output iterator type for recording the reduced aggregate //! (having value type `KeyValuePair`) @iterator //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output aggregate //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length `num_segments`, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax"); // Integer type for global offsets // Using common iterator value type is a breaking change, see: // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615 using OffsetT = int; // detail::common_iterator_value_t; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = cub::detail::non_void_value_t>; using AccumT = OutputTupleT; using InitT = detail::reduce::empty_problem_init_t; // The output value type using OutputValueT = typename OutputTupleT::Value; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 InitT initial_value{AccumT(1, Traits::Lowest())}; using integral_offset_check = ::cuda::std::is_integral; static_assert(integral_offset_check::value, "Offset iterator value type should be integral."); return segmented_reduce( integral_offset_check{}, d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::ArgMax(), initial_value, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return ArgMax( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_segmented_sort.cuh000066400000000000000000003715221463375617100222740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceSegmentedSort provides device-wide, parallel operations for //! computing a batched sort across multiple, non-overlapping sequences of //! data items residing within device-accessible memory. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceSegmentedSort provides device-wide, parallel operations for //! computing a batched sort across multiple, non-overlapping sequences of //! data items residing within device-accessible memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The algorithm arranges items into ascending (or descending) order. //! The underlying sorting algorithm is undefined. Depending on the segment size, //! it might be radix sort, merge sort or something else. Therefore, no //! assumptions on the underlying implementation should be made. //! //! Differences from DeviceSegmentedRadixSort //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! DeviceSegmentedRadixSort is optimized for significantly large segments (tens //! of thousands of items and more). Nevertheless, some domains produce a wide //! range of segment sizes. DeviceSegmentedSort partitions segments into size //! groups and specialize sorting algorithms for each group. This approach leads //! to better resource utilization in the presence of segment size imbalance or //! moderate segment sizes (up to thousands of items). //! This algorithm is more complex and consists of multiple kernels. This fact //! leads to longer compilation times as well as larger binaries sizes. //! //! Supported Types //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The algorithm has to satisfy the underlying algorithms restrictions. Radix //! sort usage restricts the list of supported types. Therefore, //! DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types //! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half`` and //! ``__nv_bfloat16`` 16-bit floating-point types. //! //! Segments are not required to be contiguous. Any element of input(s) or //! output(s) outside the specified segments will not be accessed nor modified. //! //! A simple example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! 
int *d_values_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] //! //! @endrst struct DeviceSegmentedSort { private: // Name reported for NVTX ranges _CCCL_HOST_DEVICE static constexpr auto GetName() -> const char* { return "cub::DeviceSegmentedRadixSort"; } // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @name Keys-only //! @{ //! @rst //! Sorts segments of keys into ascending order. //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as `segment_offsets+1`). //! - SortKeys is not guaranteed to be stable. That is, suppose that ``i`` and //! ``j`` are equivalent: neither one is less than the other. It is not //! guaranteed that the relative order of these two elements will be //! preserved by sort. //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap //! ``[d_keys_in, d_keys_in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not //! be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``int`` keys. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible //! // pointers for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! 
void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
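//!
//! A minimal sketch of launching the same sort on a user-created stream,
//! reusing the pointers and the already-allocated temporary storage from the
//! snippet above:
//!
//! .. code-block:: c++
//!
//!    cudaStream_t stream;
//!    cudaStreamCreate(&stream);
//!
//!    // Run sorting operation on the user-created stream
//!    cub::DeviceSegmentedSort::SortKeys(
//!      d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
//!      num_items, num_segments, d_offsets, d_offsets + 1, stream);
//!
//!    // Wait for completion before using d_keys_out, then clean up
//!    cudaStreamSynchronize(stream);
//!    cudaStreamDestroy(stream);
//!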
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @rst //! Sorts segments of keys into descending order. Approximately //! ``num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is //! not guaranteed that the relative order of these two elements will be //! preserved by sort. //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap //! ``[d_keys_in, d_keys_in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not //! be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! 
// Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
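//!
//! A minimal sketch, reusing the pointers from the snippet above, of checking
//! the returned error code and releasing the temporary storage once it is no
//! longer needed:
//!
//! .. code-block:: c++
//!
//!    cudaError_t error = cub::DeviceSegmentedSort::SortKeysDescending(
//!      d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
//!      num_items, num_segments, d_offsets, d_offsets + 1);
//!
//!    if (error != cudaSuccess)
//!    {
//!      printf("SortKeysDescending failed: %s\n", cudaGetErrorString(error));
//!    }
//!
//!    // Free the temporary storage when it is no longer needed
//!    cudaFree(d_temp_storage);
//!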
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_values; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @rst //! Sorts segments of keys into ascending order. Approximately ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets +1``). //! - SortKeys is not guaranteed to be stable. That is, suppose that //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is //! not guaranteed that the relative order of these two elements will be //! preserved by sort. //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. //! The range ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! 
The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible //! // pointers for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no //! work is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
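//!
//! Because the sorted sequence may end up in either buffer of the
//! DoubleBuffer, the result should be read through ``d_keys.Current()``
//! rather than through ``d_key_buf`` directly. A minimal sketch, reusing the
//! variables from the snippet above:
//!
//! .. code-block:: c++
//!
//!    // After sorting, d_keys.selector identifies the buffer that holds
//!    // the sorted keys
//!    int *d_sorted_keys = d_keys.Current();
//!
//!    // d_sorted_keys aliases either d_key_buf or d_key_alt_buf
//!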
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_values; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @rst //! Sorts segments of keys into descending order. Approximately //! ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - SortKeysDescending is not guaranteed to be stable. That is, suppose that //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is //! not guaranteed that the relative order of these two elements will be //! preserved by sort. //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. //! The range ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! 
(with one zero-length segment) of ``i`` nt keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1<= d_begin_offsets[i]``, the ``i``-th segment is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
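//!
//! Segments do not have to cover the input contiguously. A minimal sketch,
//! assuming separate offset arrays ``d_begin_offsets`` and ``d_end_offsets``
//! in which element ``3`` lies outside every segment and is therefore neither
//! read nor written:
//!
//! .. code-block:: c++
//!
//!    int num_items;         // e.g., 7
//!    int num_segments;      // e.g., 2
//!    int *d_begin_offsets;  // e.g., [0, 4]
//!    int *d_end_offsets;    // e.g., [3, 7]
//!    int *d_key_buf;        // e.g., [8, 6, 7, 5, 3, 0, 9]
//!    int *d_key_alt_buf;    // e.g., [-, -, -, -, -, -, -]
//!    ...
//!
//!    cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
//!
//!    // Query temp_storage_bytes, allocate, and run as in the snippet above
//!    cub::DeviceSegmentedSort::SortKeysDescending(
//!      d_temp_storage, temp_storage_bytes, d_keys,
//!      num_items, num_segments, d_begin_offsets, d_end_offsets);
//!
//!    // d_keys.Current()[0..2] <-- [8, 7, 6]
//!    // d_keys.Current()[4..6] <-- [9, 3, 0]
//!    // element 3 of both buffers is left untouched
//!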
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of keys into ascending order. Approximately //! ``num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortKeys is stable: it preserves the relative ordering of //! equivalent elements. That is, if ``x`` and ``y`` are elements such that //! ``x`` precedes ``y``, and if the two elements are equivalent (neither //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that //! ``x`` still precedes ``y``. //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap //! ``[d_keys_in, d_keys_in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not //! be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortKeys( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! 
//! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of keys into descending order. //! Approximately ``num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortKeysDescending is stable: it preserves the relative ordering of //! equivalent elements. That is, if ``x`` and ``y`` are elements such that //! 
``x`` precedes ``y``, and if the two elements are equivalent (neither ``x < y`` nor ``y < x``) //! then a postcondition of stable sort is that ``x`` still precedes ``y``. //! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap //! ``[d_keys_in, d_keys_in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not //! be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and //! ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! 
@param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of keys into ascending order. //! Approximately ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortKeys is stable: it preserves the relative ordering of //! equivalent elements. That is, if ``x`` and ``y`` are elements such that //! ``x`` precedes ``y``, and if the two elements are equivalent (neither //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that //! ``x`` still precedes ``y``. //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. //! The range ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! 
int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortKeys( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
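//!
//! If the sorted keys are needed in the original allocation, they can be
//! copied back whenever the result ends up in the alternate buffer. A minimal
//! sketch, reusing the variables from the snippet above:
//!
//! .. code-block:: c++
//!
//!    // Run sorting operation (as in the snippet above)
//!    cub::DeviceSegmentedSort::StableSortKeys(
//!      d_temp_storage, temp_storage_bytes, d_keys,
//!      num_items, num_segments, d_offsets, d_offsets + 1);
//!
//!    // Copy the result back into d_key_buf if it landed in d_key_alt_buf
//!    if (d_keys.Current() != d_key_buf)
//!    {
//!      cudaMemcpy(d_key_buf, d_keys.Current(),
//!                 num_items * sizeof(int), cudaMemcpyDeviceToDevice);
//!    }
//!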
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeys( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of keys into descending order. //! Approximately ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). //! - The contents of both buffers may be altered by the sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within the DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortKeysDescending is stable: it preserves the relative ordering of //! equivalent elements. That is, if ``x`` and ``y`` are elements such that //! ``x`` precedes ``y``, and if the two elements are equivalent (neither //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that //! ``x`` still precedes ``y``. //! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``. //! The range ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ```i` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_keys[i].Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a DoubleBuffer to wrap the pair of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! //! 
// Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and //! ``d_values_*``. If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the //! ``i``-th segment is considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
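//!
//! The temporary storage does not have to come from ``cudaMalloc``; any
//! device-accessible allocation of the queried size works. A minimal sketch
//! that replaces the ``cudaMalloc`` call from the snippet above with
//! ``cub::CachingDeviceAllocator`` (assuming a ``g_allocator`` instance):
//!
//! .. code-block:: c++
//!
//!    #include <cub/util_allocator.cuh>
//!
//!    cub::CachingDeviceAllocator g_allocator;
//!
//!    // Query temp_storage_bytes as in the snippet above, then allocate
//!    g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes);
//!
//!    // Run sorting operation
//!    cub::DeviceSegmentedSort::StableSortKeysDescending(
//!      d_temp_storage, temp_storage_bytes, d_keys,
//!      num_items, num_segments, d_offsets, d_offsets + 1);
//!
//!    // Return the allocation to the caching allocator
//!    g_allocator.DeviceFree(d_temp_storage);
//!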
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortKeysDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortKeysDescending( d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @} end member group //! @name Key-value pairs //! @{ //! @rst //! Sorts segments of key-value pairs into ascending order. //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and //! ``j`` are equivalent: neither one is less than the other. It is not //! guaranteed that the relative order of these two elements will be //! preserved by sort. //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall //! not overlap ``[in, in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys with associated vector of //! ``i`` nt values. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! 
// Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Device-accessible pointer to the corresponding input sequence of //! associated value items //! //! @param[out] d_values_out //! Device-accessible pointer to the correspondingly-reordered output //! sequence of associated value items //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i]-1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
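//!
//! When the values are initialized to the element indices, the sorted values
//! give, per segment, the permutation that orders the keys. A minimal sketch,
//! assuming Thrust is available for filling the index sequence:
//!
//! .. code-block:: c++
//!
//!    #include <thrust/sequence.h>
//!    #include <thrust/execution_policy.h>
//!
//!    // Fill d_values_in with 0, 1, 2, ..., num_items - 1
//!    thrust::sequence(thrust::device, d_values_in, d_values_in + num_items);
//!
//!    // Query temp_storage_bytes, allocate, and run as in the snippet above
//!    cub::DeviceSegmentedSort::SortPairs(
//!      d_temp_storage, temp_storage_bytes,
//!      d_keys_in, d_keys_out, d_values_in, d_values_out,
//!      num_items, num_segments, d_offsets, d_offsets + 1);
//!
//!    // d_values_out now holds, for each segment, the original indices of
//!    // the keys in sorted order
//!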
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = false; using DispatchT = DispatchSegmentedSort; DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and //! ``j`` are equivalent: neither one is less than the other. It is not //! guaranteed that the relative order of these two elements will be //! preserved by sort. //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall //! not overlap ``[in, in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! 
(with one zero-length segment) of ``int`` keys with associated vector of //! ``int`` values. //! //! .. code-block:: c++ //! //! #include <cub/cub.cuh> //! // or equivalently <cub/device/device_segmented_sort.cuh> //! //! // Declare, allocate, and initialize device-accessible pointers for //! // sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Device-accessible pointer to the corresponding input sequence of //! associated value items //! //! @param[out] d_values_out //! Device-accessible pointer to the correspondingly-reordered output //! sequence of associated value items //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //!
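// ---------------------------------------------------------------------------
// Illustrative note (not part of the original header): when the segments are
// contiguous, the single offsets array used above can be built from
// per-segment sizes with an exclusive prefix sum, so that segment i spans
// [d_offsets[i], d_offsets[i + 1]). A minimal host-side sketch with the
// hypothetical sizes {3, 0, 4} matching the snippet's [0, 3, 3, 7]:
//
//   std::vector<int> sizes = {3, 0, 4};             // one entry per segment
//   std::vector<int> offsets(sizes.size() + 1, 0);  // becomes [0, 3, 3, 7]
//   for (std::size_t i = 0; i < sizes.size(); ++i)
//   {
//     offsets[i + 1] = offsets[i] + sizes[i];
//   }
//   // Copy `offsets` to device memory (d_offsets) and pass d_offsets and
//   // d_offsets + 1 as d_begin_offsets / d_end_offsets, as in the snippet.
// ---------------------------------------------------------------------------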
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = false; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @rst //! Sorts segments of key-value pairs into ascending order. //! Approximately ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the sorting //! operation. //! - Upon completion, the sorting operation will update the "current" indicator //! within each DoubleBuffer wrapper to reference which of the two buffers //! now contains the sorted output sequence (a function of the number of key bits //! specified and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and //! ``j`` are equivalent: neither one is less than the other. It is not //! guaranteed that the relative order of these two elements will be //! preserved by sort. //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range //! ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! 
``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, //! ``d_values.Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys with associated vector of //! ``i`` nt values. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer contains //! the unsorted input values and, upon return, is updated to point to the //! sorted output values //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! 
@rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } private: // Internal version without NVTX range template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { constexpr bool is_descending = true; constexpr bool is_overwrite_okay = true; using DispatchT = DispatchSegmentedSort; return DispatchT::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } public: //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the //! sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits specified and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - SortPairsDescending is not guaranteed to be stable. That is, suppose that //! ``i`` and ``j`` are equivalent: neither one is less than the other. It is //! not guaranteed that the relative order of these two elements will be //! preserved by sort. //! 
- Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range //! ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, //! ``d_values.Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys with associated vector of //! ``i`` nt values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for //! // sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::SortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer contains //! the unsorted input values and, upon return, is updated to point to the //! sorted output values //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! 
The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of key-value pairs into ascending order. //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortPairs is stable: it preserves the relative ordering of //! equivalent elements. That is, if ``x`` and ``y`` are elements such that //! ``x`` precedes ``y``, and if the two elements are equivalent (neither //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that //! ``x`` still precedes ``y``. //! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall //! not overlap ``[in, in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! 
(with one zero-length segment) of ``i`` nt keys with associated vector of //! ``i`` nt values. //! //! .. code-block:: c++ //! //! #include //! // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortPairs( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] //! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When nullptr, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Device-accessible pointer to the corresponding input sequence of //! associated value items //! //! @param[out] d_values_out //! Device-accessible pointer to the correspondingly-reordered output //! sequence of associated value items //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
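// ---------------------------------------------------------------------------
// Illustrative note (not part of the original header): stability only matters
// when keys compare equal. With the hypothetical per-segment input
//
//   keys   = [5, 5, 3]
//   values = [0, 1, 2]
//
// a stable ascending sort must produce keys [3, 5, 5] and values [2, 0, 1]:
// the two equal keys keep their original relative order (value 0 before
// value 1), which the unstable SortPairs above is not required to guarantee.
// ---------------------------------------------------------------------------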
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required. //! //! - The contents of the input data are not altered by the sorting operation. //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortPairsDescending is stable: it preserves the relative ordering //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that //! ``x`` precedes ``y``, and if the two elements are equivalent (neither //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that //! ``x`` still precedes ``y``. //! - Let `in` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of //! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall //! not overlap ``[in, in + num_items)``, //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``, //! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys with associated vector of //! ``i`` nt values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_keys_out; // e.g., [-, -, -, -, -, -, -] //! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_values_out; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortPairsDescending( //! 
d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortPairsDescending( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_keys_out, d_values_in, d_values_out, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] //! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Device-accessible pointer to the input data of key data to sort //! //! @param[out] d_keys_out //! Device-accessible pointer to the sorted output sequence of key data //! //! @param[in] d_values_in //! Device-accessible pointer to the corresponding input sequence of //! associated value items //! //! @param[out] d_values_out //! Device-accessible pointer to the correspondingly-reordered output //! sequence of associated value items //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
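// ---------------------------------------------------------------------------
// Illustrative note (not part of the original header): with the offsets
// [0, 3, 3, 7] used throughout these snippets, segment 1 is treated as empty
// under the convention stated above (d_end_offsets[1] - 1 == 2 is not greater
// than d_begin_offsets[1] == 3), so none of its elements are read or written.
// A hypothetical host-side helper mirroring that convention:
//
//   bool segment_is_empty(const int* offsets, int i)
//   {
//     return offsets[i + 1] - 1 <= offsets[i];
//   }
// ---------------------------------------------------------------------------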
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of key-value pairs into ascending order. //! Approximately ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the //! sorting operation. //! - Upon completion, the sorting operation will update the "current" //! indicator within each DoubleBuffer wrapper to reference which of the two //! buffers now contains the sorted output sequence (a function of the number //! of key bits specified and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortPairs is stable: it preserves the relative ordering //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that //! ``x`` precedes `y`, and if the two elements are equivalent (neither //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that //! ``x`` still precedes ``y``. //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range //! ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, //! ``d_values.Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys with associated vector of //! 
``int`` values. //! //! .. code-block:: c++ //! //! #include <cub/cub.cuh> //! // or equivalently <cub/device/device_segmented_sort.cuh> //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortPairs( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] //! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer contains //! the unsorted input values and, upon return, is updated to point to the //! sorted output values //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //!
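// ---------------------------------------------------------------------------
// Illustrative note (not part of the original header): with the DoubleBuffer
// overloads the sorted data ends up in whichever buffer each DoubleBuffer
// designates as "current" after the call, so results should be read through
// Current() rather than through a fixed raw pointer. A minimal sketch,
// reusing the hypothetical buffers from the snippet above (h_keys and
// h_values are hypothetical host arrays of num_items elements):
//
//   // ... after the second StableSortPairs call has completed ...
//   int* sorted_keys   = d_keys.Current();   // d_key_buf or d_key_alt_buf
//   int* sorted_values = d_values.Current(); // d_value_buf or d_value_alt_buf
//   cudaMemcpy(h_keys, sorted_keys, num_items * sizeof(int),
//              cudaMemcpyDeviceToHost);
//   cudaMemcpy(h_values, sorted_values, num_items * sizeof(int),
//              cudaMemcpyDeviceToHost);
// ---------------------------------------------------------------------------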
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @rst //! Sorts segments of key-value pairs into descending order. //! Approximately ``2 * num_segments`` auxiliary storage required. //! //! - The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). //! - The contents of both buffers within each pair may be altered by the sorting //! operation. //! - Upon completion, the sorting operation will update the "current" indicator //! within each DoubleBuffer wrapper to reference which of the two buffers //! now contains the sorted output sequence (a function of the number of key bits //! specified and the targeted device architecture). //! - When the input is a contiguous sequence of segments, a single sequence //! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased //! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where //! the latter is specified as ``segment_offsets + 1``). //! - StableSortPairsDescending is stable: it preserves the relative ordering //! of equivalent elements. That is, if ``x`` and ``y`` are elements such that //! ``x`` precedes ``y``, and if the two elements are equivalent (neither //! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that //! ``x`` still precedes ``y``. //! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt`` //! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range //! ``[cur, cur + num_items)`` shall not overlap //! ``[alt, alt + num_items)``. Both ranges shall not overlap //! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor //! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way. //! - Segments are not required to be contiguous. For all index values ``i`` //! outside the specified segments ``d_keys.Current()[i]``, //! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``, //! ``d_values.Alternate()[i]`` will not be accessed nor modified. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the batched sorting of three segments //! (with one zero-length segment) of ``i`` nt keys with associated vector of //! ``i`` nt values. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! 
int num_items; // e.g., 7 //! int num_segments; // e.g., 3 //! int *d_offsets; // e.g., [0, 3, 3, 7] //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] //! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] //! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] //! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] //! ... //! //! // Create a set of DoubleBuffers to wrap pairs of device pointers //! cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); //! cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSegmentedSort::StableSortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run sorting operation //! cub::DeviceSegmentedSort::StableSortPairsDescending( //! d_temp_storage, temp_storage_bytes, d_keys, d_values, //! num_items, num_segments, d_offsets, d_offsets + 1); //! //! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] //! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] //! //! @endrst //! //! @tparam KeyT //! **[inferred]** Key type //! //! @tparam ValueT //! **[inferred]** Value type //! //! @tparam BeginOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! beginning offsets @iterator //! //! @tparam EndOffsetIteratorT //! **[inferred]** Random-access input iterator type for reading segment //! ending offsets @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work //! is done //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_keys //! Reference to the double-buffer of keys whose "current" device-accessible //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! //! @param[in,out] d_values //! Double-buffer of values whose "current" device-accessible buffer contains //! the unsorted input values and, upon return, is updated to point to the //! sorted output values //! //! @param[in] num_items //! The total number of items to sort (across all segments) //! //! @param[in] num_segments //! The number of segments that comprise the sorting data //! //! @param[in] d_begin_offsets //! @rst //! Random-access input iterator to the sequence of beginning offsets of //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*`` //! @endrst //! //! @param[in] d_end_offsets //! @rst //! Random-access input iterator to the sequence of ending offsets of length //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the ``i``-th segment is //! considered empty. //! @endrst //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
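// ---------------------------------------------------------------------------
// Illustrative note (not part of the original header): the DoubleBuffer
// overloads trade caller-provided "alternate" buffers for a smaller internal
// temporary allocation (approximately 2 * num_segments instead of
// 2 * num_items + 2 * num_segments, per the documentation above). A minimal
// sketch of setting up the hypothetical alternate buffers from the snippet:
//
//   int* d_key_alt_buf   = nullptr;
//   int* d_value_alt_buf = nullptr;
//   cudaMalloc(&d_key_alt_buf, num_items * sizeof(int));
//   cudaMalloc(&d_value_alt_buf, num_items * sizeof(int));
//   cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
//   cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
// ---------------------------------------------------------------------------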
@endrst template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); return SortPairsDescendingNoNVTX( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return StableSortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, stream); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_select.cuh000066400000000000000000001437561463375617100205370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceSelect provides device-wide, parallel operations for //! compacting selected items from sequences of data items residing within //! device-accessible memory. 
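// ---------------------------------------------------------------------------
// Illustrative note (not part of the original header): "compacting" here
// means the selected items are written contiguously to the output while
// keeping their original relative order. A sequential host-side reference
// model of the flag-based selection documented below (illustration only,
// hypothetical helper name):
//
//   template <typename T, typename FlagT>
//   int select_flagged_reference(const T* in, const FlagT* flags, T* out, int n)
//   {
//     int num_selected = 0;
//     for (int i = 0; i < n; ++i)
//     {
//       if (flags[i]) // flag values are treated as booleans
//       {
//         out[num_selected++] = in[i];
//       }
//     }
//     return num_selected;
//   }
// ---------------------------------------------------------------------------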
#pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceSelect provides device-wide, parallel operations for compacting //! selected items from sequences of data items residing within //! device-accessible memory. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! These operations apply a selection criterion to selectively copy //! items from a specified input sequence to a compact output sequence. //! //! Usage Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @cdp_class{DeviceSelect} //! //! Performance //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @linear_performance{select-flagged, select-if, and select-unique} //! //! @endrst struct DeviceSelect { //! @rst //! Uses the ``d_flags`` sequence to selectively copy the corresponding items from ``d_in`` into ``d_out``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). //! - Copies of the selected items are compacted into ``d_out`` and maintain their original relative ordering. //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap ``[d_in, d_in + num_items)``, //! | ``[d_flags, d_flags + num_items)`` nor ``d_num_selected_out`` in any way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for input, //! // flags, and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] //! int *d_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSelect::Flagged( //! d_temp_storage, temp_storage_bytes, //! d_in, d_flags, d_out, d_num_selected_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DeviceSelect::Flagged( //! d_temp_storage, temp_storage_bytes, //! d_in, d_flags, d_out, d_num_selected_out, num_items); //! //! // d_out <-- [1, 4, 6, 7] //! // d_num_selected_out <-- [4] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam FlagIterator //! **[inferred]** Random-access input iterator type for reading selection flags @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected items @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! 
@param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[in] d_flags //! Pointer to the input sequence of selection flags //! //! @param[out] d_out //! Pointer to the output sequence of selected data items //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected (i.e., length of `d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged"); using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) return DispatchSelectIf< InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, SelectOp(), EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Flagged( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream); } //! @rst //! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.). //! - Copies of the selected items are compacted in-place and maintain their original relative ordering. //! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap //! | ``[d_flags, d_flags + num_items)`` in any other way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for input, //! // flags, and output //! int num_items; // e.g., 8 //! int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] //! char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] //! int *d_num_selected_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSelect::Flagged( //! d_temp_storage, temp_storage_bytes, //! d_in, d_flags, d_num_selected_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DeviceSelect::Flagged( //! d_temp_storage, temp_storage_bytes, //! 
d_in, d_flags, d_num_selected_out, num_items); //! //! // d_data <-- [1, 4, 6, 7] //! // d_num_selected_out <-- [4] //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access iterator type for reading and writing selected items @iterator //! //! @tparam FlagIterator //! **[inferred]** Random-access input iterator type for reading selection flags @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_data //! Pointer to the sequence of data items //! //! @param[in] d_flags //! Pointer to the input sequence of selection flags //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_data`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Flagged"); using OffsetT = int; // Signed integer type for global offsets using SelectOp = NullType; // Selection op (not used) using EqualityOp = NullType; // Equality operator (not used) constexpr bool may_alias = true; return DispatchSelectIf< IteratorT, FlagIterator, IteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false, may_alias>::Dispatch(d_temp_storage, temp_storage_bytes, d_data, // in d_flags, d_data, // out d_num_selected_out, SelectOp(), EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Flagged( d_temp_storage, temp_storage_bytes, d_data, d_flags, d_num_selected_out, num_items, stream); } //! @rst //! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - Copies of the selected items are compacted into ``d_out`` and maintain //! their original relative ordering. //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap //! | ``[d_in, d_in + num_items)`` nor ``d_num_selected_out`` in any way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Functor type for selecting values less than some criteria //! struct LessThan //! { //! int compare; //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! LessThan(int compare) : compare(compare) {} //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! 
bool operator()(const int &a) const { //! return (a < compare); //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] //! int *d_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ ] //! LessThan select_op(7); //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSelect::If( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, d_num_selected_out, num_items, select_op); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DeviceSelect::If( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, d_num_selected_out, num_items, select_op); //! //! // d_out <-- [0, 2, 3, 5, 2] //! // d_num_selected_out <-- [5] //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected items @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @tparam SelectOp //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output sequence of selected data items //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected //! (i.e., length of `d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] select_op //! Unary selection operator //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If"); using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) return DispatchSelectIf< InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, NULL, d_out, d_num_selected_out, select_op, EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream); } //! @rst //! Uses the ``select_op`` functor to selectively compact items in ``d_data``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - | Copies of the selected items are compacted in ``d_data`` and maintain //! | their original relative ordering. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Functor type for selecting values less than some criteria //! struct LessThan //! { //! int compare; //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! LessThan(int compare) : compare(compare) {} //! //! CUB_RUNTIME_FUNCTION __forceinline__ //! bool operator()(const int &a) const { //! return (a < compare); //! } //! }; //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 8 //! int *d_data; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] //! int *d_num_selected_out; // e.g., [ ] //! LessThan select_op(7); //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSelect::If( //! d_temp_storage, temp_storage_bytes, //! d_data, d_num_selected_out, num_items, select_op); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DeviceSelect::If( //! d_temp_storage, temp_storage_bytes, //! d_data, d_num_selected_out, num_items, select_op); //! //! // d_data <-- [0, 2, 3, 5, 2] //! // d_num_selected_out <-- [5] //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading and writing items @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @tparam SelectOp //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. 
//! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_data //! Pointer to the sequence of data items //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_data`) //! //! @param[in] select_op //! Unary selection operator //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::If"); using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using EqualityOp = NullType; // Equality operator (not used) constexpr bool may_alias = true; return DispatchSelectIf< IteratorT, FlagIterator, IteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false, may_alias>::Dispatch(d_temp_storage, temp_storage_bytes, d_data, // in NULL, d_data, // out d_num_selected_out, select_op, EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t If(void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return If( d_temp_storage, temp_storage_bytes, d_data, d_num_selected_out, num_items, select_op, stream); } //! @rst //! Uses the ``select_op`` functor applied to ``d_flags`` to selectively copy the //! corresponding items from ``d_in`` into ``d_out``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - The expression ``select_op(flag)`` must be convertible to ``bool``, //! where the type of ``flag`` corresponds to the value type of ``FlagIterator``. //! - Copies of the selected items are compacted into ``d_out`` and maintain //! their original relative ordering. //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap //! | ``[d_in, d_in + num_items)`` nor ``d_num_selected_out`` in any way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_select_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-select-iseven //! :end-before: example-end segmented-select-iseven //! //! .. literalinclude:: ../../test/catch2_test_device_select_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-select-flaggedif //! :end-before: example-end segmented-select-flaggedif //! //! @endrst //! //! @tparam InputIteratorT //! **[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam FlagIterator //! **[inferred]** Random-access input iterator type for reading selection flags @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected items @iterator //! //! @tparam NumSelectedIteratorT //! 
**[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @tparam SelectOp //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[in] d_flags //! Pointer to the input sequence of selection flags //! //! @param[out] d_out //! Pointer to the output sequence of selected data items //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected //! (i.e., length of `d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] select_op //! Unary selection operator //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t FlaggedIf( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagIterator d_flags, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf"); using OffsetT = int; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) return DispatchSelectIf< InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, select_op, EqualityOp(), num_items, stream); } //! @rst //! Uses the ``select_op`` functor applied to ``d_flags`` to selectively compact the //! corresponding items in ``d_data``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - The expression ``select_op(flag)`` must be convertible to ``bool``, //! where the type of ``flag`` corresponds to the value type of ``FlagIterator``. //! - Copies of the selected items are compacted in-place and maintain their original relative ordering. //! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap //! | ``[d_flags, d_flags + num_items)`` in any other way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. literalinclude:: ../../test/catch2_test_device_select_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-select-iseven //! :end-before: example-end segmented-select-iseven //! //! .. literalinclude:: ../../test/catch2_test_device_select_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin segmented-select-flaggedif-inplace //! :end-before: example-end segmented-select-flaggedif-inplace //! //! @endrst //! //! @tparam IteratorT //! **[inferred]** Random-access iterator type for reading and writing selected items @iterator //! //! @tparam FlagIterator //! **[inferred]** Random-access input iterator type for reading selection flags @iterator //! //! @tparam NumSelectedIteratorT //! 
**[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @tparam SelectOp //! **[inferred]** Selection operator type having member `bool operator()(const T &a)` //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in,out] d_data //! Pointer to the sequence of data items //! //! @param[in] d_flags //! Pointer to the input sequence of selection flags //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_data`) //! //! @param[in] select_op //! Unary selection operator //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t FlaggedIf( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, int num_items, SelectOp select_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::FlaggedIf"); using OffsetT = int; // Signed integer type for global offsets using EqualityOp = NullType; // Equality operator (not used) constexpr bool may_alias = true; return DispatchSelectIf< IteratorT, FlagIterator, IteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false, may_alias>::Dispatch(d_temp_storage, temp_storage_bytes, d_data, // in d_flags, d_data, // out d_num_selected_out, select_op, EqualityOp(), num_items, stream); } //! @rst //! Given an input sequence ``d_in`` having runs of consecutive equal-valued keys, //! only the first key from each run is selectively copied to ``d_out``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - The ``==`` equality operator is used to determine whether keys are equivalent //! - Copies of the selected items are compacted into ``d_out`` and maintain their original relative ordering. //! - | The range ``[d_out, d_out + *d_num_selected_out)`` shall not overlap //! | ``[d_in, d_in + num_items)`` nor ``d_num_selected_out`` in any way. //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 8 //! int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] //! int *d_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSelect::Unique( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, d_num_selected_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DeviceSelect::Unique( //! d_temp_storage, temp_storage_bytes, //! d_in, d_out, d_num_selected_out, num_items); //! //! // d_out <-- [0, 2, 9, 5, 8] //! // d_num_selected_out <-- [5] //! //! @endrst //! //! @tparam InputIteratorT //! 
**[inferred]** Random-access input iterator type for reading input items @iterator //! //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected items @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_in //! Pointer to the input sequence of data items //! //! @param[out] d_out //! Pointer to the output sequence of selected data items //! //! @param[out] d_num_selected_out //! Pointer to the output total number of items selected //! (i.e., length of `d_out`) //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::Unique"); using OffsetT = int; // Signed integer type for global offsets using FlagIterator = NullType*; // FlagT iterator type (not used) using SelectOp = NullType; // Selection op (not used) using EqualityOp = Equality; // Default == operator return DispatchSelectIf< InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(d_temp_storage, temp_storage_bytes, d_in, NULL, d_out, d_num_selected_out, SelectOp(), EqualityOp(), num_items, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Unique( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, NumSelectedIteratorT d_num_selected_out, int num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Unique( d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream); } //! @rst //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive //! equal-valued keys, only the first key and its value from each run is selectively copied //! to ``d_keys_out`` and ``d_values_out``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent //! - Copies of the selected items are compacted into ``d_out`` and maintain //! their original relative ordering. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! - ``[d_keys_in, d_keys_in + num_items)`` //! - ``[d_keys_out, d_keys_out + *d_num_selected_out)`` //! - ``[d_values_in, d_values_in + num_items)`` //! - ``[d_values_out, d_values_out + *d_num_selected_out)`` //! - ``[d_num_selected_out, d_num_selected_out + 1)`` //! //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! 
The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 8 //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] //! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] //! int *d_keys_out; // e.g., [ , , , , , , , ] //! int *d_values_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSelect::UniqueByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, //! d_keys_out, d_values_out, d_num_selected_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DeviceSelect::UniqueByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, //! d_keys_out, d_values_out, d_num_selected_out, num_items); //! //! // d_keys_out <-- [0, 2, 9, 5, 8] //! // d_values_out <-- [1, 2, 4, 5, 8] //! // d_num_selected_out <-- [5] //! //! @endrst //! //! @tparam KeyInputIteratorT //! **[inferred]** Random-access input iterator type for reading input keys @iterator //! //! @tparam ValueInputIteratorT //! **[inferred]** Random-access input iterator type for reading input values @iterator //! //! @tparam KeyOutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected keys @iterator //! //! @tparam ValueOutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected values @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam EqualityOpT //! **[inferred]** Type of equality_op //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input sequence of keys //! //! @param[in] d_values_in //! Pointer to the input sequence of values //! //! @param[out] d_keys_out //! Pointer to the output sequence of selected keys //! //! @param[out] d_values_out //! Pointer to the output sequence of selected values //! //! @param[out] d_num_selected_out //! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`) //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`) //! //! @param[in] equality_op //! Binary predicate to determine equality //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! 
@endrst template CUB_RUNTIME_FUNCTION __forceinline__ static // typename ::cuda::std::enable_if< // !::cuda::std::is_convertible::value, // cudaError_t>::type UniqueByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, EqualityOpT equality_op, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSelect::UniqueByKey"); using OffsetT = detail::choose_offset_t; return DispatchUniqueByKey< KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, NumSelectedIteratorT, EqualityOpT, OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, equality_op, static_cast(num_items), stream); } //! @rst //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive //! equal-valued keys, only the first key and its value from each run is selectively copied //! to ``d_keys_out`` and ``d_values_out``. //! The total number of items selected is written to ``d_num_selected_out``. //! //! - The ``==`` equality operator is used to determine whether keys are equivalent //! - Copies of the selected items are compacted into ``d_out`` and maintain //! their original relative ordering. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: //! //! - ``[d_keys_in, d_keys_in + num_items)`` //! - ``[d_keys_out, d_keys_out + *d_num_selected_out)`` //! - ``[d_values_in, d_values_in + num_items)`` //! - ``[d_values_out, d_values_out + *d_num_selected_out)`` //! - ``[d_num_selected_out, d_num_selected_out + 1)`` //! //! - @devicestorage //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers //! // for input and output //! int num_items; // e.g., 8 //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] //! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] //! int *d_keys_out; // e.g., [ , , , , , , , ] //! int *d_values_out; // e.g., [ , , , , , , , ] //! int *d_num_selected_out; // e.g., [ ] //! ... //! //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSelect::UniqueByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, //! d_keys_out, d_values_out, d_num_selected_out, num_items); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run selection //! cub::DeviceSelect::UniqueByKey( //! d_temp_storage, temp_storage_bytes, //! d_keys_in, d_values_in, //! d_keys_out, d_values_out, d_num_selected_out, num_items); //! //! // d_keys_out <-- [0, 2, 9, 5, 8] //! // d_values_out <-- [1, 2, 4, 5, 8] //! // d_num_selected_out <-- [5] //! //! @endrst //! //! @tparam KeyInputIteratorT //! **[inferred]** Random-access input iterator type for reading input keys @iterator //! //! @tparam ValueInputIteratorT //! **[inferred]** Random-access input iterator type for reading input values @iterator //! //! @tparam KeyOutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected keys @iterator //! //! 
@tparam ValueOutputIteratorT //! **[inferred]** Random-access output iterator type for writing selected values @iterator //! //! @tparam NumSelectedIteratorT //! **[inferred]** Output iterator type for recording the number of items selected @iterator //! //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_keys_in //! Pointer to the input sequence of keys //! //! @param[in] d_values_in //! Pointer to the input sequence of values //! //! @param[out] d_keys_out //! Pointer to the output sequence of selected keys //! //! @param[out] d_values_out //! Pointer to the output sequence of selected values //! //! @param[out] d_num_selected_out //! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`) //! //! @param[in] num_items //! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`) //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, cudaStream_t stream = 0) { return UniqueByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, Equality{}, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return UniqueByKey( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/device_spmv.cuh000066400000000000000000000213151463375617100202270ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file cub::DeviceSpmv provides device-wide parallel operations for performing //! sparse-matrix * vector multiplication (SpMV). #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! DeviceSpmv provides device-wide parallel operations for performing //! sparse-matrix * dense-vector multiplication (SpMV). //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The `SpMV computation `_ //! performs the matrix-vector operation ``y = A * x + y``, where: //! //! - ``A`` is an ``m * n`` sparse matrix whose non-zero structure is specified in //! `compressed-storage-row (CSR) format //! `_ (i.e., three arrays: //! ``values``, ``row_offsets``, and ``column_indices``) //! - ``x`` and ``y`` are dense vectors //! //! Usage Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @cdp_class{DeviceSpmv} //! //! @endrst struct DeviceSpmv { //! @name CSR matrix operations //! @{ //! @rst //! This function performs the matrix-vector operation ``y = A*x``. //! //! Snippet //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The code snippet below illustrates SpMV upon a 9x9 CSR matrix ``A`` representing a 3x3 lattice (24 non-zeros). //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // Declare, allocate, and initialize device-accessible pointers for input matrix A, input //! vector x, //! // and output vector y //! int num_rows = 9; //! int num_cols = 9; //! int num_nonzeros = 24; //! //! float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, //! // 1, 1, 1, 1, 1, 1, 1, 1, //! // 1, 1, 1, 1, 1, 1, 1, 1] //! //! int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, //! // 4, 6, 1, 3, 5, 7, 2, 4, //! // 8, 3, 7, 4, 6, 8, 5, 7] //! //! int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] //! //! float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] //! float* d_vector_y; // e.g., [ , , , , , , , , ] //! ... //! //! // Determine temporary device storage requirements //! void* d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, //! d_row_offsets, d_column_indices, d_vector_x, d_vector_y, //! num_rows, num_cols, num_nonzeros); //! //! // Allocate temporary storage //! cudaMalloc(&d_temp_storage, temp_storage_bytes); //! //! // Run SpMV //! cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, //! 
d_row_offsets, d_column_indices, d_vector_x, d_vector_y, //! num_rows, num_cols, num_nonzeros); //! //! // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] //! //! @endrst //! //! @tparam ValueT //! **[inferred]** Matrix and vector value type (e.g., `float`, `double`, etc.) //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When NULL, the required allocation size is written to `temp_storage_bytes` and no work is done. //! //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! //! @param[in] d_values //! Pointer to the array of `num_nonzeros` values of the corresponding nonzero elements //! of matrix `A`. //! //! @param[in] d_row_offsets //! Pointer to the array of `m + 1` offsets demarcating the start of every row in //! `d_column_indices` and `d_values` (with the final entry being equal to `num_nonzeros`) //! //! @param[in] d_column_indices //! Pointer to the array of `num_nonzeros` column-indices of the corresponding nonzero //! elements of matrix `A`. (Indices are zero-valued.) //! //! @param[in] d_vector_x //! Pointer to the array of `num_cols` values corresponding to the dense input vector `x` //! //! @param[out] d_vector_y //! Pointer to the array of `num_rows` values corresponding to the dense output vector `y` //! //! @param[in] num_rows //! number of rows of matrix `A`. //! //! @param[in] num_cols //! number of columns of matrix `A`. //! //! @param[in] num_nonzeros //! number of nonzero elements of matrix `A`. //! //! @param[in] stream //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst template CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( void* d_temp_storage, size_t& temp_storage_bytes, const ValueT* d_values, const int* d_row_offsets, const int* d_column_indices, const ValueT* d_vector_x, ValueT* d_vector_y, int num_rows, int num_cols, int num_nonzeros, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSpmv::CsrMV"); SpmvParams spmv_params; spmv_params.d_values = d_values; spmv_params.d_row_end_offsets = d_row_offsets + 1; spmv_params.d_column_indices = d_column_indices; spmv_params.d_vector_x = d_vector_x; spmv_params.d_vector_y = d_vector_y; spmv_params.num_rows = num_rows; spmv_params.num_cols = num_cols; spmv_params.num_nonzeros = num_nonzeros; spmv_params.alpha = ValueT{1}; spmv_params.beta = ValueT{0}; return DispatchSpmv::Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t CsrMV( void* d_temp_storage, size_t& temp_storage_bytes, const ValueT* d_values, const int* d_row_offsets, const int* d_column_indices, const ValueT* d_vector_x, ValueT* d_vector_y, int num_rows, int num_cols, int num_nonzeros, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return CsrMV( d_temp_storage, temp_storage_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, num_rows, num_cols, num_nonzeros, stream); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/000077500000000000000000000000001463375617100170175ustar00rootroot00000000000000cccl-2.5.0/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh000066400000000000000000000277721463375617100253410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile) { const int tile_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile); } template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel( InputIteratorT input, InputT* first_tile_previous, OutputIteratorT result, DifferenceOpT difference_op, OffsetT num_items) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy; // It is OK to introspect the return type or parameter types of the // `operator()` function of `__device__` extended lambda within device code. 
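  // For example (illustrative only): with a transparent functor such as cuda::std::minus<> and
  // InputT = int, the invoke_result_t alias below resolves to int.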
using OutputT = detail::invoke_result_t; using Agent = AgentDifference; __shared__ typename Agent::TempStorage storage; Agent agent(storage, input, first_tile_previous, result, difference_op, num_items); int tile_idx = static_cast(blockIdx.x); OffsetT tile_base = static_cast(tile_idx) * ActivePolicyT::ITEMS_PER_TILE; agent.Process(tile_idx, tile_base); } template struct DeviceAdjacentDifferencePolicy { using ValueT = typename std::iterator_traits::value_type; //------------------------------------------------------------------------------ // Architecture-specific tuning policies //------------------------------------------------------------------------------ struct Policy300 : ChainedPolicy<300, Policy300, Policy300> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, Nominal8BItemsToItems(7), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE>; }; struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, Nominal8BItemsToItems(7), BLOCK_LOAD_WARP_TRANSPOSE, MayAlias ? LOAD_CA : LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE>; }; using MaxPolicy = Policy350; }; template > struct DispatchAdjacentDifference : public SelectedPolicy { using InputT = typename std::iterator_traits::value_type; void* d_temp_storage; std::size_t& temp_storage_bytes; InputIteratorT d_input; OutputIteratorT d_output; OffsetT num_items; DifferenceOpT difference_op; cudaStream_t stream; CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchAdjacentDifference( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input(d_input) , d_output(d_output) , num_items(num_items) , difference_op(difference_op) , stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_DEPRECATED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchAdjacentDifference( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input(d_input) , d_output(d_output) , num_items(num_items) , difference_op(difference_op) , stream(stream) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } /// Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using AdjacentDifferencePolicyT = typename ActivePolicyT::AdjacentDifferencePolicy; using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; cudaError error = cudaSuccess; do { constexpr int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE; const int num_tiles = static_cast(DivideAndRoundUp(num_items, tile_size)); std::size_t first_tile_previous_size = MayAlias * num_tiles * sizeof(InputT); void* allocations[1] = {nullptr}; std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage // allocation if (temp_storage_bytes == 0) { temp_storage_bytes = 1; } break; } if (num_items == OffsetT{}) { break; } auto first_tile_previous = reinterpret_cast(allocations[0]); if (MayAlias) { using AgentDifferenceInitT = AgentDifferenceInit; 
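      // The init kernel launched below snapshots each tile's boundary input element into
      // first_tile_previous before the (possibly in-place) difference pass runs, so every tile
      // can still read its neighboring value even after that element has been overwritten.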
constexpr int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = DivideAndRoundUp(num_tiles, init_block_size); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceAdjacentDifferenceInitKernel" "<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_block_size, reinterpret_cast(stream)); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) .doit(DeviceAdjacentDifferenceInitKernel, d_input, first_tile_previous, num_tiles, tile_size); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } } #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel" "<<<%d, %d, 0, %lld>>>()\n", num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) .doit(DeviceAdjacentDifferenceDifferenceKernel< MaxPolicyT, InputIteratorT, OutputIteratorT, DifferenceOpT, OffsetT, InputT, MayAlias, ReadLeft>, d_input, first_tile_previous, d_output, difference_op, num_items); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_RUNTIME_FUNCTION static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream) { using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchAdjacentDifference dispatch( d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh000066400000000000000000000700551463375617100240410ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DispatchBatchMemcpy provides device-wide, parallel operations for copying data from a number * of given source buffers to their corresponding destination buffer. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { /** * Parameterizable tuning policy type for AgentBatchMemcpy */ template struct AgentBatchMemcpyLargeBuffersPolicy { /// Threads per thread block static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS; /// The number of bytes each thread copies static constexpr uint32_t BYTES_PER_THREAD = _BYTES_PER_THREAD; }; /** * Initialization kernel for tile status initialization (multi-block) */ template CUB_DETAIL_KERNEL_ATTRIBUTES void InitTileStateKernel( BufferOffsetScanTileStateT buffer_offset_scan_tile_state, BlockOffsetScanTileStateT block_offset_scan_tile_state, TileOffsetT num_tiles) { // Initialize tile status buffer_offset_scan_tile_state.InitializeStatus(num_tiles); block_offset_scan_tile_state.InitializeStatus(num_tiles); } /** * Kernel that copies buffers that need to be copied by at least one (and potentially many) thread * blocks. */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel( InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, BufferTileOffsetItT buffer_tile_offsets, TileT buffer_offset_tile, TileOffsetT last_tile_offset) { using StatusWord = typename TileT::StatusWord; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; using BufferSizeT = cub::detail::value_t; /// Internal load/store type. 
For byte-wise memcpy, a single-byte type using AliasT = typename ::cuda::std::conditional, std::iterator_traits>>::type::value_type; /// Types of the input and output buffers using InputBufferT = cub::detail::value_t; using OutputBufferT = cub::detail::value_t; constexpr uint32_t BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS; constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD; constexpr BufferSizeT TILE_SIZE = static_cast(BLOCK_THREADS * ITEMS_PER_THREAD); BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset); uint32_t tile_id = blockIdx.x; // No block-level buffers => we're done here if (num_blev_buffers == 0) { return; } // While there's still tiles of bytes from block-level buffers to copied do { __shared__ BufferOffsetT block_buffer_id; // Make sure thread 0 does not overwrite the buffer id before other threads have finished with // the prior iteration of the loop CTA_SYNC(); // Binary search the buffer that this tile belongs to if (threadIdx.x == 0) { block_buffer_id = UpperBound(buffer_tile_offsets, num_blev_buffers, tile_id) - 1; } // Make sure thread 0 has written the buffer this thread block is assigned to CTA_SYNC(); const BufferOffsetT buffer_id = block_buffer_id; // The relative offset of this tile within the buffer it's assigned to BufferSizeT tile_offset_within_buffer = static_cast(tile_id - buffer_tile_offsets[buffer_id]) * TILE_SIZE; // If the tile has already reached beyond the work of the end of the last buffer if (buffer_id >= num_blev_buffers - 1 && tile_offset_within_buffer > buffer_sizes[buffer_id]) { return; } // Tiny remainders are copied without vectorizing laods if (buffer_sizes[buffer_id] - tile_offset_within_buffer <= 32) { BufferSizeT thread_offset = tile_offset_within_buffer + threadIdx.x; for (int i = 0; i < ITEMS_PER_THREAD; i++) { if (thread_offset < buffer_sizes[buffer_id]) { const auto value = read_item(input_buffer_it[buffer_id], thread_offset); write_item(output_buffer_it[buffer_id], thread_offset, value); } thread_offset += BLOCK_THREADS; } } else { copy_items( input_buffer_it[buffer_id], output_buffer_it[buffer_id], (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE), tile_offset_within_buffer); } tile_id += gridDim.x; } while (true); } /** * @brief Kernel that copies data from a batch of given source buffers to their corresponding * destination buffer. If a buffer's size is to large to be copied by a single thread block, that * buffer is put into a queue of buffers that will get picked up later on, where multiple blocks * collaborate on each of these buffers. All other buffers get copied straight away. 
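 *
 * For illustration, with the default DeviceBatchMemcpyPolicy defined further below, only buffers
 * of roughly BLOCK_LEVEL_THRESHOLD (8 KiB) bytes or more take the block-level path described
 * above; smaller buffers are copied directly by the thread block that encounters them, at thread
 * or warp granularity depending on their size.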
* * @param input_buffer_it [in] Iterator providing the pointers to the source memory buffers * @param output_buffer_it [in] Iterator providing the pointers to the destination memory buffers * @param buffer_sizes [in] Iterator providing the number of bytes to be copied for each pair of * buffers * @param num_buffers [in] The total number of buffer pairs * @param blev_buffer_srcs [out] The source pointers of buffers that require block-level * collaboration * @param blev_buffer_dsts [out] The destination pointers of buffers that require block-level * collaboration * @param blev_buffer_sizes [out] The sizes of buffers that require block-level collaboration * @param blev_buffer_scan_state [in,out] Tile states for the prefix sum over the count of buffers * requiring block-level collaboration (to "stream compact" (aka "select") BLEV-buffers) * @param blev_block_scan_state [in,out] Tile states for the prefix sum over the number of thread * blocks getting assigned to each buffer that requires block-level collaboration */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel( InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, BufferOffsetT num_buffers, BlevBufferSrcsOutItT blev_buffer_srcs, BlevBufferDstsOutItT blev_buffer_dsts, BlevBufferSizesOutItT blev_buffer_sizes, BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, BLevBufferOffsetTileState blev_buffer_scan_state, BLevBlockOffsetTileState blev_block_scan_state) { // Internal type used for storing a buffer's size using BufferSizeT = cub::detail::value_t; // Alias the correct tuning policy for the current compilation pass' architecture using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT; // Block-level specialization using AgentBatchMemcpyT = AgentBatchMemcpy< AgentBatchMemcpyPolicyT, InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BufferOffsetT, BlevBufferSrcsOutItT, BlevBufferDstsOutItT, BlevBufferSizesOutItT, BlevBufferTileOffsetsOutItT, BlockOffsetT, BLevBufferOffsetTileState, BLevBlockOffsetTileState, IsMemcpy>; // Shared memory for AgentBatchMemcpy __shared__ typename AgentBatchMemcpyT::TempStorage temp_storage; // Process this block's tile of input&output buffer pairs AgentBatchMemcpyT( temp_storage, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, blev_buffer_srcs, blev_buffer_dsts, blev_buffer_sizes, blev_buffer_tile_offsets, blev_buffer_scan_state, blev_block_scan_state) .ConsumeTile(blockIdx.x); } template struct DeviceBatchMemcpyPolicy { static constexpr uint32_t BLOCK_THREADS = 128U; static constexpr uint32_t BUFFERS_PER_THREAD = 4U; static constexpr uint32_t TLEV_BYTES_PER_THREAD = 8U; static constexpr uint32_t LARGE_BUFFER_BLOCK_THREADS = 256U; static constexpr uint32_t LARGE_BUFFER_BYTES_PER_THREAD = 32U; static constexpr uint32_t WARP_LEVEL_THRESHOLD = 128; static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = 8 * 1024; using buff_delay_constructor_t = detail::default_delay_constructor_t; using block_delay_constructor_t = detail::default_delay_constructor_t; /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { static constexpr bool PREFER_POW2_BITS = true; using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy< BLOCK_THREADS, BUFFERS_PER_THREAD, TLEV_BYTES_PER_THREAD, PREFER_POW2_BITS, LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD, WARP_LEVEL_THRESHOLD, BLOCK_LEVEL_THRESHOLD, 
buff_delay_constructor_t, block_delay_constructor_t>; using AgentLargeBufferPolicyT = AgentBatchMemcpyLargeBuffersPolicy; }; /// SM70 struct Policy700 : ChainedPolicy<700, Policy700, Policy350> { static constexpr bool PREFER_POW2_BITS = false; using AgentSmallBufferPolicyT = AgentBatchMemcpyPolicy< BLOCK_THREADS, BUFFERS_PER_THREAD, TLEV_BYTES_PER_THREAD, PREFER_POW2_BITS, LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD, WARP_LEVEL_THRESHOLD, BLOCK_LEVEL_THRESHOLD, buff_delay_constructor_t, block_delay_constructor_t>; using AgentLargeBufferPolicyT = AgentBatchMemcpyLargeBuffersPolicy; }; using MaxPolicy = Policy700; }; /** * @tparam InputBufferIt **[inferred]** Random-access input iterator type providing the pointers * to the source memory buffers * @tparam OutputBufferIt **[inferred]** Random-access input iterator type providing the pointers * to the destination memory buffers * @tparam BufferSizeIteratorT **[inferred]** Random-access input iterator type providing the * number of bytes to be copied for each pair of buffers * @tparam BufferOffsetT Integer type large enough to hold any offset in [0, num_buffers) * @tparam BlockOffsetT Integer type large enough to hold any offset in [0, * num_thread_blocks_launched) */ template , bool IsMemcpy = true> struct DispatchBatchMemcpy : SelectedPolicy { //------------------------------------------------------------------------------ // TYPE ALIASES //------------------------------------------------------------------------------ // Tile state for the single-pass prefix scan to "stream compact" (aka "select") the buffers // requiring block-level collaboration using BufferPartitionScanTileStateT = typename cub::ScanTileState; // Tile state for the single-pass prefix scan to keep track of how many blocks are assigned to // each of the buffers requiring block-level collaboration using BufferTileOffsetScanStateT = typename cub::ScanTileState; // Internal type used to keep track of a buffer's size using BufferSizeT = cub::detail::value_t; //------------------------------------------------------------------------------ // Member Veriables //------------------------------------------------------------------------------ void* d_temp_storage; size_t& temp_storage_bytes; InputBufferIt input_buffer_it; OutputBufferIt output_buffer_it; BufferSizeIteratorT buffer_sizes; BufferOffsetT num_buffers; cudaStream_t stream; //------------------------------------------------------------------------------ // Constructor //------------------------------------------------------------------------------ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchBatchMemcpy( void* d_temp_storage, size_t& temp_storage_bytes, InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, BufferOffsetT num_buffers, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , input_buffer_it(input_buffer_it) , output_buffer_it(output_buffer_it) , buffer_sizes(buffer_sizes) , num_buffers(num_buffers) , stream(stream) {} //------------------------------------------------------------------------------ // Chained policy invocation //------------------------------------------------------------------------------ /** * @brief Tuning policy invocation. This member function template is getting instantiated for all * tuning policies in the tuning policy chain. It is, however, *invoked* for the correct tuning * policy only. 
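   *
   * For illustration (simplified): Dispatch() calls MaxPolicyT::Invoke(ptx_version, dispatch),
   * which walks the policy chain (Policy700 -> Policy350) and runs dispatch.Invoke<Policy>() for
   * the first policy whose minimum architecture does not exceed the device's PTX version.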
*/ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy; // Single-pass prefix scan tile states for the prefix-sum over the number of block-level buffers using BLevBufferOffsetTileState = cub::ScanTileState; // Single-pass prefix scan tile states for the prefix sum over the number of thread blocks // assigned to each of the block-level buffers using BLevBlockOffsetTileState = cub::ScanTileState; cudaError error = cudaSuccess; enum : uint32_t { // Memory for the source pointers of the buffers that require block-level collaboration MEM_BLEV_BUFFER_SRCS = 0, // Memory for the destination pointers of the buffers that require block-level collaboration MEM_BLEV_BUFFER_DSTS, // Memory for the block-level buffers' sizes MEM_BLEV_BUFFER_SIZES, // Memory to keep track of the assignment of thread blocks to block-level buffers MEM_BLEV_BUFFER_TBLOCK, // Memory for the tile states of the prefix sum over the number of buffers that require // block-level collaboration MEM_BLEV_BUFFER_SCAN_STATE, // Memory for the scan tile states of the prefix sum over the number of thread block's // assigned up to and including a certain block-level buffer MEM_BLEV_BLOCK_SCAN_STATE, // Total number of distinct memory allocations in the temporary storage memory BLOB MEM_NUM_ALLOCATIONS }; // Number of threads per block for initializing the grid states constexpr BlockOffsetT INIT_KERNEL_THREADS = 128U; // The number of buffers that get processed per thread block constexpr uint32_t TILE_SIZE = ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS * ActivePolicyT::AgentSmallBufferPolicyT::BUFFERS_PER_THREAD; // The number of thread blocks (or tiles) required to process all of the given buffers BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE); using BlevBufferSrcsOutT = cub::detail::conditional_t>; using BlevBufferDstOutT = cub::detail::conditional_t>; using BlevBufferSrcsOutItT = BlevBufferSrcsOutT*; using BlevBufferDstsOutItT = BlevBufferDstOutT*; using BlevBufferSizesOutItT = BufferSizeT*; using BlevBufferTileOffsetsOutItT = BlockOffsetT*; temporary_storage::layout temporary_storage_layout; auto blev_buffer_srcs_slot = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SRCS); auto blev_buffer_dsts_slot = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_DSTS); auto blev_buffer_sizes_slot = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SIZES); auto blev_buffer_block_slot = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_TBLOCK); auto blev_buffer_scan_slot = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SCAN_STATE); auto blev_buffer_block_scan_slot = temporary_storage_layout.get_slot(MEM_BLEV_BLOCK_SCAN_STATE); auto blev_buffer_srcs_alloc = blev_buffer_srcs_slot->template create_alias(); auto blev_buffer_dsts_alloc = blev_buffer_dsts_slot->template create_alias(); auto blev_buffer_sizes_alloc = blev_buffer_sizes_slot->template create_alias(); auto blev_buffer_block_alloc = blev_buffer_block_slot->template create_alias(); auto blev_buffer_scan_alloc = blev_buffer_scan_slot->template create_alias(); auto blev_block_scan_alloc = blev_buffer_block_scan_slot->template create_alias(); std::size_t buffer_offset_scan_storage = 0; std::size_t blev_block_scan_storage = 0; error = CubDebug(BLevBufferOffsetTileState::AllocationSize(static_cast(num_tiles), buffer_offset_scan_storage)); if (error) { return error; } error = CubDebug(BLevBlockOffsetTileState::AllocationSize(static_cast(num_tiles), blev_block_scan_storage)); if (error) { 
return error; } blev_buffer_srcs_alloc.grow(num_buffers); blev_buffer_dsts_alloc.grow(num_buffers); blev_buffer_sizes_alloc.grow(num_buffers); blev_buffer_block_alloc.grow(num_buffers); blev_buffer_scan_alloc.grow(buffer_offset_scan_storage); blev_block_scan_alloc.grow(blev_block_scan_storage); // Just return if no temporary storage is provided if (d_temp_storage == nullptr) { temp_storage_bytes = temporary_storage_layout.get_size(); return error; } // Return if empty problem if (num_buffers == 0) { return error; } // Alias memory buffers into the storage blob error = CubDebug(temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)); if (cudaSuccess != error) { return error; } // Alias into temporary storage allocation BlevBufferSrcsOutItT d_blev_src_buffers = blev_buffer_srcs_alloc.get(); BlevBufferDstsOutItT d_blev_dst_buffers = blev_buffer_dsts_alloc.get(); BlevBufferSizesOutItT d_blev_buffer_sizes = blev_buffer_sizes_alloc.get(); BlevBufferTileOffsetsOutItT d_blev_block_offsets = blev_buffer_block_alloc.get(); // Kernels' grid sizes BlockOffsetT init_grid_size = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); BlockOffsetT batch_memcpy_grid_size = num_tiles; // Kernels auto init_scan_states_kernel = InitTileStateKernel; auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel< MaxPolicyT, InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BufferOffsetT, BlevBufferSrcsOutItT, BlevBufferDstsOutItT, BlevBufferSizesOutItT, BlevBufferTileOffsetsOutItT, BlockOffsetT, BLevBufferOffsetTileState, BLevBlockOffsetTileState, IsMemcpy>; auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel< MaxPolicyT, BufferOffsetT, BlevBufferSrcsOutItT, BlevBufferDstsOutItT, BlevBufferSizesOutItT, BlevBufferTileOffsetsOutItT, BLevBufferOffsetTileState, BlockOffsetT, IsMemcpy>; constexpr uint32_t BLEV_BLOCK_THREADS = ActivePolicyT::AgentLargeBufferPolicyT::BLOCK_THREADS; // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { return error; } // Get SM count int sm_count; error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); if (cudaSuccess != error) { return error; } // Get SM occupancy for the batch memcpy block-level buffers kernel int batch_memcpy_blev_occupancy; error = CubDebug(MaxSmOccupancy(batch_memcpy_blev_occupancy, multi_block_memcpy_kernel, BLEV_BLOCK_THREADS)); if (cudaSuccess != error) { return error; } int batch_memcpy_blev_grid_size = static_cast(sm_count * batch_memcpy_blev_occupancy * CUB_SUBSCRIPTION_FACTOR(0)); // Construct the tile status for the buffer prefix sum BLevBufferOffsetTileState buffer_scan_tile_state; error = CubDebug(buffer_scan_tile_state.Init( static_cast(num_tiles), blev_buffer_scan_alloc.get(), buffer_offset_scan_storage)); if (cudaSuccess != error) { return error; } // Construct the tile status for thread blocks-to-buffer-assignment prefix sum BLevBlockOffsetTileState block_scan_tile_state; error = CubDebug(block_scan_tile_state.Init( static_cast(num_tiles), blev_block_scan_alloc.get(), blev_block_scan_storage)); if (cudaSuccess != error) { return error; } #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking " "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(init_grid_size), INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke init_kernel to initialize buffer prefix sum-tile descriptors error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) 
.doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles); // Check for failure to launch error = CubDebug(error); if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); // Check for failure to launch if (cudaSuccess != error) { return error; } #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking " "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(batch_memcpy_grid_size), ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS, (long long) stream); #endif // Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked // up by next kernel error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( batch_memcpy_grid_size, ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS, 0, stream) .doit(batch_memcpy_non_blev_kernel, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, d_blev_src_buffers, d_blev_dst_buffers, d_blev_buffer_sizes, d_blev_block_offsets, buffer_scan_tile_state, block_scan_tile_state); // Check for failure to launch error = CubDebug(error); if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking " "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n", static_cast(batch_memcpy_blev_grid_size), BLEV_BLOCK_THREADS, (long long) stream); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream) .doit(multi_block_memcpy_kernel, d_blev_src_buffers, d_blev_dst_buffers, d_blev_buffer_sizes, d_blev_block_offsets, buffer_scan_tile_state, batch_memcpy_grid_size - 1); // Check for failure to launch error = CubDebug(error); if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); return error; } //------------------------------------------------------------------------------ // Dispatch entrypoints //------------------------------------------------------------------------------ /** * Internal dispatch routine */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, BufferOffsetT num_buffers, cudaStream_t stream) { using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy; cudaError_t error = cudaSuccess; // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { return error; } // Create dispatch functor DispatchBatchMemcpy dispatch( d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { return error; } return error; } }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_for.cuh000066400000000000000000000245731463375617100222000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace for_each { template struct first_parameter { using type = void; }; template struct first_parameter { using type = A; }; template struct first_parameter { using type = A; }; template using first_parameter_t = typename first_parameter::type; template struct has_unique_value_overload : ::cuda::std::false_type {}; // clang-format off template struct has_unique_value_overload< Value, Fn, typename ::cuda::std::enable_if< !::cuda::std::is_reference>::value && ::cuda::std::is_convertible >::value>::type> : ::cuda::std::true_type {}; // For trivial types, foreach is not allowed to copy values, even if those are trivially copyable. // This can be observable if the unary operator takes parameter by reference and modifies it or uses address. // The trait below checks if the freedom to copy trivial types can be regained. 
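// (Illustrative sketch, not part of the original source: an operator such as
//    struct mutate_op { __device__ void operator()(int& x) const { x = 0; } };
//  takes its parameter by reference, so handing it a copy instead of the original element would
//  be observable; an operator that only accepts its parameter by value, e.g.
//    struct read_op { __device__ void operator()(int x) const { printf("%d\n", x); } };
//  cannot tell the difference, which is what the trait below detects.)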
template using can_regain_copy_freedom = ::cuda::std::integral_constant< bool, ::cuda::std::is_trivially_constructible::value && ::cuda::std::is_trivially_copy_assignable::value && :: cuda::std::is_trivially_move_assignable::value && ::cuda::std::is_trivially_destructible::value && has_unique_value_overload::value>; // clang-format on // This kernel is used when the block size is not known at compile time template CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op) { using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; using agent_t = agent_block_striped_t; const auto block_threads = static_cast(blockDim.x); const auto items_per_tile = active_policy_t::items_per_thread * block_threads; const auto tile_base = static_cast(blockIdx.x) * items_per_tile; const auto num_remaining = num_items - tile_base; const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); if (items_in_tile == items_per_tile) { agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); } else { agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); } } // This kernel is used when the block size is known at compile time template CUB_DETAIL_KERNEL_ATTRIBUTES // __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) // void static_kernel(OffsetT num_items, OpT op) { using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t; using agent_t = agent_block_striped_t; constexpr auto block_threads = active_policy_t::block_threads; constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads; const auto tile_base = static_cast(blockIdx.x) * items_per_tile; const auto num_remaining = num_items - tile_base; const auto items_in_tile = static_cast(num_remaining < items_per_tile ? 
num_remaining : items_per_tile); if (items_in_tile == items_per_tile) { agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); } else { agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); } } // The dispatch layer is in the detail namespace until we figure out tuning API template struct dispatch_t : PolicyHubT { OffsetT num_items; OpT op; cudaStream_t stream; CUB_RUNTIME_FUNCTION dispatch_t(OffsetT num_items, OpT op, cudaStream_t stream) : num_items(num_items) , op(op) , stream(stream) {} template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(::cuda::std::false_type /* block size is not known at compile time */) { using max_policy_t = typename dispatch_t::MaxPolicy; if (num_items == 0) { return cudaSuccess; } int block_threads = 256; cudaError_t error = cudaSuccess; NV_IF_TARGET(NV_IS_HOST, (int _{}; // error = cudaOccupancyMaxPotentialBlockSize( &_, &block_threads, detail::for_each::dynamic_kernel);)); error = CubDebug(error); if (cudaSuccess != error) { return error; } constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread; const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking detail::for_each::dynamic_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), static_cast(block_threads), reinterpret_cast(stream), static_cast(items_per_thread)); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), static_cast(block_threads), 0, stream) .doit(detail::for_each::dynamic_kernel, num_items, op); error = CubDebug(error); if (cudaSuccess != error) { return error; } error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { CubDebug(error = SyncStream(stream)); } return error; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(::cuda::std::true_type /* block size is known at compile time */) { using max_policy_t = typename dispatch_t::MaxPolicy; if (num_items == 0) { return cudaSuccess; } cudaError_t error = cudaSuccess; constexpr int block_threads = ActivePolicyT::for_policy_t::block_threads; constexpr int items_per_thread = ActivePolicyT::for_policy_t::items_per_thread; const auto tile_size = static_cast(block_threads * items_per_thread); const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking detail::for_each::static_kernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread\n", static_cast(num_tiles), static_cast(block_threads), reinterpret_cast(stream), static_cast(items_per_thread)); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), static_cast(block_threads), 0, stream) .doit(detail::for_each::static_kernel, num_items, op); error = CubDebug(error); if (cudaSuccess != error) { return error; } error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { CubDebug(error = SyncStream(stream)); } return error; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { constexpr bool static_block_size = ActivePolicyT::for_policy_t::block_threads > 0; return Invoke(::cuda::std::integral_constant{}); } CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(OffsetT num_items, OpT op, cudaStream_t stream) { using max_policy_t = typename dispatch_t::MaxPolicy; int ptx_version = 0; cudaError_t error = CubDebug(PtxVersion(ptx_version)); if 
(cudaSuccess != error) { return error; } dispatch_t dispatch(num_items, op, stream); error = CubDebug(max_policy_t::Invoke(ptx_version, dispatch)); return error; } }; } // namespace for_each } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_histogram.cuh000066400000000000000000001544271463375617100234110ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) * from a sequence of samples data residing within device-accessible memory. 
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Histogram kernel entry points *****************************************************************************/ /** * Histogram initialization kernel entry point * * @tparam NUM_ACTIVE_CHANNELS * Number of channels actively being histogrammed * * @tparam CounterT * Integer type for counting sample occurrences per histogram bin * * @tparam OffsetT * Signed integer type for global offsets * * @param num_output_bins_wrapper * Number of output histogram bins per channel * * @param d_output_histograms_wrapper * Histogram counter data having logical dimensions * `CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]` * * @param tile_queue * Drain queue descriptor for dynamically mapping tile data onto thread blocks */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramInitKernel( ::cuda::std::array num_output_bins_wrapper, ::cuda::std::array d_output_histograms_wrapper, GridQueue tile_queue) { if ((threadIdx.x == 0) && (blockIdx.x == 0)) { tile_queue.ResetDrain(); } int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; #pragma unroll for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { if (output_bin < num_output_bins_wrapper[CHANNEL]) { d_output_histograms_wrapper[CHANNEL][output_bin] = 0; } } } /** * Histogram privatized sweep kernel entry point (multi-block). * Computes privatized histograms, one per thread block. * * * @tparam AgentHistogramPolicyT * Parameterized AgentHistogramPolicy tuning policy type * * @tparam PRIVATIZED_SMEM_BINS * Maximum number of histogram bins per channel (e.g., up to 256) * * @tparam NUM_CHANNELS * Number of channels interleaved in the input data (may be greater than the number of channels * being actively histogrammed) * * @tparam NUM_ACTIVE_CHANNELS * Number of channels actively being histogrammed * * @tparam SampleIteratorT * The input iterator type. @iterator. 
* * @tparam CounterT * Integer type for counting sample occurrences per histogram bin * * @tparam PrivatizedDecodeOpT * The transform operator type for determining privatized counter indices from samples, * one for each channel * * @tparam OutputDecodeOpT * The transform operator type for determining output bin-ids from privatized counter indices, * one for each channel * * @tparam OffsetT * integer type for global offsets * * @param d_samples * Input data to reduce * * @param num_output_bins_wrapper * The number bins per final output histogram * * @param num_privatized_bins_wrapper * The number bins per privatized histogram * * @param d_output_histograms_wrapper * Reference to final output histograms * * @param d_privatized_histograms_wrapper * Reference to privatized histograms * * @param output_decode_op_wrapper * The transform operator for determining output bin-ids from privatized counter indices, * one for each channel * * @param privatized_decode_op_wrapper * The transform operator for determining privatized counter indices from samples, * one for each channel * * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param tiles_per_row * Number of image tiles per row * * @param tile_queue * Drain queue descriptor for dynamically mapping tile data onto thread blocks */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel( SampleIteratorT d_samples, ::cuda::std::array num_output_bins_wrapper, ::cuda::std::array num_privatized_bins_wrapper, ::cuda::std::array d_output_histograms_wrapper, ::cuda::std::array d_privatized_histograms_wrapper, ::cuda::std::array output_decode_op_wrapper, ::cuda::std::array privatized_decode_op_wrapper, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue tile_queue) { // Thread block type for compositing input tiles using AgentHistogramPolicyT = typename ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT; using AgentHistogramT = AgentHistogram; // Shared memory for AgentHistogram __shared__ typename AgentHistogramT::TempStorage temp_storage; AgentHistogramT agent( temp_storage, d_samples, num_output_bins_wrapper.__elems_, num_privatized_bins_wrapper.__elems_, d_output_histograms_wrapper.__elems_, d_privatized_histograms_wrapper.__elems_, output_decode_op_wrapper.__elems_, privatized_decode_op_wrapper.__elems_); // Initialize counters agent.InitBinCounters(); // Consume input tiles agent.ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue); // Store output to global (if necessary) agent.StoreOutput(); } namespace detail { template struct dispatch_histogram { void* d_temp_storage; size_t& temp_storage_bytes; SampleIteratorT d_samples; CounterT** d_output_histograms; const int* num_privatized_levels; PrivatizedDecodeOpT* privatized_decode_op; const int* num_output_levels; OutputDecodeOpT* output_decode_op; int max_num_output_bins; OffsetT num_row_pixels; OffsetT num_rows; OffsetT row_stride_samples; cudaStream_t stream; CUB_RUNTIME_FUNCTION dispatch_histogram( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_privatized_levels[NUM_ACTIVE_CHANNELS], 
PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], int max_num_output_bins, OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_samples(d_samples) , d_output_histograms(d_output_histograms) , num_privatized_levels(num_privatized_levels) , privatized_decode_op(privatized_decode_op) , num_output_levels(num_output_levels) , output_decode_op(output_decode_op) , max_num_output_bins(max_num_output_bins) , num_row_pixels(num_row_pixels) , num_rows(num_rows) , row_stride_samples(row_stride_samples) , stream(stream) {} template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t Invoke(DeviceHistogramInitKernelT histogram_init_kernel, DeviceHistogramSweepKernelT histogram_sweep_kernel) { cudaError error = cudaSuccess; constexpr int block_threads = ActivePolicyT::AgentHistogramPolicyT::BLOCK_THREADS; constexpr int pixels_per_thread = ActivePolicyT::AgentHistogramPolicyT::PIXELS_PER_THREAD; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Get SM count int sm_count; error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); if (cudaSuccess != error) { break; } // Get SM occupancy for histogram_sweep_kernel int histogram_sweep_sm_occupancy; error = CubDebug(MaxSmOccupancy(histogram_sweep_sm_occupancy, histogram_sweep_kernel, block_threads)); if (cudaSuccess != error) { break; } // Get device occupancy for histogram_sweep_kernel int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; if (num_row_pixels * NUM_CHANNELS == row_stride_samples) { // Treat as a single linear array of samples num_row_pixels *= num_rows; num_rows = 1; row_stride_samples = num_row_pixels * NUM_CHANNELS; } // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy int pixels_per_tile = block_threads * pixels_per_thread; int tiles_per_row = static_cast(cub::DivideAndRoundUp(num_row_pixels, pixels_per_tile)); int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); int blocks_per_col = (blocks_per_row > 0) ? 
int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : 0; int num_thread_blocks = blocks_per_row * blocks_per_col; dim3 sweep_grid_dims; sweep_grid_dims.x = (unsigned int) blocks_per_row; sweep_grid_dims.y = (unsigned int) blocks_per_col; sweep_grid_dims.z = 1; // Temporary storage allocation requirements constexpr int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; void* allocations[NUM_ALLOCATIONS] = {}; size_t allocation_sizes[NUM_ALLOCATIONS]; for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); } allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); // Alias the temporary allocations from the single storage blob (or compute the // necessary size of the blob) error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the grid queue descriptor GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); // Wrap arrays so we can pass them by-value to the kernel ::cuda::std::array d_output_histograms_wrapper; ::cuda::std::array d_privatized_histograms_wrapper; ::cuda::std::array privatized_decode_op_wrapper; ::cuda::std::array output_decode_op_wrapper; ::cuda::std::array num_privatized_bins_wrapper; ::cuda::std::array num_output_bins_wrapper; auto* typedAllocations = reinterpret_cast(allocations); ::cuda::std::copy( d_output_histograms, d_output_histograms + NUM_ACTIVE_CHANNELS, d_output_histograms_wrapper.begin()); ::cuda::std::copy( typedAllocations, typedAllocations + NUM_ACTIVE_CHANNELS, d_privatized_histograms_wrapper.begin()); ::cuda::std::copy( privatized_decode_op, privatized_decode_op + NUM_ACTIVE_CHANNELS, privatized_decode_op_wrapper.begin()); ::cuda::std::copy(output_decode_op, output_decode_op + NUM_ACTIVE_CHANNELS, output_decode_op_wrapper.begin()); auto minus_one = cuda::proclaim_return_type([](int levels) { return levels - 1; }); ::cuda::std::transform( num_privatized_levels, num_privatized_levels + NUM_ACTIVE_CHANNELS, num_privatized_bins_wrapper.begin(), minus_one); ::cuda::std::transform( num_output_levels, num_output_levels + NUM_ACTIVE_CHANNELS, num_output_bins_wrapper.begin(), minus_one); int histogram_init_block_threads = 256; int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; // Log DeviceHistogramInitKernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); #endif // Invoke histogram_init_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( histogram_init_grid_dims, histogram_init_block_threads, 0, stream) .doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue); // Return if empty problem if ((blocks_per_row == 0) || (blocks_per_col == 0)) { break; } // Log histogram_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels " "per thread, %d SM occupancy\n", sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, block_threads, (long long) stream, pixels_per_thread, histogram_sweep_sm_occupancy); #endif // Invoke histogram_sweep_kernel 
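      // (Clarifying note, not part of the original source: the sweep kernel runs on a
      // blocks_per_row x blocks_per_col grid; tile_queue is the drain queue descriptor used to
      // dynamically map any remaining image tiles onto those thread blocks.)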
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream) .doit(histogram_sweep_kernel, d_samples, num_output_bins_wrapper, num_privatized_bins_wrapper, d_output_histograms_wrapper, d_privatized_histograms_wrapper, output_decode_op_wrapper, privatized_decode_op_wrapper, num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { return Invoke( DeviceHistogramInitKernel, DeviceHistogramSweepKernel); } }; } // namespace detail /****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram * * @tparam NUM_CHANNELS * Number of channels interleaved in the input data (may be greater than the number of channels * being actively histogrammed) * * @tparam NUM_ACTIVE_CHANNELS * Number of channels actively being histogrammed * * @tparam SampleIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam CounterT * Integer type for counting sample occurrences per histogram bin * * @tparam LevelT * Type for specifying bin level boundaries * * @tparam OffsetT * Signed integer type for global offsets * * @tparam SelectedPolicy * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. 
*/ template , CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>> struct DispatchHistogram : SelectedPolicy { static_assert(NUM_CHANNELS <= 4, "Histograms only support up to 4 channels"); static_assert(NUM_ACTIVE_CHANNELS <= 4, "Histograms only support up to 4 active channels"); public: //--------------------------------------------------------------------- // Types and constants //--------------------------------------------------------------------- /// The sample value type of the input iterator using SampleT = cub::detail::value_t; enum { // Maximum number of bins per channel for which we will use a privatized smem strategy MAX_PRIVATIZED_SMEM_BINS = 256 }; //--------------------------------------------------------------------- // Transform functors for converting samples to bin-ids //--------------------------------------------------------------------- // Searches for bin given a list of bin-boundary levels template struct SearchTransform { LevelIteratorT d_levels; // Pointer to levels array int num_output_levels; // Number of levels in array /** * @brief Initializer * * @param d_levels_ Pointer to levels array * @param num_output_levels_ Number of levels in array */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void Init(LevelIteratorT d_levels_, int num_output_levels_) { this->d_levels = d_levels_; this->num_output_levels = num_output_levels_; } // Method for converting samples to bin-ids template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) { /// Level iterator wrapper type // Wrap the native input pointer with CacheModifiedInputIterator // or Directly use the supplied input iterator type using WrappedLevelIteratorT = cub::detail::conditional_t::value, CacheModifiedInputIterator, LevelIteratorT>; WrappedLevelIteratorT wrapped_levels(d_levels); int num_bins = num_output_levels - 1; if (valid) { bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; if (bin >= num_bins) { bin = -1; } } } }; // Scales samples to evenly-spaced bins struct ScaleTransform { private: using CommonT = typename ::cuda::std::common_type::type; static_assert(::cuda::std::is_convertible::value, "The common type of `LevelT` and `SampleT` must be " "convertible to `int`."); static_assert(::cuda::std::is_trivially_copyable::value, "The common type of `LevelT` and `SampleT` must be " "trivially copyable."); // An arithmetic type that's used for bin computation of integral types, guaranteed to not // overflow for (max_level - min_level) * scale.fraction.bins. Since we drop invalid samples // of less than min_level, (sample - min_level) is guaranteed to be non-negative. We use the // rule: 2^l * 2^r = 2^(l + r) to determine a sufficiently large type to hold the // multiplication result. 
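    // (Worked restatement, not part of the original source: if (sample - min_level) needs at most
    //  l = 8 * sizeof(SampleT) bits and scale.fraction.bins needs at most r = 8 * sizeof(CommonT)
    //  bits, their product needs at most l + r bits; this motivates the
    //  sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t) test below for choosing uint32_t
    //  over uint64_t arithmetic.)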
// If CommonT used to be a 128-bit wide integral type already, we use CommonT's arithmetic using IntArithmeticT = cub::detail::conditional_t< // sizeof(SampleT) + sizeof(CommonT) <= sizeof(uint32_t), // uint32_t, // #if CUB_IS_INT128_ENABLED cub::detail::conditional_t< // (::cuda::std::is_same::value || // ::cuda::std::is_same::value), // CommonT, // uint64_t> // #else uint64_t #endif >; // Alias template that excludes __[u]int128 from the integral types template using is_integral_excl_int128 = #if CUB_IS_INT128_ENABLED cub::detail::conditional_t< ::cuda::std::is_same::value&& ::cuda::std::is_same::value, ::cuda::std::false_type, ::cuda::std::is_integral>; #else ::cuda::std::is_integral; #endif union ScaleT { // Used when CommonT is not floating-point to avoid intermediate // rounding errors (see NVIDIA/cub#489). struct FractionT { CommonT bins; CommonT range; } fraction; // Used when CommonT is floating-point as an optimization. CommonT reciprocal; }; CommonT m_max; // Max sample level (exclusive) CommonT m_min; // Min sample level (inclusive) ScaleT m_scale; // Bin scaling template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScaleT ComputeScale(int num_levels, T max_level, T min_level, ::cuda::std::true_type /* is_fp */) { ScaleT result; result.reciprocal = static_cast(static_cast(num_levels - 1) / static_cast(max_level - min_level)); return result; } template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScaleT ComputeScale(int num_levels, T max_level, T min_level, ::cuda::std::false_type /* is_fp */) { ScaleT result; result.fraction.bins = static_cast(num_levels - 1); result.fraction.range = static_cast(max_level - min_level); return result; } template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScaleT ComputeScale(int num_levels, T max_level, T min_level) { return this->ComputeScale(num_levels, max_level, min_level, ::cuda::std::is_floating_point{}); } #ifdef __CUDA_FP16_TYPES_EXIST__ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScaleT ComputeScale(int num_levels, __half max_level, __half min_level) { ScaleT result; NV_IF_TARGET(NV_PROVIDES_SM_53, (result.reciprocal = __hdiv(__float2half(num_levels - 1), __hsub(max_level, min_level));), (result.reciprocal = __float2half( static_cast(num_levels - 1) / (__half2float(max_level) - __half2float(min_level)));)) return result; } #endif // __CUDA_FP16_TYPES_EXIST__ // All types but __half: template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(T sample, T max_level, T min_level) { return sample >= min_level && sample < max_level; } #ifdef __CUDA_FP16_TYPES_EXIST__ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int SampleIsValid(__half sample, __half max_level, __half min_level) { NV_IF_TARGET( NV_PROVIDES_SM_53, (return __hge(sample, min_level) && __hlt(sample, max_level);), (return __half2float(sample) >= __half2float(min_level) && __half2float(sample) < __half2float(max_level);)); } #endif // __CUDA_FP16_TYPES_EXIST__ /** * @brief Bin computation for floating point (and extended floating point) types */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::true_type /* is_fp */) { return static_cast((sample - min_level) * scale.reciprocal); } /** * @brief Bin computation for custom types and __[u]int128 */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale, ::cuda::std::false_type /* is_fp */) { return static_cast(((sample - min_level) * scale.fraction.bins) / scale.fraction.range); } /** * @brief Bin computation for integral types of up to 64-bit types */ template ::value, 
int>::type = 0> _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) { return static_cast( (static_cast(sample - min_level) * static_cast(scale.fraction.bins)) / static_cast(scale.fraction.range)); } template ::value, int>::type = 0> _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(T sample, T min_level, ScaleT scale) { return this->ComputeBin(sample, min_level, scale, ::cuda::std::is_floating_point{}); } #ifdef __CUDA_FP16_TYPES_EXIST__ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE int ComputeBin(__half sample, __half min_level, ScaleT scale) { NV_IF_TARGET( NV_PROVIDES_SM_53, (return static_cast(__hmul(__hsub(sample, min_level), scale.reciprocal));), (return static_cast((__half2float(sample) - __half2float(min_level)) * __half2float(scale.reciprocal));)); } #endif // __CUDA_FP16_TYPES_EXIST__ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool MayOverflow(CommonT /* num_bins */, ::cuda::std::false_type /* is_integral */) { return false; } /** * @brief Returns true if the bin computation for a given combination of range `(max_level - * min_level)` and number of bins may overflow. */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool MayOverflow(CommonT num_bins, ::cuda::std::true_type /* is_integral */) { return static_cast(m_max - m_min) > (::cuda::std::numeric_limits::max() / static_cast(num_bins)); } public: /** * @brief Initializes the ScaleTransform for the given parameters * @return cudaErrorInvalidValue if the ScaleTransform for the given values may overflow, * cudaSuccess otherwise */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t Init(int num_levels, LevelT max_level, LevelT min_level) { m_max = static_cast(max_level); m_min = static_cast(min_level); // Check whether accurate bin computation for an integral sample type may overflow if (MayOverflow(static_cast(num_levels - 1), ::cuda::std::is_integral{})) { return cudaErrorInvalidValue; } m_scale = this->ComputeScale(num_levels, m_max, m_min); return cudaSuccess; } // Method for converting samples to bin-ids template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(SampleT sample, int& bin, bool valid) { const CommonT common_sample = static_cast(sample); if (valid && this->SampleIsValid(common_sample, m_max, m_min)) { bin = this->ComputeBin(common_sample, m_min, m_scale); } } }; // Pass-through bin transform operator struct PassThruTransform { // Method for converting samples to bin-ids template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void BinSelect(_SampleT sample, int& bin, bool valid) { if (valid) { bin = (int) sample; } } }; //--------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------- /** * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to `temp_storage_bytes` and * no work is done. * * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_samples * The pointer to the multi-channel input sequence of data samples. * The samples from different channels are assumed to be interleaved * (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). * * @param d_output_histograms * The pointers to the histogram counter output arrays, one for each active channel. * For channeli, the allocation length of `d_histograms[i]` should be * `num_output_levels[i] - 1`. 
* * @param num_output_levels * The number of boundaries (levels) for delineating histogram samples in each active channel. * Implies that the number of bins for channeli is * `num_output_levels[i] - 1`. * * @param d_levels * The pointers to the arrays of boundaries (levels), one for each active channel. * Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are * inclusive and upper sample value boundaries are exclusive. * * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param stream * CUDA stream to launch kernels within. Default is stream0. * * @param is_byte_sample * type indicating whether or not SampleT is a 8b type */ CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, Int2Type /*is_byte_sample*/) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Use the search transform op for converting samples to privatized bins typedef SearchTransform PrivatizedDecodeOpT; // Use the pass-thru transform op for converting privatized bins to output bins typedef PassThruTransform OutputDecodeOpT; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); if (num_output_levels[channel] > max_levels) { max_levels = num_output_levels[channel]; } } int max_num_output_bins = max_levels - 1; // Dispatch if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) { // Too many bins to keep in shared memory. 
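        // (Clarifying note, not part of the original source: a PRIVATIZED_SMEM_BINS of 0 selects
        // the code path in which each block's privatized histograms are kept in global memory,
        // drawn from the temporary storage allocation, instead of in shared memory.)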
constexpr int PRIVATIZED_SMEM_BINS = 0; detail::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT, MaxPolicyT> dispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, stream); error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } else { // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; detail::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT, MaxPolicyT> dispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, stream); error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchRange( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, stream, is_byte_sample); } /** * Dispatch routine for HistogramRange, specialized for 8-bit sample types * (computes 256-bin privatized histograms and then reduces to user-specified levels) * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to `temp_storage_bytes` and * no work is done. * * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_samples * The pointer to the multi-channel input sequence of data samples. * The samples from different channels are assumed to be interleaved * (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). * * @param d_output_histograms * The pointers to the histogram counter output arrays, one for each active channel. * For channeli, the allocation length of * `d_histograms[i]` should be `num_output_levels[i] - 1`. * * @param num_output_levels * The number of boundaries (levels) for delineating histogram samples in each active channel. * Implies that the number of bins for channeli is * `num_output_levels[i] - 1`. * * @param d_levels * The pointers to the arrays of boundaries (levels), one for each active channel. * Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are * inclusive and upper sample value boundaries are exclusive. 
* * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param stream * CUDA stream to launch kernels within. Default is stream0. * * @param is_byte_sample * Marker type indicating whether or not SampleT is a 8b type */ CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, Int2Type /*is_byte_sample*/) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Use the pass-thru transform op for converting samples to privatized bins typedef PassThruTransform PrivatizedDecodeOpT; // Use the search transform op for converting privatized bins to output bins typedef SearchTransform OutputDecodeOpT; int num_privatized_levels[NUM_ACTIVE_CHANNELS]; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; int max_levels = num_output_levels[0]; // Maximum number of levels in any channel for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { num_privatized_levels[channel] = 257; output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); if (num_output_levels[channel] > max_levels) { max_levels = num_output_levels[channel]; } } int max_num_output_bins = max_levels - 1; constexpr int PRIVATIZED_SMEM_BINS = 256; detail::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT, MaxPolicyT> dispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_privatized_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, stream); error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT* const d_levels[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchRange( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, stream, is_byte_sample); } /** * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. 
* * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_samples * The pointer to the input sequence of sample items. * The samples from different channels are assumed to be interleaved * (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). * * @param d_output_histograms * The pointers to the histogram counter output arrays, one for each active channel. * For channeli, the allocation length of `d_histograms[i]` should be * `num_output_levels[i] - 1`. * * @param num_output_levels * The number of bin level boundaries for delineating histogram samples in each active channel. * Implies that the number of bins for channeli is * `num_output_levels[i] - 1`. * * @param lower_level * The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. * * @param upper_level * The upper sample value bound (exclusive) for the highest histogram bin in each active * channel. * * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param stream * CUDA stream to launch kernels within. Default is stream0. * * @param is_byte_sample * Marker type indicating whether or not SampleT is a 8b type */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, Int2Type /*is_byte_sample*/) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Use the scale transform op for converting samples to privatized bins typedef ScaleTransform PrivatizedDecodeOpT; // Use the pass-thru transform op for converting privatized bins to output bins typedef PassThruTransform OutputDecodeOpT; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { error = CubDebug( privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel])); if (error != cudaSuccess) { // Make sure to also return a reasonable value for `temp_storage_bytes` in case of // an overflow of the bin computation, in which case a subsequent algorithm // invocation will also fail if (!d_temp_storage) { temp_storage_bytes = 1U; } return error; } if (num_output_levels[channel] > max_levels) { max_levels = num_output_levels[channel]; } } int max_num_output_bins = max_levels - 1; if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) { // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = 0; detail::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT, MaxPolicyT> dispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, 
num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, stream); error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } else { // Dispatch shared-privatized approach constexpr int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; detail::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT, MaxPolicyT> dispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, stream); error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchEven( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_samples, stream, is_byte_sample); } /** * Dispatch routine for HistogramEven, specialized for 8-bit sample types * (computes 256-bin privatized histograms and then reduces to user-specified levels) * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to `temp_storage_bytes` and * no work is done. * * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_samples * The pointer to the input sequence of sample items. The samples from different channels are * assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of * four RGBA 8-bit samples). * * @param d_output_histograms * The pointers to the histogram counter output arrays, one for each active channel. * For channeli, the allocation length of `d_histograms[i]` should be * `num_output_levels[i] - 1`. * * @param num_output_levels * The number of bin level boundaries for delineating histogram samples in each active channel. * Implies that the number of bins for channeli is * `num_output_levels[i] - 1`. * * @param lower_level * The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. * * @param upper_level * The upper sample value bound (exclusive) for the highest histogram bin in each active * channel. * * @param num_row_pixels * The number of multi-channel pixels per row in the region of interest * * @param num_rows * The number of rows in the region of interest * * @param row_stride_samples * The number of samples between starts of consecutive rows in the region of interest * * @param stream * CUDA stream to launch kernels within. Default is stream0. 
* * @param is_byte_sample * type indicating whether or not SampleT is a 8b type */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, Int2Type /*is_byte_sample*/) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Use the pass-thru transform op for converting samples to privatized bins typedef PassThruTransform PrivatizedDecodeOpT; // Use the scale transform op for converting privatized bins to output bins typedef ScaleTransform OutputDecodeOpT; int num_privatized_levels[NUM_ACTIVE_CHANNELS]; PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]{}; OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]{}; int max_levels = num_output_levels[0]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { num_privatized_levels[channel] = 257; output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel]); if (num_output_levels[channel] > max_levels) { max_levels = num_output_levels[channel]; } } int max_num_output_bins = max_levels - 1; constexpr int PRIVATIZED_SMEM_BINS = 256; detail::dispatch_histogram< NUM_CHANNELS, NUM_ACTIVE_CHANNELS, PRIVATIZED_SMEM_BINS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT, MaxPolicyT> dispatch( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_privatized_levels, privatized_decode_op, num_output_levels, output_decode_op, max_num_output_bins, num_row_pixels, num_rows, row_stride_samples, stream); error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven( void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], const int num_output_levels[NUM_ACTIVE_CHANNELS], const LevelT lower_level[NUM_ACTIVE_CHANNELS], const LevelT upper_level[NUM_ACTIVE_CHANNELS], OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, cudaStream_t stream, bool debug_synchronous, Int2Type is_byte_sample) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return DispatchEven( d_temp_storage, temp_storage_bytes, d_samples, d_output_histograms, num_output_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_samples, stream, is_byte_sample); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_merge_sort.cuh000066400000000000000000000621201463375617100235460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { /** * @brief Helper class template that provides two agent template instantiations: one instantiated with the default * policy and one with the fallback policy. This helps to avoid having to enlist all the agent's template parameters * twice: once for the default agent and once for the fallback agent */ template class AgentT, typename... AgentParamsT> struct dual_policy_agent_helper_t { using default_agent_t = AgentT; using fallback_agent_t = AgentT; static constexpr auto default_size = sizeof(typename default_agent_t::TempStorage); static constexpr auto fallback_size = sizeof(typename fallback_agent_t::TempStorage); }; /** * @brief Helper class template for merge sort-specific virtual shared memory handling. The merge sort algorithm in its * current implementation relies on the fact that both the sorting as well as the merging kernels use the same tile * size. This circumstance needs to be respected when determining whether the fallback policy for large user types is * applicable: we must either use the fallback for both or for none of the two agents. 
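 *
 * An illustrative example (made-up numbers, not measurements): suppose the default-policy block
 * sort agent needs 96 KiB of shared memory, the default-policy merge agent needs 64 KiB, both
 * fallback agents fit in 40 KiB, and max_smem_per_block is 48 KiB. Then uses_fallback_policy is
 * true and both agents switch to the fallback policy together. If even the fallback agents did
 * not fit, both would keep the default policy and the excess would be served from virtual shared
 * memory residing in global memory instead.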
*/ template class merge_sort_vsmem_helper_t { private: // Default fallback policy with a smaller tile size using fallback_policy_t = cub::detail::policy_wrapper_t; // Helper for the `AgentBlockSort` template with one member type alias for the agent template instantiated with the // default policy and one instantiated with the fallback policy using block_sort_helper_t = dual_policy_agent_helper_t< DefaultPolicyT, fallback_policy_t, AgentBlockSort, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>; using default_block_sort_agent_t = typename block_sort_helper_t::default_agent_t; using fallback_block_sort_agent_t = typename block_sort_helper_t::fallback_agent_t; // Helper for the `AgentMerge` template with one member type alias for the agent template instantiated with the // default policy and one instantiated with the fallback policy using merge_helper_t = dual_policy_agent_helper_t< DefaultPolicyT, fallback_policy_t, AgentMerge, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>; using default_merge_agent_t = typename merge_helper_t::default_agent_t; using fallback_merge_agent_t = typename merge_helper_t::fallback_agent_t; // Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory // available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the // available shared memory static constexpr auto max_default_size = (cub::max)(block_sort_helper_t::default_size, merge_helper_t::default_size); static constexpr auto max_fallback_size = (cub::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size); static constexpr bool uses_fallback_policy = (max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block); public: using policy_t = cub::detail::conditional_t; using block_sort_agent_t = cub::detail::conditional_t; using merge_agent_t = cub::detail::conditional_t; }; } // namespace detail template __launch_bounds__( cub::detail::merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel( bool ping, KeyInputIteratorT keys_in, ValueInputIteratorT items_in, KeyIteratorT keys_out, ValueIteratorT items_out, OffsetT keys_count, KeyT* tmp_keys_out, ValueT* tmp_items_out, CompareOpT compare_op, cub::detail::vsmem_t vsmem) { using MergeSortHelperT = cub::detail::merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>; using ActivePolicyT = typename MergeSortHelperT::policy_t; using AgentBlockSortT = typename MergeSortHelperT::block_sort_agent_t; using VSmemHelperT = cub::detail::vsmem_helper_impl; // Static shared memory allocation __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; // Get temporary storage typename AgentBlockSortT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem); AgentBlockSortT agent( ping, temp_storage, THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in), keys_count, keys_out, items_out, tmp_keys_out, tmp_items_out, compare_op); agent.Process(); // If applicable, 
hints to discard modified cache lines for vsmem VSmemHelperT::discard_temp_storage(temp_storage); } template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel( bool ping, KeyIteratorT keys_ping, KeyT* keys_pong, OffsetT keys_count, OffsetT num_partitions, OffsetT* merge_partitions, CompareOpT compare_op, OffsetT target_merged_tiles_number, int items_per_tile) { OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { AgentPartition agent( ping, keys_ping, keys_pong, keys_count, partition_idx, merge_partitions, compare_op, target_merged_tiles_number, items_per_tile, num_partitions); agent.Process(); } } template __launch_bounds__( cub::detail::merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>::policy_t::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel( bool ping, KeyIteratorT keys_ping, ValueIteratorT items_ping, OffsetT keys_count, KeyT* keys_pong, ValueT* items_pong, CompareOpT compare_op, OffsetT* merge_partitions, OffsetT target_merged_tiles_number, cub::detail::vsmem_t vsmem) { using MergeSortHelperT = cub::detail::merge_sort_vsmem_helper_t< typename ChainedPolicyT::ActivePolicy::MergeSortPolicy, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>; using ActivePolicyT = typename MergeSortHelperT::policy_t; using AgentMergeT = typename MergeSortHelperT::merge_agent_t; using VSmemHelperT = cub::detail::vsmem_helper_impl; // Static shared memory allocation __shared__ typename VSmemHelperT::static_temp_storage_t static_temp_storage; // Get temporary storage typename AgentMergeT::TempStorage& temp_storage = VSmemHelperT::get_temp_storage(static_temp_storage, vsmem); AgentMergeT agent( ping, temp_storage, THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong), THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong), keys_count, keys_pong, items_pong, keys_ping, items_ping, compare_op, merge_partitions, target_merged_tiles_number); agent.Process(); // If applicable, hints to discard modified cache lines for vsmem VSmemHelperT::discard_temp_storage(temp_storage); } /******************************************************************************* * Policy ******************************************************************************/ template struct DeviceMergeSortPolicy { using KeyT = cub::detail::value_t; //---------------------------------------------------------------------------- // Architecture-specific tuning policies //---------------------------------------------------------------------------- struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { using MergeSortPolicy = AgentMergeSortPolicy<256, Nominal4BItemsToItems(11), cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_STORE_WARP_TRANSPOSE>; }; // NVBug 3384810 #if defined(_NVHPC_CUDA) using Policy520 = Policy350; #else struct Policy520 : ChainedPolicy<520, Policy520, Policy350> { using MergeSortPolicy = AgentMergeSortPolicy<512, Nominal4BItemsToItems(15), cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_STORE_WARP_TRANSPOSE>; }; #endif struct Policy600 : ChainedPolicy<600, Policy600, Policy520> { using 
MergeSortPolicy = AgentMergeSortPolicy<256, Nominal4BItemsToItems(17), cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_DEFAULT, cub::BLOCK_STORE_WARP_TRANSPOSE>; }; /// MaxPolicy using MaxPolicy = Policy600; }; template > struct DispatchMergeSort : SelectedPolicy { using KeyT = cub::detail::value_t; using ValueT = cub::detail::value_t; /// Whether or not there are values to be trucked along with keys static constexpr bool KEYS_ONLY = std::is_same::value; // Problem state /// Device-accessible allocation of temporary storage. When NULL, the required /// allocation size is written to \p temp_storage_bytes and no work is done. void* d_temp_storage; /// Reference to size in bytes of \p d_temp_storage allocation std::size_t& temp_storage_bytes; /// Pointer to the input sequence of unsorted input keys KeyInputIteratorT d_input_keys; /// Pointer to the input sequence of unsorted input values ValueInputIteratorT d_input_items; /// Pointer to the output sequence of sorted input keys KeyIteratorT d_output_keys; /// Pointer to the output sequence of sorted input values ValueIteratorT d_output_items; /// Number of items to sort OffsetT num_items; /// Comparison function object which returns true if the first argument is /// ordered before the second CompareOpT compare_op; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; // Constructor CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchMergeSort( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input_keys(d_input_keys) , d_input_items(d_input_items) , d_output_keys(d_output_keys) , d_output_items(d_output_items) , num_items(num_items) , compare_op(compare_op) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchMergeSort( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input_keys(d_input_keys) , d_input_items(d_input_items) , d_output_keys(d_output_keys) , d_output_items(d_output_items) , num_items(num_items) , compare_op(compare_op) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } // Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using MergePolicyT = typename ActivePolicyT::MergeSortPolicy; using merge_sort_helper_t = cub::detail::merge_sort_vsmem_helper_t< MergePolicyT, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>; using BlockSortVSmemHelperT = cub::detail::vsmem_helper_impl; using MergeAgentVSmemHelperT = cub::detail::vsmem_helper_impl; using MaxPolicyT = typename DispatchMergeSort::MaxPolicy; cudaError error = cudaSuccess; if (num_items == 0) { if (d_temp_storage == nullptr) { temp_storage_bytes = 0; } return error; } do { constexpr auto tile_size = merge_sort_helper_t::policy_t::ITEMS_PER_TILE; const auto num_tiles = cub::DivideAndRoundUp(num_items, tile_size); const auto merge_partitions_size = 
static_cast(1 + num_tiles) * sizeof(OffsetT); const auto temporary_keys_storage_size = static_cast(num_items * sizeof(KeyT)); const auto temporary_values_storage_size = static_cast(num_items * sizeof(ValueT)) * !KEYS_ONLY; /** * Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases, * merge sort allocates virtual shared memory that resides in global memory. */ std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block; std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block; std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size); void* allocations[4] = {nullptr, nullptr, nullptr, nullptr}; std::size_t allocation_sizes[4] = { merge_partitions_size, temporary_keys_storage_size, temporary_values_storage_size, virtual_shared_memory_size}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage allocation break; } const int num_passes = static_cast(THRUST_NS_QUALIFIER::detail::log2_ri(num_tiles)); /* * The algorithm consists of stages. At each stage, there are input and output arrays. There are two pairs of * arrays allocated (keys and items). One pair is from function arguments and another from temporary storage. Ping * is a helper variable that controls which of these two pairs of arrays is an input and which is an output for a * current stage. If the ping is true - the current stage stores its result in the temporary storage. The * temporary storage acts as input data otherwise. * * Block sort is executed before the main loop. It stores its result in the pair of arrays that will be an input * of the next stage. The initial value of the ping variable is selected so that the result of the final stage is * stored in the input arrays. 
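       *
       * A worked example (not part of the original comment): with num_tiles = 8 there are
       * log2(8) = 3 merge passes, an odd number, so ping is initialized to false. The per-pass
       * values are then false, true, false; because the final pass runs with ping == false, it
       * writes its result into the caller-provided output arrays rather than into the temporary
       * storage, which is exactly the required invariant.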
*/ bool ping = num_passes % 2 == 0; auto merge_partitions = reinterpret_cast(allocations[0]); auto keys_buffer = reinterpret_cast(allocations[1]); auto items_buffer = reinterpret_cast(allocations[2]); // Invoke DeviceMergeSortBlockSortKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream) .doit( DeviceMergeSortBlockSortKernel< MaxPolicyT, KeyInputIteratorT, ValueInputIteratorT, KeyIteratorT, ValueIteratorT, OffsetT, CompareOpT, KeyT, ValueT>, ping, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, keys_buffer, items_buffer, compare_op, cub::detail::vsmem_t{allocations[3]}); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } const OffsetT num_partitions = num_tiles + 1; constexpr int threads_per_partition_block = 256; const int partition_grid_size = static_cast(cub::DivideAndRoundUp(num_partitions, threads_per_partition_block)); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } for (int pass = 0; pass < num_passes; ++pass, ping = !ping) { OffsetT target_merged_tiles_number = OffsetT(2) << pass; // Partition THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( partition_grid_size, threads_per_partition_block, 0, stream) .doit(DeviceMergeSortPartitionKernel, ping, d_output_keys, keys_buffer, num_items, num_partitions, merge_partitions, compare_op, target_merged_tiles_number, tile_size); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Merge THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( static_cast(num_tiles), static_cast(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream) .doit( DeviceMergeSortMergeKernel, ping, d_output_keys, d_output_items, num_items, keys_buffer, items_buffer, compare_op, merge_partitions, target_merged_tiles_number, cub::detail::vsmem_t{allocations[3]}); error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } } } while (0); return error; } CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream) { using MaxPolicyT = typename DispatchMergeSort::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchMergeSort dispatch( d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream, ptx_version); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& 
temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_radix_sort.cuh000066400000000000000000003250011463375617100235560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across * a sequence of data items residing within device-accessible memory. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // suppress warnings triggered by #pragma unroll: // "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation // might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]" _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wpass-failed") CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Upsweep digit-counting kernel entry point (multi-block). 
* Computes privatized digit histograms, one per block. * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam KeyT * Key type * * @tparam OffsetT * Signed integer type for global offsets * * @param[in] d_keys * Input keys buffer * * @param[out] d_spine * Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, * then 1s counts from each block, etc.) * * @param[in] num_items * Total number of input data items * * @param[in] current_bit * Bit position of current radix digit * * @param[in] num_bits * Number of bits of current radix digit * * @param[in] even_share * Even-share descriptor for mapan equal number of tiles onto each thread block */ template __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel( const KeyT* d_keys, OffsetT* d_spine, OffsetT /*num_items*/, int current_bit, int num_bits, GridEvenShare even_share, DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = cub::detail::conditional_t; using ActiveDownsweepPolicyT = cub::detail::conditional_t; enum { TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) }; // Parameterize AgentRadixSortUpsweep type for the current configuration typedef AgentRadixSortUpsweep AgentRadixSortUpsweepT; // Shared memory storage __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block even_share.template BlockInit(); AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits, decomposer); upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); CTA_SYNC(); // Write out digit counts (striped) upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); } /** * @brief Spine scan kernel entry point (single-block). * Computes an exclusive prefix sum over the privatized digit histograms * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam OffsetT * Signed integer type for global offsets * * @param[in,out] d_spine * Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, * then 1s counts from each block, etc.) * * @param[in] num_counts * Total number of bin-counts */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(OffsetT* d_spine, int num_counts) { // Parameterize the AgentScan type for the current configuration typedef AgentScan AgentScanT; // Shared memory storage __shared__ typename AgentScanT::TempStorage temp_storage; // Block scan instance AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)); // Process full input tiles int block_offset = 0; BlockScanRunningPrefixOp prefix_op(0, Sum()); while (block_offset + AgentScanT::TILE_ITEMS <= num_counts) { block_scan.template ConsumeTile(block_offset, prefix_op); block_offset += AgentScanT::TILE_ITEMS; } // Process the remaining partial tile (if any). 
if (block_offset < num_counts) { block_scan.template ConsumeTile(block_offset, prefix_op, num_counts - block_offset); } } /** * @brief Downsweep pass kernel entry point (multi-block). * Scatters keys (and values) into corresponding bins for the current digit place. * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets * * @param[in] d_keys_in * Input keys buffer * * @param[in] d_keys_out * Output keys buffer * * @param[in] d_values_in * Input values buffer * * @param[in] d_values_out * Output values buffer * * @param[in] d_spine * Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, * then 1s counts from each block, etc.) * * @param[in] num_items * Total number of input data items * * @param[in] current_bit * Bit position of current radix digit * * @param[in] num_bits * Number of bits of current radix digit * * @param[in] even_share * Even-share descriptor for mapan equal number of tiles onto each thread block */ template __launch_bounds__(int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel( const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, OffsetT* d_spine, OffsetT num_items, int current_bit, int num_bits, GridEvenShare even_share, DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = cub::detail::conditional_t; using ActiveDownsweepPolicyT = cub::detail::conditional_t; enum { TILE_ITEMS = CUB_MAX(ActiveUpsweepPolicyT::BLOCK_THREADS * ActiveUpsweepPolicyT::ITEMS_PER_THREAD, ActiveDownsweepPolicyT::BLOCK_THREADS * ActiveDownsweepPolicyT::ITEMS_PER_THREAD) }; // Parameterize AgentRadixSortDownsweep type for the current configuration typedef AgentRadixSortDownsweep AgentRadixSortDownsweepT; // Shared memory storage __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; // Initialize even-share descriptor for this thread block even_share.template BlockInit(); // Process input tiles AgentRadixSortDownsweepT( temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits, decomposer) .ProcessRegion(even_share.block_offset, even_share.block_end); } /** * @brief Single pass kernel entry point (single-block). * Fully sorts a tile of input. 
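 *
 * The dispatch layer falls back to this kernel only when the entire input fits into one tile,
 * i.e. at most BLOCK_THREADS * ITEMS_PER_THREAD keys. As an illustrative calculation based on
 * the policies later in this file, most SingleTilePolicy instances use 256 threads and 19 items
 * per thread, so inputs of up to 256 * 19 = 4864 keys are sorted by a single thread block in
 * one pass.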
* * @tparam ChainedPolicyT * Chained tuning policy * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets * * @param[in] d_keys_in * Input keys buffer * * @param[in] d_keys_out * Output keys buffer * * @param[in] d_values_in * Input values buffer * * @param[in] d_values_out * Output values buffer * * @param[in] num_items * Total number of input data items * * @param[in] current_bit * Bit position of current radix digit * * @param[in] end_bit * The past-the-end (most-significant) bit index needed for key comparison */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, OffsetT num_items, int current_bit, int end_bit, DecomposerT decomposer = {}) { // Constants enum { BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, KEYS_ONLY = std::is_same::value, }; // BlockRadixSort type typedef BlockRadixSort BlockRadixSortT; // BlockLoad type (keys) typedef BlockLoad BlockLoadKeys; // BlockLoad type (values) typedef BlockLoad BlockLoadValues; // Unsigned word for key bits using traits = detail::radix::traits_t; using bit_ordered_type = typename traits::bit_ordered_type; // Shared memory storage __shared__ union TempStorage { typename BlockRadixSortT::TempStorage sort; typename BlockLoadKeys::TempStorage load_keys; typename BlockLoadValues::TempStorage load_values; } temp_storage; // Keys and values for the block KeyT keys[ITEMS_PER_THREAD]; ValueT values[ITEMS_PER_THREAD]; // Get default (min/max) value for out-of-bounds keys bit_ordered_type default_key_bits = IS_DESCENDING ? 
traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer); KeyT default_key = reinterpret_cast(default_key_bits); // Load keys BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); CTA_SYNC(); // Load values if (!KEYS_ONLY) { // Register pressure work-around: moving num_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads num_items = ShuffleIndex(num_items, 0, 0xffffffff); BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); CTA_SYNC(); } // Sort tile BlockRadixSortT(temp_storage.sort) .SortBlockedToStriped( keys, values, current_bit, end_bit, Int2Type(), Int2Type(), decomposer); // Store keys and values #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; if (item_offset < num_items) { d_keys_out[item_offset] = keys[ITEM]; if (!KEYS_ONLY) { d_values_out[item_offset] = values[ITEM]; } } } } /** * @brief Segmented radix sorting pass (one block per segment) * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam ALT_DIGIT_BITS * Whether or not to use the alternate (lower-bits) policy * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets @iterator * * @tparam OffsetT * Signed integer type for global offsets * * @param[in] d_keys_in * Input keys buffer * * @param[in] d_keys_out * Output keys buffer * * @param[in] d_values_in * Input values buffer * * @param[in] d_values_out * Output values buffer * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length `num_segments`, * such that d_begin_offsets[i] is the first element of the ith * data segment in d_keys_* and d_values_* * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length `num_segments`, * such that d_end_offsets[i]-1 is the last element of the ith * data segment in d_keys_* and d_values_*. * If d_end_offsets[i]-1 <= d_begin_offsets[i], * the ith is considered empty. * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] current_bit * Bit position of current radix digit * * @param[in] pass_bits * Number of bits of current radix digit */ template __launch_bounds__(int((ALT_DIGIT_BITS) ? 
ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel( const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int /*num_segments*/, int current_bit, int pass_bits, DecomposerT decomposer = {}) { // // Constants // using SegmentedPolicyT = cub::detail::conditional_t; enum { BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, RADIX_BITS = SegmentedPolicyT::RADIX_BITS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_DIGITS = 1 << RADIX_BITS, KEYS_ONLY = std::is_same::value, }; // Upsweep type using BlockUpsweepT = AgentRadixSortUpsweep; // Digit-scan type using DigitScanT = BlockScan; // Downsweep type using BlockDownsweepT = AgentRadixSortDownsweep; enum { /// Number of bin-starting offsets tracked per thread BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD }; // // Process input tiles // // Shared memory storage __shared__ union { typename BlockUpsweepT::TempStorage upsweep; typename BlockDownsweepT::TempStorage downsweep; struct { volatile OffsetT reverse_counts_in[RADIX_DIGITS]; volatile OffsetT reverse_counts_out[RADIX_DIGITS]; typename DigitScanT::TempStorage scan; }; } temp_storage; OffsetT segment_begin = d_begin_offsets[blockIdx.x]; OffsetT segment_end = d_end_offsets[blockIdx.x]; OffsetT num_items = segment_end - segment_begin; // Check if empty segment if (num_items <= 0) { return; } // Upsweep BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits, decomposer); upsweep.ProcessRegion(segment_begin, segment_end); CTA_SYNC(); // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) OffsetT bin_count[BINS_TRACKED_PER_THREAD]; upsweep.ExtractCounts(bin_count); CTA_SYNC(); if (IS_DESCENDING) { // Reverse bin counts #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; } } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; } } } // Scan OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass // (valid in the first RADIX_DIGITS threads) DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { bin_offset[track] += segment_begin; } if (IS_DESCENDING) { // Reverse bin offsets #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; } } CTA_SYNC(); #pragma unroll for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) { int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_offset[track] = 
temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; } } } CTA_SYNC(); // Downsweep BlockDownsweepT downsweep( temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits, decomposer); downsweep.ProcessRegion(segment_begin, segment_end); } /****************************************************************************** * Onesweep kernels ******************************************************************************/ /** * Kernel for computing multiple histograms */ /** * Histogram kernel */ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel( OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit, DecomposerT decomposer = {}) { typedef typename ChainedPolicyT::ActivePolicy::HistogramPolicy HistogramPolicyT; typedef AgentRadixSortHistogram AgentT; __shared__ typename AgentT::TempStorage temp_storage; AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit, decomposer); agent.Process(); } template CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) DeviceRadixSortOnesweepKernel( AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, const ValueT* d_values_in, PortionOffsetT num_items, int current_bit, int num_bits, DecomposerT decomposer = {}) { typedef typename ChainedPolicyT::ActivePolicy::OnesweepPolicy OnesweepPolicyT; typedef AgentRadixSortOnesweep AgentT; __shared__ typename AgentT::TempStorage s; AgentT agent( s, d_lookback, d_ctrs, d_bins_out, d_bins_in, d_keys_out, d_keys_in, d_values_out, d_values_in, num_items, current_bit, num_bits, decomposer); agent.Process(); } /** * Exclusive sum kernel */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) { typedef typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy ExclusiveSumPolicyT; constexpr int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS; constexpr int RADIX_DIGITS = 1 << RADIX_BITS; constexpr int BLOCK_THREADS = ExclusiveSumPolicyT::BLOCK_THREADS; constexpr int BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS; typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; // load the bins OffsetT bins[BINS_PER_THREAD]; int bin_start = blockIdx.x * RADIX_DIGITS; #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = threadIdx.x * BINS_PER_THREAD + u; if (bin >= RADIX_DIGITS) { break; } bins[u] = d_bins[bin_start + bin]; } // compute offsets BlockScan(temp_storage).ExclusiveSum(bins, bins); // store the offsets #pragma unroll for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = threadIdx.x * BINS_PER_THREAD + u; if (bin >= RADIX_DIGITS) { break; } d_bins[bin_start + bin] = bins[u]; } } namespace detail { namespace radix { // default template struct sm90_small_key_tuning { static constexpr int threads = 384; static constexpr int items = 23; }; // clang-format off // keys template <> struct sm90_small_key_tuning<1, 0, 4> { static constexpr int threads = 512; static constexpr int items = 19; }; template <> struct sm90_small_key_tuning<1, 0, 8> { static constexpr int threads = 512; static constexpr int items = 19; }; template <> struct sm90_small_key_tuning<2, 0, 4> { static constexpr int threads = 512; static constexpr int items = 19; }; 
template <> struct sm90_small_key_tuning<2, 0, 8> { static constexpr int threads = 512; static constexpr int items = 19; }; // pairs 8:xx template <> struct sm90_small_key_tuning<1, 1, 4> { static constexpr int threads = 512; static constexpr int items = 15; }; template <> struct sm90_small_key_tuning<1, 1, 8> { static constexpr int threads = 448; static constexpr int items = 16; }; template <> struct sm90_small_key_tuning<1, 2, 4> { static constexpr int threads = 512; static constexpr int items = 17; }; template <> struct sm90_small_key_tuning<1, 2, 8> { static constexpr int threads = 512; static constexpr int items = 14; }; template <> struct sm90_small_key_tuning<1, 4, 4> { static constexpr int threads = 512; static constexpr int items = 17; }; template <> struct sm90_small_key_tuning<1, 4, 8> { static constexpr int threads = 512; static constexpr int items = 14; }; template <> struct sm90_small_key_tuning<1, 8, 4> { static constexpr int threads = 384; static constexpr int items = 23; }; template <> struct sm90_small_key_tuning<1, 8, 8> { static constexpr int threads = 384; static constexpr int items = 18; }; template <> struct sm90_small_key_tuning<1, 16, 4> { static constexpr int threads = 512; static constexpr int items = 22; }; template <> struct sm90_small_key_tuning<1, 16, 8> { static constexpr int threads = 512; static constexpr int items = 22; }; // pairs 16:xx template <> struct sm90_small_key_tuning<2, 1, 4> { static constexpr int threads = 384; static constexpr int items = 14; }; template <> struct sm90_small_key_tuning<2, 1, 8> { static constexpr int threads = 384; static constexpr int items = 16; }; template <> struct sm90_small_key_tuning<2, 2, 4> { static constexpr int threads = 384; static constexpr int items = 15; }; template <> struct sm90_small_key_tuning<2, 2, 8> { static constexpr int threads = 448; static constexpr int items = 16; }; template <> struct sm90_small_key_tuning<2, 4, 4> { static constexpr int threads = 512; static constexpr int items = 17; }; template <> struct sm90_small_key_tuning<2, 4, 8> { static constexpr int threads = 512; static constexpr int items = 12; }; template <> struct sm90_small_key_tuning<2, 8, 4> { static constexpr int threads = 384; static constexpr int items = 23; }; template <> struct sm90_small_key_tuning<2, 8, 8> { static constexpr int threads = 512; static constexpr int items = 23; }; template <> struct sm90_small_key_tuning<2, 16, 4> { static constexpr int threads = 512; static constexpr int items = 21; }; template <> struct sm90_small_key_tuning<2, 16, 8> { static constexpr int threads = 576; static constexpr int items = 22; }; // clang-format on } // namespace radix } // namespace detail /****************************************************************************** * Policy ******************************************************************************/ /** * @brief Tuning policy for kernel specialization * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets */ template struct DeviceRadixSortPolicy { //------------------------------------------------------------------------------ // Constants //------------------------------------------------------------------------------ // Whether this is a keys-only (or key-value) sort static constexpr bool KEYS_ONLY = std::is_same::value; // Dominant-sized key/value type using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; 
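  // Illustrative note (example types are assumptions, not from the original source): when sorting
  // uint32_t keys paired with uint64_t values, DominantT is the 8-byte ValueT, so the tuning
  // policies below are parameterized on the larger of the two types rather than on the key alone.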
//------------------------------------------------------------------------------ // Architecture-specific tuning policies //------------------------------------------------------------------------------ /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented // keys/s (K40m) ONESWEEP = false, ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy<256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy< 256, 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // Scan policy typedef AgentScanPolicy<1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; // Keys-only downsweep policies typedef AgentRadixSortDownsweepPolicy< 128, 9, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; typedef AgentRadixSortDownsweepPolicy< 64, 18, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys; // Key-value pairs downsweep policies typedef DownsweepPolicyKeys DownsweepPolicyPairs; typedef AgentRadixSortDownsweepPolicy< 128, 15, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs; // Downsweep policies using DownsweepPolicy = cub::detail::conditional_t; using AltDownsweepPolicy = cub::detail::conditional_t; // Upsweep policies using UpsweepPolicy = DownsweepPolicy; using AltUpsweepPolicy = AltDownsweepPolicy; // Single-tile policy using SingleTilePolicy = DownsweepPolicy; // Segmented policies using SegmentedPolicy = DownsweepPolicy; using AltSegmentedPolicy = AltDownsweepPolicy; }; /// SM50 struct Policy500 : ChainedPolicy<500, Policy500, Policy350> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.1B 32b segmented keys/s (TitanX) ONESWEEP = false, ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy<256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy< 256, 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy<512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy< 160, 39, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy< 256, 16, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy< 256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy< 192, 31, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy< 256, 11, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM60 (GP100) struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 10.0B 32b keys/s (GP100, 64M random keys) ONESWEEP_RADIX_BITS = 8, OFFSET_64BIT = sizeof(OffsetT) == 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy<256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy< 256, OFFSET_64BIT ? 29 : 30, DominantT, 2, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy<512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy< 256, 25, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy< 192, OFFSET_64BIT ? 
32 : 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy< 256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy< 192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy< 384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM61 (GP104) struct Policy610 : ChainedPolicy<610, Policy610, Policy600> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.3B 32b segmented keys/s (1080) ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy<256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy< 256, 30, DominantT, 2, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy<512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy< 384, 31, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy< 256, 35, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef AgentRadixSortUpsweepPolicy<128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicy; typedef AgentRadixSortUpsweepPolicy<128, 16, DominantT, LOAD_LDG, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy< 256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy< 192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy< 384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM62 (Tegra, less RF) struct Policy620 : ChainedPolicy<620, Policy620, Policy610> { enum { PRIMARY_RADIX_BITS = 5, ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), ONESWEEP_RADIX_BITS = 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy<256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy< 256, 30, DominantT, 2, 
RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy<512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy< 256, 16, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy< 256, 16, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS> AltDownsweepPolicy; // Upsweep policies typedef DownsweepPolicy UpsweepPolicy; typedef AltDownsweepPolicy AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy< 256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef DownsweepPolicy SegmentedPolicy; typedef AltDownsweepPolicy AltSegmentedPolicy; }; /// SM70 (GV100) struct Policy700 : ChainedPolicy<700, Policy700, Policy620> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 8.7B 32b segmented keys/s (GV100) ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 15.8B 32b keys/s (V100-SXM2, 64M random keys) ONESWEEP_RADIX_BITS = 8, OFFSET_64BIT = sizeof(OffsetT) == 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy<256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy< 256, sizeof(KeyT) == 4 && sizeof(ValueT) == 4 ? 46 : 23, DominantT, 4, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy<512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy< 512, 23, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy< (sizeof(KeyT) > 1) ? 256 : 128, OFFSET_64BIT ? 46 : 47, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef AgentRadixSortUpsweepPolicy<256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicy; typedef AgentRadixSortUpsweepPolicy<256, OFFSET_64BIT ? 46 : 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy< 256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy< 192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy< 384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy700> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
7 : 5, SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), ONESWEEP_RADIX_BITS = 8, OFFSET_64BIT = sizeof(OffsetT) == 8, }; // Histogram policy typedef AgentRadixSortHistogramPolicy<128, 16, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; // Onesweep policy typedef AgentRadixSortOnesweepPolicy< 384, OFFSET_64BIT && sizeof(KeyT) == 4 && !KEYS_ONLY ? 17 : 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; // ScanPolicy typedef AgentScanPolicy<512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; // Downsweep policies typedef AgentRadixSortDownsweepPolicy< 512, 23, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef AgentRadixSortDownsweepPolicy< (sizeof(KeyT) > 1) ? 256 : 128, 47, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; // Upsweep policies typedef AgentRadixSortUpsweepPolicy<256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicy; typedef AgentRadixSortUpsweepPolicy<256, 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; // Single-tile policy typedef AgentRadixSortDownsweepPolicy< 256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; // Segmented policies typedef AgentRadixSortDownsweepPolicy< 192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; typedef AgentRadixSortDownsweepPolicy< 384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; }; /// SM90 struct Policy900 : ChainedPolicy<900, Policy900, Policy800> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, ONESWEEP = true, ONESWEEP_RADIX_BITS = 8, OFFSET_64BIT = sizeof(OffsetT) == 8 ? 1 : 0, FLOAT_KEYS = std::is_same::value ? 1 : 0, }; using HistogramPolicy = AgentRadixSortHistogramPolicy<128, 16, 1, KeyT, ONESWEEP_RADIX_BITS>; using ExclusiveSumPolicy = AgentRadixSortExclusiveSumPolicy<256, ONESWEEP_RADIX_BITS>; using OnesweepPolicyKey32 = AgentRadixSortOnesweepPolicy< 384, KEYS_ONLY ? 20 - OFFSET_64BIT - FLOAT_KEYS : (sizeof(ValueT) < 8 ? (OFFSET_64BIT ? 17 : 23) : (OFFSET_64BIT ? 29 : 30)), DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS>; using OnesweepPolicyKey64 = AgentRadixSortOnesweepPolicy< 384, sizeof(ValueT) < 8 ? 
30 : 24, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS>; using OnesweepLargeKeyPolicy = // cub::detail::conditional_t; using OnesweepSmallKeyPolicySizes = // detail::radix::sm90_small_key_tuning; using OnesweepSmallKeyPolicy = AgentRadixSortOnesweepPolicy< OnesweepSmallKeyPolicySizes::threads, OnesweepSmallKeyPolicySizes::items, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_STORE_DIRECT, 8>; using OnesweepPolicy = // cub::detail::conditional_t; using ScanPolicy = AgentScanPolicy<512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE>; using DownsweepPolicy = AgentRadixSortDownsweepPolicy< 512, 23, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>; using AltDownsweepPolicy = AgentRadixSortDownsweepPolicy< (sizeof(KeyT) > 1) ? 256 : 128, 47, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>; using UpsweepPolicy = AgentRadixSortUpsweepPolicy<256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS>; using AltUpsweepPolicy = AgentRadixSortUpsweepPolicy<256, 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1>; using SingleTilePolicy = AgentRadixSortDownsweepPolicy< 256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>; using SegmentedPolicy = AgentRadixSortDownsweepPolicy< 192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>; using AltSegmentedPolicy = AgentRadixSortDownsweepPolicy< 384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>; }; using MaxPolicy = Policy900; }; /****************************************************************************** * Single-problem dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam OffsetT * Signed integer type for global offsets * * @tparam DecomposerT * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template , typename DecomposerT = detail::identity_decomposer_t> struct DispatchRadixSort : SelectedPolicy { //------------------------------------------------------------------------------ // Constants //------------------------------------------------------------------------------ // Whether this is a keys-only (or key-value) sort static constexpr bool KEYS_ONLY = std::is_same::value; //------------------------------------------------------------------------------ // Problem state //------------------------------------------------------------------------------ /// Device-accessible allocation of temporary storage. // When NULL, the required allocation size is written to `temp_storage_bytes` and no work is // done. 
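/// (Illustrative note, not part of the original documentation:) callers typically invoke this
/// dispatch twice — once with a null `d_temp_storage` to obtain `temp_storage_bytes`, then again
/// with a device allocation of that size to actually perform the sort.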
void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t& temp_storage_bytes; /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is /// updated to point to the sorted output keys DoubleBuffer& d_keys; /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is /// updated to point to the sorted output values DoubleBuffer& d_values; /// Number of items to sort OffsetT num_items; /// The beginning (least-significant) bit index needed for key comparison int begin_bit; /// The past-the-end (most-significant) bit index needed for key comparison int end_bit; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; /// PTX version int ptx_version; /// Whether is okay to overwrite source buffers bool is_overwrite_okay; DecomposerT decomposer; //------------------------------------------------------------------------------ // Constructor //------------------------------------------------------------------------------ /// Constructor CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchRadixSort( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, int ptx_version, DecomposerT decomposer = {}) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , begin_bit(begin_bit) , end_bit(end_bit) , stream(stream) , ptx_version(ptx_version) , is_overwrite_okay(is_overwrite_okay) , decomposer(decomposer) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchRadixSort( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , begin_bit(begin_bit) , end_bit(end_bit) , stream(stream) , ptx_version(ptx_version) , is_overwrite_okay(is_overwrite_okay) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //------------------------------------------------------------------------------ // Small-problem (single tile) invocation //------------------------------------------------------------------------------ /** * @brief Invoke a single block to sort in-core * * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam SingleTileKernelT * Function type of cub::DeviceRadixSortSingleTileKernel * * @param[in] single_tile_kernel * Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel */ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokeSingleTile(SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do { // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) { temp_storage_bytes = 1; break; } // Log single_tile_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit " "%d, bit_grain %d\n", 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); #endif // 
Invoke upsweep_kernel with same grid size as downsweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) .doit(single_tile_kernel, d_keys.Current(), d_keys.Alternate(), d_values.Current(), d_values.Alternate(), num_items, begin_bit, end_bit, decomposer); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Update selector d_keys.selector ^= 1; d_values.selector ^= 1; } while (0); return error; } //------------------------------------------------------------------------------ // Normal problem size invocation //------------------------------------------------------------------------------ /** * Invoke a three-kernel sorting pass at the current bit. */ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokePass( const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, OffsetT* d_spine, int /*spine_length*/, int& current_bit, PassConfigT& pass_config) { cudaError error = cudaSuccess; do { int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log upsweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, " "bit_grain %d\n", pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); #endif // Spine length written by the upsweep kernel in the current pass. 
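      // (Added explanatory note: the spine holds one digit counter per downsweep block per radix
      // digit, so, for example, a 768-block grid with 7-bit digits writes 768 * 128 = 98304
      // counters in this pass.)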
int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits; // Invoke upsweep_kernel with same grid size as downsweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream) .doit(pass_config.upsweep_kernel, d_keys_in, d_spine, num_items, current_bit, pass_bits, pass_config.even_share, decomposer); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Log scan_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); #endif // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, pass_config.scan_config.block_threads, 0, stream) .doit(pass_config.scan_kernel, d_spine, pass_spine_length); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Log downsweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); #endif // Invoke downsweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream) .doit(pass_config.downsweep_kernel, d_keys_in, d_keys_out, d_values_in, d_values_out, d_spine, num_items, current_bit, pass_bits, pass_config.even_share, decomposer); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Update current bit current_bit += pass_bits; } while (0); return error; } /// Pass configuration structure template struct PassConfig { UpsweepKernelT upsweep_kernel; KernelConfig upsweep_config; ScanKernelT scan_kernel; KernelConfig scan_config; DownsweepKernelT downsweep_kernel; KernelConfig downsweep_config; int radix_bits; int radix_digits; int max_downsweep_grid_size; GridEvenShare even_share; /// Initialize pass configuration template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InitPassConfig( UpsweepKernelT upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, int /*ptx_version*/, int sm_count, OffsetT num_items) { cudaError error = cudaSuccess; do { this->upsweep_kernel = upsweep_kernel; this->scan_kernel = scan_kernel; this->downsweep_kernel = downsweep_kernel; radix_bits = DownsweepPolicyT::RADIX_BITS; radix_digits = 1 << radix_bits; error = CubDebug(upsweep_config.Init(upsweep_kernel)); if (cudaSuccess != error) { break; } error = CubDebug(scan_config.Init(scan_kernel)); if (cudaSuccess != error) { break; } error = CubDebug(downsweep_config.Init(downsweep_kernel)); if (cudaSuccess != error) { break; } max_downsweep_grid_size = 
(downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0); even_share.DispatchInit( num_items, max_downsweep_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); } while (0); return error; } }; template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeOnesweep() { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; // PortionOffsetT is used for offsets within a portion, and must be signed. typedef int PortionOffsetT; typedef PortionOffsetT AtomicOffsetT; // compute temporary storage size constexpr int RADIX_BITS = ActivePolicyT::ONESWEEP_RADIX_BITS; constexpr int RADIX_DIGITS = 1 << RADIX_BITS; constexpr int ONESWEEP_ITEMS_PER_THREAD = ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD; constexpr int ONESWEEP_BLOCK_THREADS = ActivePolicyT::OnesweepPolicy::BLOCK_THREADS; constexpr int ONESWEEP_TILE_ITEMS = ONESWEEP_ITEMS_PER_THREAD * ONESWEEP_BLOCK_THREADS; // portions handle inputs with >=2**30 elements, due to the way lookback works // for testing purposes, one portion is <= 2**28 elements constexpr PortionOffsetT PORTION_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; int num_passes = cub::DivideAndRoundUp(end_bit - begin_bit, RADIX_BITS); OffsetT num_portions = static_cast(cub::DivideAndRoundUp(num_items, PORTION_SIZE)); PortionOffsetT max_num_blocks = cub::DivideAndRoundUp( static_cast(CUB_MIN(num_items, static_cast(PORTION_SIZE))), ONESWEEP_TILE_ITEMS); size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT); size_t allocation_sizes[] = { // bins num_portions * num_passes * RADIX_DIGITS * sizeof(OffsetT), // lookback max_num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), // extra key buffer is_overwrite_okay || num_passes <= 1 ? 0 : num_items * sizeof(KeyT), // extra value buffer is_overwrite_okay || num_passes <= 1 ? 
0 : num_items * value_size, // counters num_portions * num_passes * sizeof(AtomicOffsetT), }; constexpr int NUM_ALLOCATIONS = sizeof(allocation_sizes) / sizeof(allocation_sizes[0]); void* allocations[NUM_ALLOCATIONS] = {}; AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); // just return if no temporary storage is provided cudaError_t error = cudaSuccess; if (d_temp_storage == NULL) { return error; } OffsetT* d_bins = (OffsetT*) allocations[0]; AtomicOffsetT* d_lookback = (AtomicOffsetT*) allocations[1]; KeyT* d_keys_tmp2 = (KeyT*) allocations[2]; ValueT* d_values_tmp2 = (ValueT*) allocations[3]; AtomicOffsetT* d_ctrs = (AtomicOffsetT*) allocations[4]; do { // initialization error = CubDebug(cudaMemsetAsync(d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream)); if (cudaSuccess != error) { break; } // compute num_passes histograms with RADIX_DIGITS bins each error = CubDebug(cudaMemsetAsync(d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream)); if (cudaSuccess != error) { break; } int device = -1; int num_sms = 0; error = CubDebug(cudaGetDevice(&device)); if (cudaSuccess != error) { break; } error = CubDebug(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); if (cudaSuccess != error) { break; } constexpr int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; int histo_blocks_per_sm = 1; auto histogram_kernel = DeviceRadixSortHistogramKernel; error = CubDebug( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0)); if (cudaSuccess != error) { break; } // log histogram_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, " "%d SM occupancy, bit_grain %d\n", histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, reinterpret_cast(stream), ActivePolicyT::HistogramPolicy::ITEMS_PER_THREAD, histo_blocks_per_sm, ActivePolicyT::HistogramPolicy::RADIX_BITS); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream) .doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit, decomposer); error = CubDebug(error); if (cudaSuccess != error) { break; } error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // exclusive sums to determine starts constexpr int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; // log exclusive_sum_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n", num_passes, SCAN_BLOCK_THREADS, reinterpret_cast(stream), ActivePolicyT::ExclusiveSumPolicy::RADIX_BITS); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream) .doit(DeviceRadixSortExclusiveSumKernel, d_bins); error = CubDebug(error); if (cudaSuccess != error) { break; } error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // use the other buffer if no overwrite is allowed KeyT* d_keys_tmp = d_keys.Alternate(); ValueT* d_values_tmp = d_values.Alternate(); if (!is_overwrite_okay && num_passes % 2 == 0) { d_keys.d_buffers[1] = d_keys_tmp2; d_values.d_buffers[1] = d_values_tmp2; } for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; current_bit += RADIX_BITS, ++pass) { int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS); for (OffsetT portion = 0; 
portion < num_portions; ++portion) { PortionOffsetT portion_num_items = static_cast( CUB_MIN(num_items - portion * PORTION_SIZE, static_cast(PORTION_SIZE))); PortionOffsetT num_blocks = cub::DivideAndRoundUp(portion_num_items, ONESWEEP_TILE_ITEMS); error = CubDebug(cudaMemsetAsync(d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), stream)); if (cudaSuccess != error) { break; } // log onesweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, " "current bit %d, bit_grain %d, portion %d/%d\n", num_blocks, ONESWEEP_BLOCK_THREADS, reinterpret_cast(stream), ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD, current_bit, num_bits, static_cast(portion), static_cast(num_portions)); #endif auto onesweep_kernel = DeviceRadixSortOnesweepKernel< MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT, AtomicOffsetT, DecomposerT>; error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream) .doit(onesweep_kernel, d_lookback, d_ctrs + portion * num_passes + pass, portion < num_portions - 1 ? d_bins + ((portion + 1) * num_passes + pass) * RADIX_DIGITS : NULL, d_bins + (portion * num_passes + pass) * RADIX_DIGITS, d_keys.Alternate(), d_keys.Current() + portion * PORTION_SIZE, d_values.Alternate(), d_values.Current() + portion * PORTION_SIZE, portion_num_items, current_bit, num_bits, decomposer); error = CubDebug(error); if (cudaSuccess != error) { break; } error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } if (error != cudaSuccess) { break; } // use the temporary buffers if no overwrite is allowed if (!is_overwrite_okay && pass == 0) { d_keys = num_passes % 2 == 0 ? DoubleBuffer(d_keys_tmp, d_keys_tmp2) : DoubleBuffer(d_keys_tmp2, d_keys_tmp); d_values = num_passes % 2 == 0 ? 
DoubleBuffer(d_values_tmp, d_values_tmp2) : DoubleBuffer(d_values_tmp2, d_values_tmp); } d_keys.selector ^= 1; d_values.selector ^= 1; } } while (0); return error; } /** * @brief Invocation (run multiple digit passes) * * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam UpsweepKernelT * Function type of cub::DeviceRadixSortUpsweepKernel * * @tparam ScanKernelT * Function type of cub::SpineScanKernel * * @tparam DownsweepKernelT * Function type of cub::DeviceRadixSortDownsweepKernel * * @param[in] upsweep_kernel * Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel * * @param[in] alt_upsweep_kernel * Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel * * @param[in] scan_kernel * Kernel function pointer to parameterization of cub::SpineScanKernel * * @param[in] downsweep_kernel * Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel * * @param[in] alt_downsweep_kernel * Alternate kernel function pointer to parameterization of * cub::DeviceRadixSortDownsweepKernel */ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokePasses( UpsweepKernelT upsweep_kernel, UpsweepKernelT alt_upsweep_kernel, ScanKernelT scan_kernel, DownsweepKernelT downsweep_kernel, DownsweepKernelT alt_downsweep_kernel) { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Get SM count int sm_count; error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); if (cudaSuccess != error) { break; } // Init regular and alternate-digit kernel configurations PassConfig pass_config, alt_pass_config; error = pass_config.template InitPassConfig( upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items); if (error) { break; } error = alt_pass_config.template InitPassConfig( alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items); if (error) { break; } // Get maximum spine length int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; // Temporary storage allocation requirements void* allocations[3] = {}; size_t allocation_sizes[3] = { // bytes needed for privatized block digit histograms spine_length * sizeof(OffsetT), // bytes needed for 3rd keys buffer (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd values buffer (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) { return cudaSuccess; } // Pass planning. 
Run passes of the alternate digit-size configuration until we have an even multiple of our // preferred digit size int num_bits = end_bit - begin_bit; int num_passes = cub::DivideAndRoundUp(num_bits, pass_config.radix_bits); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); // Alias the temporary storage allocations OffsetT* d_spine = static_cast(allocations[0]); DoubleBuffer d_keys_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[1]), (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_keys.Alternate()); DoubleBuffer d_values_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[2]), (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[2]) : d_values.Alternate()); // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; error = CubDebug(InvokePass( d_keys.Current(), d_keys_remaining_passes.Current(), d_values.Current(), d_values_remaining_passes.Current(), d_spine, spine_length, current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); if (cudaSuccess != error) { break; } // Run remaining passes while (current_bit < end_bit) { error = CubDebug(InvokePass( d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], d_spine, spine_length, current_bit, (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config)); if (cudaSuccess != error) { break; } // Invert selectors d_keys_remaining_passes.selector ^= 1; d_values_remaining_passes.selector ^= 1; } // Update selector if (!is_overwrite_okay) { num_passes = 1; // Sorted data always ends up in the other vector } d_keys.selector = (d_keys.selector + num_passes) & 1; d_values.selector = (d_values.selector + num_passes) & 1; } while (0); return error; } //------------------------------------------------------------------------------ // Chained policy invocation //------------------------------------------------------------------------------ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeManyTiles(Int2Type) { // Invoke upsweep-downsweep typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; return InvokePasses( DeviceRadixSortUpsweepKernel, DeviceRadixSortUpsweepKernel, RadixSortScanBinsKernel, DeviceRadixSortDownsweepKernel, DeviceRadixSortDownsweepKernel); } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeManyTiles(Int2Type) { // Invoke onesweep return InvokeOnesweep(); } CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokeCopy() { // is_overwrite_okay == false here // Return the number of temporary bytes if requested if (d_temp_storage == nullptr) { temp_storage_bytes = 1; return cudaSuccess; } // Copy keys #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long) num_items, (long long) stream); #endif cudaError_t error = cudaSuccess; error = CubDebug( cudaMemcpyAsync(d_keys.Alternate(), d_keys.Current(), num_items * sizeof(KeyT), cudaMemcpyDefault, stream)); if (cudaSuccess != error) { return error; } error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } d_keys.selector ^= 1; // Copy values if necessary if (!KEYS_ONLY) { #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long) num_items, (long long) stream); #endif error = CubDebug(cudaMemcpyAsync( d_values.Alternate(), d_values.Current(), num_items * sizeof(ValueT), cudaMemcpyDefault, stream)); if (cudaSuccess != error) { return error; } error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } } d_values.selector ^= 1; return error; } /// Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; // Return if empty problem, or if no bits to sort and double-buffering is used if (num_items == 0 || (begin_bit == end_bit && is_overwrite_okay)) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; } return cudaSuccess; } // Check if simple copy suffices (is_overwrite_okay == false at this point) if (begin_bit == end_bit) { bool has_uva = false; cudaError_t error = detail::HasUVA(has_uva); if (error != cudaSuccess) { return error; } if (has_uva) { return InvokeCopy(); } } // Force kernel code-generation in all compiler passes if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) { // Small, single tile size return InvokeSingleTile( DeviceRadixSortSingleTileKernel); } else { // Regular size return InvokeManyTiles(Int2Type()); } } //------------------------------------------------------------------------------ // Dispatch entrypoints //------------------------------------------------------------------------------ /** * @brief Internal dispatch routine * * 
@param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required * allocation size is written to `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Double-buffer whose current buffer contains the unsorted input keys and, * upon return, is updated to point to the sorted output keys * * @param[in,out] d_values * Double-buffer whose current buffer contains the unsorted input values and, * upon return, is updated to point to the sorted output values * * @param[in] num_items * Number of items to sort * * @param[in] begin_bit * The beginning (least-significant) bit index needed for key comparison * * @param[in] end_bit * The past-the-end (most-significant) bit index needed for key comparison * * @param[in] is_overwrite_okay * Whether is okay to overwrite source buffers * * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, DecomposerT decomposer = {}) { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; cudaError_t error; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchRadixSort dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream, ptx_version, decomposer); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream); } }; /****************************************************************************** * Segmented dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for segmented device-wide * radix sort * * @tparam IS_DESCENDING * Whether or not the sorted-order is high-to-low * * @tparam KeyT * Key type * * @tparam ValueT * Value type * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets @iterator * * @tparam OffsetT * Signed integer type for global offsets */ template , typename DecomposerT = detail::identity_decomposer_t> struct DispatchSegmentedRadixSort : SelectedPolicy { //------------------------------------------------------------------------------ // Constants //------------------------------------------------------------------------------ // Whether this is a keys-only (or key-value) sort static constexpr bool KEYS_ONLY = std::is_same::value; 
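  // Note (added for clarity): the keys-only entry points dispatch with ValueT = cub::NullType
  // (e.g. DeviceSegmentedRadixSort::SortKeys), so KEYS_ONLY selects the code paths that skip all
  // value loads and stores below.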
//------------------------------------------------------------------------------ // Parameter members //------------------------------------------------------------------------------ /// Device-accessible allocation of temporary storage. When NULL, the required allocation size /// is written to `temp_storage_bytes` and no work is done. void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t& temp_storage_bytes; /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is /// updated to point to the sorted output keys DoubleBuffer& d_keys; /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is /// updated to point to the sorted output values DoubleBuffer& d_values; /// Number of items to sort OffsetT num_items; /// The number of segments that comprise the sorting data OffsetT num_segments; /// Random-access input iterator to the sequence of beginning offsets of length `num_segments`, /// such that d_begin_offsets[i] is the first element of the ith /// data segment in d_keys_* and d_values_* BeginOffsetIteratorT d_begin_offsets; /// Random-access input iterator to the sequence of ending offsets of length `num_segments`, /// such that d_end_offsets[i]-1 is the last element of the ith /// data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 /// <= d_begin_offsets[i], the ith is considered empty. EndOffsetIteratorT d_end_offsets; /// The beginning (least-significant) bit index needed for key comparison int begin_bit; /// The past-the-end (most-significant) bit index needed for key comparison int end_bit; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; /// PTX version int ptx_version; /// Whether is okay to overwrite source buffers bool is_overwrite_okay; DecomposerT decomposer; //------------------------------------------------------------------------------ // Constructors //------------------------------------------------------------------------------ /// Constructor CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedRadixSort( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, int ptx_version, DecomposerT decomposer = {}) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , begin_bit(begin_bit) , end_bit(end_bit) , stream(stream) , ptx_version(ptx_version) , is_overwrite_okay(is_overwrite_okay) , decomposer(decomposer) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedRadixSort( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , begin_bit(begin_bit) , end_bit(end_bit) , stream(stream) , 
ptx_version(ptx_version) , is_overwrite_okay(is_overwrite_okay) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //------------------------------------------------------------------------------ // Multi-segment invocation //------------------------------------------------------------------------------ /// Invoke a three-kernel sorting pass at the current bit. template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t InvokePass( const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int& current_bit, PassConfigT& pass_config) { cudaError error = cudaSuccess; do { int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); // Log kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking segmented_kernels<<<%lld, %lld, 0, %lld>>>(), " "%lld items per thread, %lld SM occupancy, " "current bit %d, bit_grain %d\n", (long long) num_segments, (long long) pass_config.segmented_config.block_threads, (long long) stream, (long long) pass_config.segmented_config.items_per_thread, (long long) pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_segments, pass_config.segmented_config.block_threads, 0, stream) .doit(pass_config.segmented_kernel, d_keys_in, d_keys_out, d_values_in, d_values_out, d_begin_offsets, d_end_offsets, num_segments, current_bit, pass_bits, decomposer); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Update current bit current_bit += pass_bits; } while (0); return error; } /// PassConfig data structure template struct PassConfig { SegmentedKernelT segmented_kernel; KernelConfig segmented_config; int radix_bits; int radix_digits; /// Initialize pass configuration template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) { this->segmented_kernel = segmented_kernel; this->radix_bits = SegmentedPolicyT::RADIX_BITS; this->radix_digits = 1 << radix_bits; return CubDebug(segmented_config.Init(segmented_kernel)); } }; /** * @brief Invocation (run multiple digit passes) * * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam SegmentedKernelT * Function type of cub::DeviceSegmentedRadixSortKernel * * @param[in] segmented_kernel * Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel * * @param[in] alt_segmented_kernel * Alternate kernel function pointer to parameterization of * cub::DeviceSegmentedRadixSortKernel */ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel) { cudaError error = cudaSuccess; do { // Init regular and alternate kernel configurations PassConfig pass_config, alt_pass_config; if ((error = pass_config.template InitPassConfig(segmented_kernel))) { break; } if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) { break; } // Temporary storage allocation requirements void* allocations[2] = {}; size_t allocation_sizes[2] = { // bytes needed for 3rd keys buffer (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd values buffer (is_overwrite_okay || (KEYS_ONLY)) ? 
0 : num_items * sizeof(ValueT), }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) { if (temp_storage_bytes == 0) { temp_storage_bytes = 1; } return cudaSuccess; } // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our // preferred digit size int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; int num_bits = end_bit - begin_bit; int num_passes = CUB_MAX(DivideAndRoundUp(num_bits, radix_bits), 1); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); DoubleBuffer d_keys_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); DoubleBuffer d_values_remaining_passes( (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; error = CubDebug(InvokePass( d_keys.Current(), d_keys_remaining_passes.Current(), d_values.Current(), d_values_remaining_passes.Current(), current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); if (cudaSuccess != error) { break; } // Run remaining passes while (current_bit < end_bit) { error = CubDebug(InvokePass( d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], current_bit, (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config)); if (cudaSuccess != error) { break; } // Invert selectors and update current bit d_keys_remaining_passes.selector ^= 1; d_values_remaining_passes.selector ^= 1; } // Update selector if (!is_overwrite_okay) { num_passes = 1; // Sorted data always ends up in the other vector } d_keys.selector = (d_keys.selector + num_passes) & 1; d_values.selector = (d_values.selector + num_passes) & 1; } while (0); return error; } //------------------------------------------------------------------------------ // Chained policy invocation //------------------------------------------------------------------------------ /// Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; // Return if empty problem, or if no bits to sort and double-buffering is used if (num_items == 0 || num_segments == 0 || (begin_bit == end_bit && is_overwrite_okay)) { if (d_temp_storage == nullptr) { temp_storage_bytes = 1; } return cudaSuccess; } // Force kernel code-generation in all compiler passes return InvokePasses( DeviceSegmentedRadixSortKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, DecomposerT>, DeviceSegmentedRadixSortKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, DecomposerT>); } //------------------------------------------------------------------------------ // Dispatch entrypoints //------------------------------------------------------------------------------ /** * @brief Internal dispatch routine * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required allocation size * is written to `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Double-buffer whose current buffer contains the unsorted input keys and, upon return, is * updated to point to the sorted output keys * * @param[in,out] d_values * Double-buffer whose current buffer contains the unsorted input values and, upon return, is * updated to point to the sorted output values * * @param[in] num_items * Number of items to sort * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length * `num_segments`, such that d_begin_offsets[i] is the first element of the * ith data segment in d_keys_* and d_values_* * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length `num_segments`, * such that d_end_offsets[i]-1 is the last element of the ith * data segment in d_keys_* and d_values_*. * If d_end_offsets[i]-1 <= d_begin_offsets[i], * the ith is considered empty. * * @param[in] begin_bit * The beginning (least-significant) bit index needed for key comparison * * @param[in] end_bit * The past-the-end (most-significant) bit index needed for key comparison * * @param[in] is_overwrite_okay * Whether is okay to overwrite source buffers * * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. 
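 *
 * A minimal two-phase usage sketch (illustrative only; the buffer and offset names are
 * hypothetical placeholders, not symbols defined in this file):
 *
 * @code
 * void*  d_temp_storage     = nullptr;
 * size_t temp_storage_bytes = 0;
 * // Phase 1: query the required temporary storage size (no sorting work is done)
 * Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments,
 *          d_begin_offsets, d_end_offsets, begin_bit, end_bit, is_overwrite_okay, stream);
 * // Phase 2: allocate that much device memory and run the sort
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments,
 *          d_begin_offsets, d_end_offsets, begin_bit, end_bit, is_overwrite_okay, stream);
 * @endcode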
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream) { typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; cudaError_t error; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchSegmentedRadixSort dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, is_overwrite_okay, stream, ptx_version); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, int num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, begin_bit, end_bit, is_overwrite_okay, stream); } }; CUB_NAMESPACE_END _CCCL_DIAG_POP cccl-2.5.0/cub/cub/device/dispatch/dispatch_reduce.cuh000066400000000000000000001271771463375617100226650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file cub::DeviceReduce provides device-wide, parallel operations for * computing a reduction across a sequence of data items residing within * device-accessible memory. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include _CCCL_SUPPRESS_DEPRECATED_PUSH #include _CCCL_SUPPRESS_DEPRECATED_POP #include CUB_NAMESPACE_BEGIN namespace detail { namespace reduce { /** * All cub::DeviceReduce::* algorithms are using the same implementation. Some of them, however, * should use initial value only for empty problems. If this struct is used as initial value with * one of the `DeviceReduce` algorithms, the `init` value wrapped by this struct will only be used * for empty problems; it will not be incorporated into the aggregate of non-empty problems. */ template struct empty_problem_init_t { T init; _CCCL_HOST_DEVICE operator T() const { return init; } }; /** * @brief Applies initial value to the block aggregate and stores the result to the output iterator. * * @param d_out Iterator to the output aggregate * @param reduction_op Binary reduction functor * @param init Initial value * @param block_aggregate Aggregate value computed by the block */ template _CCCL_HOST_DEVICE void finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, InitT init, AccumT block_aggregate) { *d_out = reduction_op(init, block_aggregate); } /** * @brief Ignores initial value and stores the block aggregate to the output iterator. * * @param d_out Iterator to the output aggregate * @param block_aggregate Aggregate value computed by the block */ template _CCCL_HOST_DEVICE void finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) { *d_out = block_aggregate; } } // namespace reduce } // namespace detail /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Reduce region kernel entry point (multi-block). Computes privatized * reductions, one per thread block. 
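 *
 * (Clarifying note added here:) each thread block writes one partial aggregate to
 * d_out[blockIdx.x]; the dispatch layer then runs DeviceReduceSingleTileKernel over those
 * partials to produce the final result, folding in the user-supplied initial value only in
 * that second pass.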
* * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type * * @tparam AccumT * Accumulator type * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input data items * * @param[in] even_share * Even-share descriptor for mapping an equal number of tiles onto each * thread block * * @param[in] reduction_op * Binary reduction functor */ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceReduceKernel( InputIteratorT d_in, AccumT* d_out, OffsetT num_items, GridEvenShare even_share, ReductionOpT reduction_op, TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; // Consume input tiles AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeTiles(even_share); // Output result if (threadIdx.x == 0) { detail::uninitialized_copy(d_out + blockIdx.x, block_aggregate); } } /** * @brief Reduce a single tile kernel entry point (single-block). Can be used * to aggregate privatized thread block reductions from a previous * multi-block reduction pass. * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type * * @tparam AccumT * Accumulator type * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input data items * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction */ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__( int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) void DeviceReduceSingleTileKernel(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, TransformOpT transform_op) { // Thread block type for reducing input tiles using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; // Check if empty problem if (num_items == 0) { if (threadIdx.x == 0) { *d_out = init; } return; } // Consume input tiles AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op, transform_op).ConsumeRange(OffsetT(0), num_items); // Output result if (threadIdx.x == 0) { detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate); } } /// Normalize input iterator to segment offset template _CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput(T& /*val*/, OffsetT /*base_offset*/, IteratorT /*itr*/) {} /// Normalize input iterator to segment offset (specialized for arg-index) template _CCCL_DEVICE _CCCL_FORCEINLINE void 
NormalizeReductionOutput( KeyValuePairT& val, OffsetT base_offset, ArgIndexInputIterator /*itr*/) { val.key -= base_offset; } /** * Segmented reduction (one block per segment) * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate @iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets * @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets * @iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first element * of the *i*th data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction */ template CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceSegmentedReduceKernel( InputIteratorT d_in, OutputIteratorT d_out, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int /*num_segments*/, ReductionOpT reduction_op, InitT init) { // Thread block type for reducing input tiles using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; OffsetT segment_begin = d_begin_offsets[blockIdx.x]; OffsetT segment_end = d_end_offsets[blockIdx.x]; // Check if empty problem if (segment_begin == segment_end) { if (threadIdx.x == 0) { *(d_out + blockIdx.x) = init; } return; } // Consume input tiles AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end); // Normalize as needed NormalizeReductionOutput(block_aggregate, segment_begin, d_in); if (threadIdx.x == 0) { detail::reduce::finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate); } } /****************************************************************************** * Policy ******************************************************************************/ /** * @tparam AccumT * Accumulator data type * * OffsetT * Signed integer type for global offsets * * ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` */ template struct DeviceReducePolicy { //--------------------------------------------------------------------------- // Architecture-specific tuning policies //--------------------------------------------------------------------------- /// SM30 struct Policy300 : ChainedPolicy<300, Policy300, Policy300> { static constexpr int threads_per_block = 256; 
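    // (Explanatory note added here:) together with items_per_thread below, each reduction tile
    // spans threads_per_block * items_per_thread = 256 * 20 = 5120 input items for this policy.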
static constexpr int items_per_thread = 20; static constexpr int items_per_vec_load = 2; // ReducePolicy (GTX670: 154.0 @ 48M 4B items) using ReducePolicy = AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 20; static constexpr int items_per_vec_load = 4; // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B // items) using ReducePolicy = AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; /// SM60 struct Policy600 : ChainedPolicy<600, Policy600, Policy350> { static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; static constexpr int items_per_vec_load = 4; // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) using ReducePolicy = AgentReducePolicy; // SingleTilePolicy using SingleTilePolicy = ReducePolicy; // SegmentedReducePolicy using SegmentedReducePolicy = ReducePolicy; }; using MaxPolicy = Policy600; }; /****************************************************************************** * Single-problem dispatch *****************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * device-wide reduction * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type */ template >, typename AccumT = detail::accumulator_t>, typename SelectedPolicy = DeviceReducePolicy, typename TransformOpT = ::cuda::std::__identity> struct DispatchReduce : SelectedPolicy { //--------------------------------------------------------------------------- // Problem state //--------------------------------------------------------------------------- /// Device-accessible allocation of temporary storage. When `nullptr`, the /// required allocation size is written to `temp_storage_bytes` and no work /// is done. void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t& temp_storage_bytes; /// Pointer to the input sequence of data items InputIteratorT d_in; /// Pointer to the output aggregate OutputIteratorT d_out; /// Total number of input items (i.e., length of `d_in`) OffsetT num_items; /// Binary reduction functor ReductionOpT reduction_op; /// The initial value of the reduction InitT init; /// CUDA stream to launch kernels within. Default is stream0. 
cudaStream_t stream; int ptx_version; TransformOpT transform_op; //--------------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------------- /// Constructor CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream, int ptx_version, TransformOpT transform_op = {}) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_items(num_items) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) , transform_op(transform_op) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_items(num_items) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //--------------------------------------------------------------------------- // Small-problem (single tile) invocation //--------------------------------------------------------------------------- /** * @brief Invoke a single block block to reduce in-core * * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam SingleTileKernelT * Function type of cub::DeviceReduceSingleTileKernel * * @param[in] single_tile_kernel * Kernel function pointer to parameterization of * cub::DeviceReduceSingleTileKernel */ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokeSingleTile(SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do { // Return if the caller is simply requesting the size of the storage // allocation if (d_temp_storage == NULL) { temp_storage_bytes = 1; break; } // Log single_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); #endif // Invoke single_reduce_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init, transform_op); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } //--------------------------------------------------------------------------- // Normal problem size invocation (two-pass) //--------------------------------------------------------------------------- /** * @brief Invoke two-passes to reduce * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam ReduceKernelT * Function type of cub::DeviceReduceKernel * * @tparam SingleTileKernelT * Function type of cub::DeviceReduceSingleTileKernel * * @param[in] reduce_kernel * Kernel function pointer to 
parameterization of cub::DeviceReduceKernel * * @param[in] single_tile_kernel * Kernel function pointer to parameterization of * cub::DeviceReduceSingleTileKernel */ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokePasses(ReduceKernelT reduce_kernel, SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Get SM count int sm_count; error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); if (cudaSuccess != error) { break; } // Init regular kernel configuration KernelConfig reduce_config; error = CubDebug(reduce_config.Init(reduce_kernel)); if (cudaSuccess != error) { break; } int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; // Even-share work distribution int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0); GridEvenShare even_share; even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); // Temporary storage allocation requirements void* allocations[1] = {}; size_t allocation_sizes[1] = { max_blocks * sizeof(AccumT) // bytes needed for privatized block // reductions }; // Alias the temporary allocations from the single storage blob (or // compute the necessary size of the blob) error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage // allocation return cudaSuccess; } // Alias the allocation for the privatized per-block reductions AccumT* d_block_reductions = (AccumT*) allocations[0]; // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = even_share.grid_size; // Log device_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, reduce_config.sm_occupancy); #endif // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream) .doit(reduce_kernel, d_in, d_block_reductions, num_items, even_share, reduction_op, transform_op); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Log single_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); #endif // Invoke DeviceReduceSingleTileKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) .doit(single_tile_kernel, d_block_reductions, d_out, reduce_grid_size, // triple_chevron is not type safe, make sure to use int reduction_op, init, ::cuda::std::__identity{}); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = 
CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } //--------------------------------------------------------------------------- // Chained policy invocation //--------------------------------------------------------------------------- /// Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; typedef typename DispatchReduce::MaxPolicy MaxPolicyT; // Force kernel code-generation in all compiler passes if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) { // Small, single tile size return InvokeSingleTile( DeviceReduceSingleTileKernel); } else { // Regular size return InvokePasses( DeviceReduceKernel, DeviceReduceSingleTileKernel); } } //--------------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------------- /** * @brief Internal dispatch routine for computing a device-wide reduction * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
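   *
   * @par
   * This dispatch routine backs the public cub::DeviceReduce entry points. A
   * minimal usage sketch of the two-call protocol described above, where `d_in`,
   * `d_out` and `num_items` stand for caller-provided device-accessible data and
   * `INT_MAX` (from `<climits>`) serves as the identity for the min-reduction:
   *
   * @code
   * void*  d_temp_storage     = nullptr;
   * size_t temp_storage_bytes = 0;
   *
   * // First call: d_temp_storage is nullptr, so only temp_storage_bytes is written.
   * cub::DeviceReduce::Reduce(
   *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Min(), INT_MAX);
   *
   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
   *
   * // Second call: launches DeviceReduceKernel followed by
   * // DeviceReduceSingleTileKernel (or only the single-tile kernel when the
   * // input fits in one tile).
   * cub::DeviceReduce::Reduce(
   *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Min(), INT_MAX);
   * @endcode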
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream, TransformOpT transform_op = {}) { typedef typename DispatchReduce::MaxPolicy MaxPolicyT; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchReduce dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream, ptx_version, transform_op); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, stream); } }; /** * @brief Utility class for dispatching the appropriately-tuned kernels for * device-wide transpose reduce * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate @iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam TransformOpT * Unary transform functor type having member * `auto operator()(const T &a)` * * @tparam InitT * Initial value type */ template >>, typename SelectedPolicyT = DeviceReducePolicy> using DispatchTransformReduce = DispatchReduce; /****************************************************************************** * Segmented dispatch *****************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * device-wide reduction * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate @iterator * * @tparam BeginOffsetIteratorT * Random-access input iterator type for reading segment beginning offsets * @iterator * * @tparam EndOffsetIteratorT * Random-access input iterator type for reading segment ending offsets * @iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitT * value type */ template >, typename AccumT = detail::accumulator_t>, typename SelectedPolicy = DeviceReducePolicy> struct DispatchSegmentedReduce : SelectedPolicy { //--------------------------------------------------------------------------- // Problem state //--------------------------------------------------------------------------- /// Device-accessible allocation of temporary storage. When `nullptr`, the /// required allocation size is written to `temp_storage_bytes` and no work /// is done. 
void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t& temp_storage_bytes; /// Pointer to the input sequence of data items InputIteratorT d_in; /// Pointer to the output aggregate OutputIteratorT d_out; /// The number of segments that comprise the sorting data int num_segments; /// Random-access input iterator to the sequence of beginning offsets of /// length `num_segments`, such that `d_begin_offsets[i]` is the first /// element of the *i*th data segment in `d_keys_*` and /// `d_values_*` BeginOffsetIteratorT d_begin_offsets; /// Random-access input iterator to the sequence of ending offsets of length /// `num_segments`, such that `d_end_offsets[i] - 1` is the last element of /// the *i*th data segment in `d_keys_*` and `d_values_*`. /// If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is /// considered empty. EndOffsetIteratorT d_end_offsets; /// Binary reduction functor ReductionOpT reduction_op; /// The initial value of the reduction InitT init; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; //--------------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------------- /// Constructor CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedReduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedReduce( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , reduction_op(reduction_op) , init(init) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } //--------------------------------------------------------------------------- // Chained policy invocation //--------------------------------------------------------------------------- /** * @brief Invocation * * @tparam ActivePolicyT * Umbrella policy active for the target device * * @tparam DeviceSegmentedReduceKernelT * Function type of cub::DeviceSegmentedReduceKernel * * @param[in] segmented_reduce_kernel * Kernel function pointer to parameterization of * cub::DeviceSegmentedReduceKernel */ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel) { cudaError error = cudaSuccess; do { // Return if the caller is simply requesting the size of the storage // allocation if (d_temp_storage == NULL) { temp_storage_bytes = 1; return cudaSuccess; } // Init kernel configuration KernelConfig segmented_reduce_config; error 
= CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel)); if (cudaSuccess != error) { break; } // Log device_reduce_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread, %d SM occupancy\n", num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, (long long) stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); #endif // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream) .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } /// Invocation template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; // Force kernel code-generation in all compiler passes return InvokePasses( DeviceSegmentedReduceKernel< MaxPolicyT, InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, ReductionOpT, InitT, AccumT>); } //--------------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------------- /** * @brief Internal dispatch routine for computing a device-wide reduction * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Pointer to the input sequence of data items * * @param[out] d_out * Pointer to the output aggregate * * @param[in] num_segments * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of * length `num_segments`, such that `d_begin_offsets[i]` is the first * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] reduction_op * Binary reduction functor * * @param[in] init * The initial value of the reduction * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
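   *
   * @par
   * DeviceSegmentedReduceKernel assigns one thread block per segment, and this
   * routine is normally reached through the cub::DeviceSegmentedReduce entry
   * points. A sketch of the common offsets layout, where segment *i* spans
   * `[d_offsets[i], d_offsets[i + 1])` and `d_in`, `d_out`, `d_offsets` and
   * `num_segments` stand for caller-provided device-accessible data (temporary
   * storage is queried and allocated with the same two-call protocol as above):
   *
   * @code
   * cub::DeviceSegmentedReduce::Sum(
   *   d_temp_storage, temp_storage_bytes,
   *   d_in, d_out, num_segments,
   *   d_offsets,      // segment begin offsets
   *   d_offsets + 1); // segment end offsets: the begin offsets shifted by one
   * @endcode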
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream) { typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; if (num_segments <= 0) { return cudaSuccess; } cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchSegmentedReduce dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, init, stream, ptx_version); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, InitT init, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, reduction_op, init, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh000066400000000000000000000442171463375617100242200ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file cub::DeviceReduceByKey provides device-wide, parallel operations for * reducing segments of values residing within device-accessible memory. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Multi-block reduce-by-key sweep kernel entry point * * @tparam AgentReduceByKeyPolicyT * Parameterized AgentReduceByKeyPolicyT tuning policy type * * @tparam KeysInputIteratorT * Random-access input iterator type for keys * * @tparam UniqueOutputIteratorT * Random-access output iterator type for keys * * @tparam ValuesInputIteratorT * Random-access input iterator type for values * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam NumRunsOutputIteratorT * Output iterator type for recording number of segments encountered * * @tparam ScanTileStateT * Tile status interface type * * @tparam EqualityOpT * KeyT equality operator type * * @tparam ReductionOpT * ValueT reduction operator type * * @tparam OffsetT * Signed integer type for global offsets * * @param d_keys_in * Pointer to the input sequence of keys * * @param d_unique_out * Pointer to the output sequence of unique keys (one key per run) * * @param d_values_in * Pointer to the input sequence of corresponding values * * @param d_aggregates_out * Pointer to the output sequence of value aggregates (one aggregate per run) * * @param d_num_runs_out * Pointer to total number of runs encountered * (i.e., the length of d_unique_out) * * @param tile_state * Tile status interface * * @param start_tile * The starting tile for the current grid * * @param equality_op * KeyT equality operator * * @param reduction_op * ValueT reduction operator * * @param num_items * Total number of items to select from */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel( KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, ScanTileStateT tile_state, int start_tile, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items) { using AgentReduceByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT; // Thread block type for reducing tiles of value segments using AgentReduceByKeyT = AgentReduceByKey; // Shared memory for AgentReduceByKey __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; // Process tiles AgentReduceByKeyT( temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op) .ConsumeRange(num_items, tile_state, start_tile); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * 
DeviceReduceByKey * * @tparam KeysInputIteratorT * Random-access input iterator type for keys * * @tparam UniqueOutputIteratorT * Random-access output iterator type for keys * * @tparam ValuesInputIteratorT * Random-access input iterator type for values * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam NumRunsOutputIteratorT * Output iterator type for recording number of segments encountered * * @tparam EqualityOpT * KeyT equality operator type * * @tparam ReductionOpT * ValueT reduction operator type * * @tparam OffsetT * Signed integer type for global offsets * * @tparam SelectedPolicy * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template < typename KeysInputIteratorT, typename UniqueOutputIteratorT, typename ValuesInputIteratorT, typename AggregatesOutputIteratorT, typename NumRunsOutputIteratorT, typename EqualityOpT, typename ReductionOpT, typename OffsetT, typename AccumT = // detail:: accumulator_t, cub::detail::value_t>, typename SelectedPolicy = // detail::device_reduce_by_key_policy_hub< // ReductionOpT, // AccumT, // cub::detail::non_void_value_t< // UniqueOutputIteratorT, // cub::detail::value_t>>> struct DispatchReduceByKey { //------------------------------------------------------------------------- // Types and constants //------------------------------------------------------------------------- // The input values type using ValueInputT = cub::detail::value_t; static constexpr int INIT_KERNEL_THREADS = 128; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; void* d_temp_storage; size_t& temp_storage_bytes; KeysInputIteratorT d_keys_in; UniqueOutputIteratorT d_unique_out; ValuesInputIteratorT d_values_in; AggregatesOutputIteratorT d_aggregates_out; NumRunsOutputIteratorT d_num_runs_out; EqualityOpT equality_op; ReductionOpT reduction_op; OffsetT num_items; cudaStream_t stream; CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchReduceByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys_in(d_keys_in) , d_unique_out(d_unique_out) , d_values_in(d_values_in) , d_aggregates_out(d_aggregates_out) , d_num_runs_out(d_num_runs_out) , equality_op(equality_op) , reduction_op(reduction_op) , num_items(num_items) , stream(stream) {} //--------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------- template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t Invoke(ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel) { using AgentReduceByKeyPolicyT = typename ActivePolicyT::ReduceByKeyPolicyT; constexpr int block_threads = AgentReduceByKeyPolicyT::BLOCK_THREADS; constexpr int items_per_thread = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD; cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Number of input tiles int tile_size = block_threads * items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, 
tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void* allocations[1] = {}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Construct the tile status interface ScanTileStateT tile_state; error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { break; } // Log init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles, d_num_runs_out); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Return if empty problem if (num_items == 0) { break; } // Get SM occupancy for reduce_by_key_kernel int reduce_by_key_sm_occupancy; error = CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads)); if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); if (cudaSuccess != error) { break; } // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log reduce_by_key_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", start_tile, scan_grid_size, block_threads, (long long) stream, items_per_thread, reduce_by_key_sm_occupancy); #endif // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(reduce_by_key_kernel, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, tile_state, start_tile, equality_op, reduction_op, num_items); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; return Invoke( DeviceCompactInitKernel, DeviceReduceByKeyKernel< MaxPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT, AccumT>); } /** * Internal dispatch routine * @param[in] 
d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[out] d_unique_out * Pointer to the output sequence of unique keys (one key per run) * * @param[in] d_values_in * Pointer to the input sequence of corresponding values * * @param[out] d_aggregates_out * Pointer to the output sequence of value aggregates * (one aggregate per run) * * @param[out] d_num_runs_out * Pointer to total number of runs encountered * (i.e., the length of d_unique_out) * * @param[in] equality_op * KeyT equality operator * * @param[in] reduction_op * ValueT reduction operator * * @param[in] num_items * Total number of items to select from * * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } DispatchReduceByKey dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op, num_items, stream); // Dispatch error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_rle.cuh000066400000000000000000000442301463375617100221640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of * data items residing within device-accessible memory. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * Select kernel entry point (multi-block) * * Performs functor-based selection if SelectOp functor type != NullType * Otherwise performs flag-based selection if FlagIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * * @tparam AgentRlePolicyT * Parameterized AgentRlePolicyT tuning policy type * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OffsetsOutputIteratorT * Random-access output iterator type for writing run-offset values @iterator * * @tparam LengthsOutputIteratorT * Random-access output iterator type for writing run-length values @iterator * * @tparam NumRunsOutputIteratorT * Output iterator type for recording the number of runs encountered @iterator * * @tparam ScanTileStateT * Tile status interface type * * @tparam EqualityOpT * T equality operator type * * @tparam OffsetT * Signed integer type for global offsets * * @param d_in * Pointer to input sequence of data items * * @param d_offsets_out * Pointer to output sequence of run-offsets * * @param d_lengths_out * Pointer to output sequence of run-lengths * * @param d_num_runs_out * Pointer to total number of runs (i.e., length of `d_offsets_out`) * * @param tile_status * Tile status interface * * @param equality_op * Equality operator for input items * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param num_tiles * Total number of tiles for the entire problem */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel( InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, 
ScanTileStateT tile_status, EqualityOpT equality_op, OffsetT num_items, int num_tiles) { using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT; // Thread block type for selecting data from input tiles using AgentRleT = AgentRle; // Shared memory for AgentRle __shared__ typename AgentRleT::TempStorage temp_storage; // Process tiles AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items) .ConsumeRange(num_tiles, tile_status, d_num_runs_out); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceRle * * @tparam InputIteratorT * Random-access input iterator type for reading input items @iterator * * @tparam OffsetsOutputIteratorT * Random-access output iterator type for writing run-offset values @iterator * * @tparam LengthsOutputIteratorT * Random-access output iterator type for writing run-length values @iterator * * @tparam NumRunsOutputIteratorT * Output iterator type for recording the number of runs encountered @iterator * * @tparam EqualityOpT * T equality operator type * * @tparam OffsetT * Signed integer type for global offsets * * @tparam SelectedPolicy * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template , cub::detail::value_t>> struct DeviceRleDispatch { /****************************************************************************** * Types and constants ******************************************************************************/ // The lengths output value type using LengthT = cub::detail::non_void_value_t; enum { INIT_KERNEL_THREADS = 128, }; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; void* d_temp_storage; size_t& temp_storage_bytes; InputIteratorT d_in; OffsetsOutputIteratorT d_offsets_out; LengthsOutputIteratorT d_lengths_out; NumRunsOutputIteratorT d_num_runs_out; EqualityOpT equality_op; OffsetT num_items; cudaStream_t stream; CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DeviceRleDispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_offsets_out(d_offsets_out) , d_lengths_out(d_lengths_out) , d_num_runs_out(d_num_runs_out) , equality_op(equality_op) , num_items(num_items) , stream(stream) {} /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ /** * Internal dispatch routine for computing a device-wide run-length-encode using the * specified kernel functions. * * @tparam DeviceScanInitKernelPtr * Function type of cub::DeviceScanInitKernel * * @tparam DeviceRleSweepKernelPtr * Function type of cub::DeviceRleSweepKernelPtr * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. 
* * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in * Pointer to the input sequence of data items * * @param d_offsets_out * Pointer to the output sequence of run-offsets * * @param d_lengths_out * Pointer to the output sequence of run-lengths * * @param d_num_runs_out * Pointer to the total number of runs encountered (i.e., length of `d_offsets_out`) * * @param equality_op * Equality operator for input items * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param stream * CUDA stream to launch kernels within. Default is stream0. * * @param ptx_version * PTX version of dispatch kernels * * @param device_scan_init_kernel * Kernel function pointer to parameterization of cub::DeviceScanInitKernel * * @param device_rle_sweep_kernel * Kernel function pointer to parameterization of cub::DeviceRleSweepKernel */ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(DeviceScanInitKernelPtr device_scan_init_kernel, DeviceRleSweepKernelPtr device_rle_sweep_kernel) { cudaError error = cudaSuccess; constexpr int block_threads = ActivePolicyT::RleSweepPolicyT::BLOCK_THREADS; constexpr int items_per_thread = ActivePolicyT::RleSweepPolicyT::ITEMS_PER_THREAD; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Number of input tiles int tile_size = block_threads * items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } // Compute allocation pointers into the single storage blob (or compute the necessary size of // the blob) void* allocations[1] = {}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (error != cudaSuccess) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_status; error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { break; } // Log device_scan_init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Return if empty problem if (num_items == 0) { break; } // Get SM occupancy for device_rle_sweep_kernel int device_rle_kernel_sm_occupancy; error = CubDebug(MaxSmOccupancy(device_rle_kernel_sm_occupancy, // out device_rle_sweep_kernel, block_threads)); if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; error = 
CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); if (cudaSuccess != error) { break; } // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log device_rle_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per " "thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, block_threads, (long long) stream, items_per_thread, device_rle_kernel_sm_occupancy); #endif // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(device_rle_sweep_kernel, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, tile_status, equality_op, num_items, num_tiles); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; return Invoke( DeviceCompactInitKernel, DeviceRleSweepKernel); } /** * Internal dispatch routine * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in * Pointer to input sequence of data items * * @param d_offsets_out * Pointer to output sequence of run-offsets * * @param d_lengths_out * Pointer to output sequence of run-lengths * * @param d_num_runs_out * Pointer to total number of runs (i.e., length of `d_offsets_out`) * * @param equality_op * Equality operator for input items * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
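   *
   * @par
   * This dispatcher backs cub::DeviceRunLengthEncode::NonTrivialRuns, which
   * reports only runs of two or more equal items. A usage sketch, where `d_in`,
   * `d_offsets_out`, `d_lengths_out`, `d_num_runs_out` and `num_items` stand for
   * caller-provided device-accessible data and temporary storage is obtained
   * with the usual two-call protocol:
   *
   * @code
   * cub::DeviceRunLengthEncode::NonTrivialRuns(
   *   d_temp_storage, temp_storage_bytes,
   *   d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
   *
   * // For d_in = [0, 0, 1, 5, 5, 5, 8]:
   * //   d_offsets_out   = [0, 3]   (where each non-trivial run begins)
   * //   d_lengths_out   = [2, 3]   (how long each run is)
   * //   *d_num_runs_out = 2
   * @endcode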
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } DeviceRleDispatch dispatch( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, equality_op, num_items, stream); // Dispatch error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, equality_op, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_scan.cuh000066400000000000000000000444421463375617100223330ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file cub::DeviceScan provides device-wide, parallel operations for * computing a prefix scan across a sequence of data items residing * within device-accessible memory. 
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Initialization kernel for tile status initialization (multi-block) * * @tparam ScanTileStateT * Tile status interface type * * @param[in] tile_state * Tile status interface * * @param[in] num_tiles * Number of tiles */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); } /** * Initialization kernel for tile status initialization (multi-block) * * @tparam ScanTileStateT * Tile status interface type * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @param[in] tile_state * Tile status interface * * @param[in] num_tiles * Number of tiles * * @param[out] d_num_selected_out * Pointer to the total number of items selected * (i.e., length of `d_selected_out`) */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); // Initialize d_num_selected_out if ((blockIdx.x == 0) && (threadIdx.x == 0)) { *d_num_selected_out = 0; } } /** * @brief Scan kernel entry point (multi-block) * * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading scan inputs @iterator * * @tparam OutputIteratorT * Random-access output iterator type for writing scan outputs @iterator * * @tparam ScanTileStateT * Tile status interface type * * @tparam ScanOpT * Binary scan functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitValueT * Initial value to seed the exclusive scan * (cub::NullType for inclusive scans) * * @tparam OffsetT * Signed integer type for global offsets * * @paramInput d_in * data * * @paramOutput d_out * data * * @paramTile tile_state * status interface * * @paramThe start_tile * starting tile for the current grid * * @paramBinary scan_op * scan functor * * @paramInitial init_value * value to seed the exclusive scan * * @paramTotal num_items * number of scan items for the entire problem */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel( InputIteratorT d_in, OutputIteratorT d_out, ScanTileStateT tile_state, int start_tile, ScanOpT scan_op, InitValueT init_value, OffsetT num_items) { using RealInitValueT = typename InitValueT::value_type; typedef typename ChainedPolicyT::ActivePolicy::ScanPolicyT ScanPolicyT; // Thread block type for scanning input tiles typedef AgentScan AgentScanT; // Shared memory for AgentScan __shared__ typename AgentScanT::TempStorage temp_storage; RealInitValueT real_init_value = init_value; // Process tiles AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile); } 
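//------------------------------------------------------------------------------
// DeviceScanKernel above is driven by DispatchScan (below), which in turn backs
// the public cub::DeviceScan entry points. The following is a minimal usage
// sketch, not part of this header: it assumes <cub/device/device_scan.cuh> and
// the CUDA runtime are available, and that d_in/d_out are caller-provided
// device pointers over num_items ints. The names ExampleMax and
// ExampleExclusiveMaxScan are purely illustrative.
//------------------------------------------------------------------------------
struct ExampleMax
{
  __host__ __device__ int operator()(int a, int b) const
  {
    return b > a ? b : a;
  }
};

inline cudaError_t ExampleExclusiveMaxScan(const int* d_in, int* d_out, int num_items, cudaStream_t stream = 0)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;

  // First call: no work is done; only the required temporary storage size is
  // written to temp_storage_bytes.
  cudaError_t error = cub::DeviceScan::ExclusiveScan(
    d_temp_storage, temp_storage_bytes, d_in, d_out, ExampleMax{}, 0, num_items, stream);
  if (error != cudaSuccess)
  {
    return error;
  }

  error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (error != cudaSuccess)
  {
    return error;
  }

  // Second call: DeviceScanInitKernel initializes the tile status descriptors
  // and DeviceScanKernel performs the scan. The init value 0 acts as the
  // identity here because this example assumes non-negative inputs.
  error = cub::DeviceScan::ExclusiveScan(
    d_temp_storage, temp_storage_bytes, d_in, d_out, ExampleMax{}, 0, num_items, stream);

  cudaFree(d_temp_storage);
  return error;
}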
/****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for * DeviceScan * * @tparam InputIteratorT * Random-access input iterator type for reading scan inputs @iterator * * @tparam OutputIteratorT * Random-access output iterator type for writing scan outputs @iterator * * @tparam ScanOpT * Binary scan functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitValueT * The init_value element type for ScanOpT (cub::NullType for inclusive scans) * * @tparam OffsetT * Signed integer type for global offsets * */ template ::value, cub::detail::value_t, typename InitValueT::value_type>, cub::detail::value_t>, typename SelectedPolicy = DeviceScanPolicy> struct DispatchScan : SelectedPolicy { //--------------------------------------------------------------------- // Constants and Types //--------------------------------------------------------------------- static constexpr int INIT_KERNEL_THREADS = 128; // The input value type using InputT = cub::detail::value_t; /// Device-accessible allocation of temporary storage. When NULL, the /// required allocation size is written to \p temp_storage_bytes and no work /// is done. void* d_temp_storage; /// Reference to size in bytes of \p d_temp_storage allocation size_t& temp_storage_bytes; /// Iterator to the input sequence of data items InputIteratorT d_in; /// Iterator to the output sequence of data items OutputIteratorT d_out; /// Binary scan functor ScanOpT scan_op; /// Initial value to seed the exclusive scan InitValueT init_value; /// Total number of input items (i.e., the length of \p d_in) OffsetT num_items; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; /** * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Iterator to the input sequence of data items * * @param[out] d_out * Iterator to the output sequence of data items * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ScanOpT scan_op, InitValueT init_value, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, ScanOpT scan_op, InitValueT init_value, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_out(d_out) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { typedef typename ActivePolicyT::ScanPolicyT Policy; typedef typename cub::ScanTileState ScanTileStateT; // `LOAD_LDG` makes in-place execution UB and doesn't lead to better // performance. static_assert(Policy::LOAD_MODIFIER != CacheLoadModifier::LOAD_LDG, "The memory consistency model does not apply to texture " "accesses"); cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void* allocations[1] = {}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Return if empty problem if (num_items == 0) { break; } // Construct the tile status interface ScanTileStateT tile_state; error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { break; } // Log init_kernel configuration int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out scan_kernel, 
Policy::BLOCK_THREADS)); if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); if (cudaSuccess != error) { break; } // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); #endif // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke() { typedef typename DispatchScan::MaxPolicy MaxPolicyT; typedef typename cub::ScanTileState ScanTileStateT; // Ensure kernels are instantiated. return Invoke( DeviceScanInitKernel, DeviceScanKernel); } /** * @brief Internal dispatch routine * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_in * Iterator to the input sequence of data items * * @param[out] d_out * Iterator to the output sequence of data items * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
* */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream) { typedef typename DispatchScan::MaxPolicy MaxPolicyT; cudaError_t error; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchScan dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, scan_op, init_value, stream, ptx_version); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_scan_by_key.cuh000066400000000000000000000472611463375617100236770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file DeviceScan provides device-wide, parallel operations for computing a * prefix scan across a sequence of data items residing within * device-accessible memory. 
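 *
 * A minimal usage sketch (illustration only, assuming caller-provided device arrays
 * d_keys, d_values_in, d_values_out and an item count num_items): the by-key dispatch
 * below is normally reached through the public cub::DeviceScan::*ByKey entry points
 * using the usual two-phase temporary-storage idiom.
 *
 *   size_t temp_storage_bytes = 0;
 *   void* d_temp_storage      = nullptr;
 *   cub::DeviceScan::ExclusiveSumByKey(
 *     d_temp_storage, temp_storage_bytes, d_keys, d_values_in, d_values_out, num_items);
 *   cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *   cub::DeviceScan::ExclusiveSumByKey(
 *     d_temp_storage, temp_storage_bytes, d_keys, d_values_in, d_values_out, num_items);
 *   cudaFree(d_temp_storage);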
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Scan kernel entry point (multi-block) * * @tparam ChainedPolicyT * Chained tuning policy * * @tparam KeysInputIteratorT * Random-access input iterator type * * @tparam ValuesInputIteratorT * Random-access input iterator type * * @tparam ValuesOutputIteratorT * Random-access output iterator type * * @tparam ScanByKeyTileStateT * Tile status interface type * * @tparam EqualityOp * Equality functor type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * * @param d_keys_in * Input keys data * * @param d_keys_prev_in * Predecessor items for each tile * * @param d_values_in * Input values data * * @param d_values_out * Output values data * * @param tile_state * Tile status interface * * @param start_tile * The starting tile for the current grid * * @param equality_op * Binary equality functor * * @param scan_op * Binary scan functor * * @param init_value * Initial value to seed the exclusive scan * * @param num_items * Total number of scan items for the entire problem */ template > __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel( KeysInputIteratorT d_keys_in, KeyT* d_keys_prev_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, ScanByKeyTileStateT tile_state, int start_tile, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items) { using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT; // Thread block type for scanning input tiles using AgentScanByKeyT = AgentScanByKey; // Shared memory for AgentScanByKey __shared__ typename AgentScanByKeyT::TempStorage temp_storage; // Process tiles AgentScanByKeyT(temp_storage, d_keys_in, d_keys_prev_in, d_values_in, d_values_out, equality_op, scan_op, init_value) .ConsumeRange(num_items, tile_state, start_tile); } template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel( ScanTileStateT tile_state, KeysInputIteratorT d_keys_in, cub::detail::value_t* d_keys_prev_in, unsigned items_per_tile, int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); const unsigned tid = threadIdx.x + blockDim.x * blockIdx.x; const unsigned tile_base = tid * items_per_tile; if (tid > 0 && tid < num_tiles) { d_keys_prev_in[tid] = d_keys_in[tile_base - 1]; } } /****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels * for DeviceScan * * @tparam KeysInputIteratorT * Random-access input iterator type * * @tparam ValuesInputIteratorT * Random-access input iterator type * * @tparam ValuesOutputIteratorT * Random-access output iterator type * * @tparam EqualityOp * Equality functor 
type * * @tparam ScanOpT * Scan functor type * * @tparam InitValueT * The init_value element for ScanOpT type (cub::NullType for inclusive scan) * * @tparam OffsetT * Signed integer type for global offsets * */ template < typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename EqualityOp, typename ScanOpT, typename InitValueT, typename OffsetT, typename AccumT = detail::accumulator_t< ScanOpT, cub::detail:: conditional_t::value, cub::detail::value_t, InitValueT>, cub::detail::value_t>, typename SelectedPolicy = DeviceScanByKeyPolicy, ScanOpT>> struct DispatchScanByKey : SelectedPolicy { //--------------------------------------------------------------------- // Constants and Types //--------------------------------------------------------------------- static constexpr int INIT_KERNEL_THREADS = 128; // The input key type using KeyT = cub::detail::value_t; // The input value type using InputT = cub::detail::value_t; /// Device-accessible allocation of temporary storage. When `nullptr`, the /// required allocation size is written to `temp_storage_bytes` and no work /// is done. void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t& temp_storage_bytes; /// Iterator to the input sequence of key items KeysInputIteratorT d_keys_in; /// Iterator to the input sequence of value items ValuesInputIteratorT d_values_in; /// Iterator to the input sequence of value items ValuesOutputIteratorT d_values_out; /// Binary equality functor EqualityOp equality_op; /// Binary scan functor ScanOpT scan_op; /// Initial value to seed the exclusive scan InitValueT init_value; /// Total number of input items (i.e., the length of `d_in`) OffsetT num_items; /// CUDA stream to launch kernels within. cudaStream_t stream; int ptx_version; /** * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Iterator to the input sequence of key items * * @param[in] d_values_in * Iterator to the input sequence of value items * * @param[out] d_values_out * Iterator to the input sequence of value items * * @param[in] equality_op * Binary equality functor * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * CUDA stream to launch kernels within. 
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScanByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys_in(d_keys_in) , d_values_in(d_values_in) , d_values_out(d_values_out) , equality_op(equality_op) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchScanByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys_in(d_keys_in) , d_values_in(d_values_in) , d_values_out(d_values_out) , equality_op(equality_op) , scan_op(scan_op) , init_value(init_value) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { using Policy = typename ActivePolicyT::ScanByKeyPolicyT; using ScanByKeyTileStateT = ReduceByKeyScanTileState; cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[2]; error = CubDebug(ScanByKeyTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } allocation_sizes[1] = sizeof(KeyT) * (num_tiles + 1); // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void* allocations[2] = {}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Return if empty problem if (num_items == 0) { break; } KeyT* d_keys_prev_in = reinterpret_cast(allocations[1]); // Construct the tile status interface ScanByKeyTileStateT tile_state; error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { break; } // Log init_kernel configuration int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, tile_size, num_tiles); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // 
Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out scan_kernel, Policy::BLOCK_THREADS)); if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); if (cudaSuccess != error) { break; } // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) { // Log scan_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy); #endif // Invoke scan_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) .doit(scan_kernel, d_keys_in, d_keys_prev_in, d_values_in, d_values_out, tile_state, start_tile, equality_op, scan_op, init_value, num_items); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename DispatchScanByKey::MaxPolicy; using ScanByKeyTileStateT = ReduceByKeyScanTileState; // Ensure kernels are instantiated. return Invoke( DeviceScanByKeyInitKernel, DeviceScanByKeyKernel); } /** * @brief Internal dispatch routine * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Iterator to the input sequence of key items * * @param[in] d_values_in * Iterator to the input sequence of value items * * @param[out] d_values_out * Iterator to the input sequence of value items * * @param[in] equality_op * Binary equality functor * * @param[in] scan_op * Binary scan functor * * @param[in] init_value * Initial value to seed the exclusive scan * * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream * CUDA stream to launch kernels within. 
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename DispatchScanByKey::MaxPolicy; cudaError_t error; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchScanByKey dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, stream, ptx_version); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, KeysInputIteratorT d_keys_in, ValuesInputIteratorT d_values_in, ValuesOutputIteratorT d_values_out, EqualityOp equality_op, ScanOpT scan_op, InitValueT init_value, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_segmented_sort.cuh000066400000000000000000001736101463375617100244310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief Fallback kernel, in case there's not enough segments to * take advantage of partitioning. * * In this case, the sorting method is still selected based on the segment size. * If a single warp can sort the segment, the algorithm will use the sub-warp * merge sort. Otherwise, the algorithm will use the in-shared-memory version of * block radix sort. If data don't fit into shared memory, the algorithm will * use in-global-memory radix sort. * * @param[in] d_keys_in_orig * Input keys buffer * * @param[out] d_keys_out_orig * Output keys buffer * * @param[in,out] d_keys_double_buffer * Double keys buffer * * @param[in] d_values_in_orig * Input values buffer * * @param[out] d_values_out_orig * Output values buffer * * @param[in,out] d_values_double_buffer * Double values buffer * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * i-th data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * i-th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the i-th segment is * considered empty. 
*/ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortFallbackKernel( const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, cub::detail::device_double_buffer d_keys_double_buffer, const ValueT* d_values_in_orig, ValueT* d_values_out_orig, cub::detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT; const unsigned int segment_id = blockIdx.x; OffsetT segment_begin = d_begin_offsets[segment_id]; OffsetT segment_end = d_end_offsets[segment_id]; OffsetT num_items = segment_end - segment_begin; if (num_items <= 0) { return; } using AgentSegmentedRadixSortT = cub::AgentSegmentedRadixSort; using WarpReduceT = cub::WarpReduce; using AgentWarpMergeSortT = AgentSubWarpSort; __shared__ union { typename AgentSegmentedRadixSortT::TempStorage block_sort; typename WarpReduceT::TempStorage warp_reduce; typename AgentWarpMergeSortT::TempStorage medium_warp_sort; } temp_storage; constexpr bool keys_only = std::is_same::value; AgentSegmentedRadixSortT agent(num_items, temp_storage.block_sort); constexpr int begin_bit = 0; constexpr int end_bit = sizeof(KeyT) * 8; constexpr int cacheable_tile_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; d_keys_in_orig += segment_begin; d_keys_out_orig += segment_begin; if (!keys_only) { d_values_in_orig += segment_begin; d_values_out_orig += segment_begin; } if (num_items <= MediumPolicyT::ITEMS_PER_TILE) { // Sort by a single warp if (threadIdx.x < MediumPolicyT::WARP_THREADS) { AgentWarpMergeSortT(temp_storage.medium_warp_sort) .ProcessSegment(num_items, d_keys_in_orig, d_keys_out_orig, d_values_in_orig, d_values_out_orig); } } else if (num_items < cacheable_tile_size) { // Sort by a CTA if data fits into shared memory agent.ProcessSinglePass(begin_bit, end_bit, d_keys_in_orig, d_values_in_orig, d_keys_out_orig, d_values_out_orig); } else { // Sort by a CTA with multiple reads from global memory int current_bit = begin_bit; int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { d_values_double_buffer = cub::detail::device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } agent.ProcessIterative( current_bit, pass_bits, d_keys_in_orig, d_values_in_orig, d_keys_double_buffer.current(), d_values_double_buffer.current()); current_bit += pass_bits; #pragma unroll 1 while (current_bit < end_bit) { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); CTA_SYNC(); agent.ProcessIterative( current_bit, pass_bits, d_keys_double_buffer.current(), d_values_double_buffer.current(), d_keys_double_buffer.alternate(), d_values_double_buffer.alternate()); d_keys_double_buffer.swap(); d_values_double_buffer.swap(); current_bit += pass_bits; } } } /** * @brief Single kernel for moderate size (less than a few thousand items) * segments. * * This kernel allocates a sub-warp per segment. Therefore, this kernel assigns * a single thread block to multiple segments. 
Segments fall into two * categories. An architectural warp usually sorts segments in the medium-size * category, while a few threads sort segments in the small-size category. Since * segments are partitioned, we know the last thread block index assigned to * sort medium-size segments. A particular thread block can check this number to * find out which category it was assigned to sort. In both cases, the * merge sort is used. * * @param[in] small_segments * Number of segments that can be sorted by a warp part * * @param[in] medium_segments * Number of segments that can be sorted by a warp * * @param[in] medium_blocks * Number of CTAs assigned to process medium segments * * @param[in] d_small_segments_indices * Small segments mapping of length @p small_segments, such that * `d_small_segments_indices[i]` is the input segment index * * @param[in] d_medium_segments_indices * Medium segments mapping of length @p medium_segments, such that * `d_medium_segments_indices[i]` is the input segment index * * @param[in] d_keys_in_orig * Input keys buffer * * @param[out] d_keys_out_orig * Output keys buffer * * @param[in] d_values_in_orig * Input values buffer * * @param[out] d_values_out_orig * Output values buffer * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. */ template __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall( unsigned int small_segments, unsigned int medium_segments, unsigned int medium_blocks, const unsigned int* d_small_segments_indices, const unsigned int* d_medium_segments_indices, const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { const unsigned int tid = threadIdx.x; const unsigned int bid = blockIdx.x; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; using MediumPolicyT = typename SmallAndMediumPolicyT::MediumPolicyT; using SmallPolicyT = typename SmallAndMediumPolicyT::SmallPolicyT; constexpr int threads_per_medium_segment = MediumPolicyT::WARP_THREADS; constexpr int threads_per_small_segment = SmallPolicyT::WARP_THREADS; using MediumAgentWarpMergeSortT = AgentSubWarpSort; using SmallAgentWarpMergeSortT = AgentSubWarpSort; constexpr auto segments_per_medium_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); constexpr auto segments_per_small_block = static_cast(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); __shared__ union { typename MediumAgentWarpMergeSortT::TempStorage medium_storage[segments_per_medium_block]; typename SmallAgentWarpMergeSortT::TempStorage small_storage[segments_per_small_block]; } temp_storage; if (bid < medium_blocks) { const unsigned int sid_within_block = tid / threads_per_medium_segment; const unsigned int medium_segment_id = bid * segments_per_medium_block + sid_within_block; if (medium_segment_id < medium_segments) { 
const unsigned int global_segment_id = d_medium_segments_indices[medium_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; const OffsetT num_items = segment_end - segment_begin; MediumAgentWarpMergeSortT(temp_storage.medium_storage[sid_within_block]) .ProcessSegment(num_items, d_keys_in + segment_begin, d_keys_out + segment_begin, d_values_in + segment_begin, d_values_out + segment_begin); } } else { const unsigned int sid_within_block = tid / threads_per_small_segment; const unsigned int small_segment_id = (bid - medium_blocks) * segments_per_small_block + sid_within_block; if (small_segment_id < small_segments) { const unsigned int global_segment_id = d_small_segments_indices[small_segment_id]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; const OffsetT num_items = segment_end - segment_begin; SmallAgentWarpMergeSortT(temp_storage.small_storage[sid_within_block]) .ProcessSegment(num_items, d_keys_in + segment_begin, d_keys_out + segment_begin, d_values_in + segment_begin, d_values_out + segment_begin); } } } /** * @brief Single kernel for large size (more than a few thousand items) segments. * * @param[in] d_keys_in_orig * Input keys buffer * * @param[out] d_keys_out_orig * Output keys buffer * * @param[in] d_values_in_orig * Input values buffer * * @param[out] d_values_out_orig * Output values buffer * * @param[in] d_begin_offsets * Random-access input iterator to the sequence of beginning offsets of length * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` * * @param[in] d_end_offsets * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that `d_end_offsets[i]-1` is the last element of the * ith data segment in `d_keys_*` and `d_values_*`. If * `d_end_offsets[i]-1 <= d_begin_offsets[i]`, the ith is * considered empty. 
*/ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelLarge( const unsigned int* d_segments_indices, const KeyT* d_keys_in_orig, KeyT* d_keys_out_orig, cub::detail::device_double_buffer d_keys_double_buffer, const ValueT* d_values_in_orig, ValueT* d_values_out_orig, cub::detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; constexpr int small_tile_size = LargeSegmentPolicyT::BLOCK_THREADS * LargeSegmentPolicyT::ITEMS_PER_THREAD; using AgentSegmentedRadixSortT = cub::AgentSegmentedRadixSort; __shared__ typename AgentSegmentedRadixSortT::TempStorage storage; const unsigned int bid = blockIdx.x; constexpr int begin_bit = 0; constexpr int end_bit = sizeof(KeyT) * 8; const unsigned int global_segment_id = d_segments_indices[bid]; const OffsetT segment_begin = d_begin_offsets[global_segment_id]; const OffsetT segment_end = d_end_offsets[global_segment_id]; const OffsetT num_items = segment_end - segment_begin; constexpr bool keys_only = std::is_same::value; AgentSegmentedRadixSortT agent(num_items, storage); d_keys_in_orig += segment_begin; d_keys_out_orig += segment_begin; if (!keys_only) { d_values_in_orig += segment_begin; d_values_out_orig += segment_begin; } if (num_items < small_tile_size) { // Sort in shared memory if the segment fits into it agent.ProcessSinglePass(begin_bit, end_bit, d_keys_in_orig, d_values_in_orig, d_keys_out_orig, d_values_out_orig); } else { // Sort reading global memory multiple times int current_bit = begin_bit; int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); d_keys_double_buffer = cub::detail::device_double_buffer( d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin); if (!keys_only) { d_values_double_buffer = cub::detail::device_double_buffer( d_values_double_buffer.current() + segment_begin, d_values_double_buffer.alternate() + segment_begin); } agent.ProcessIterative( current_bit, pass_bits, d_keys_in_orig, d_values_in_orig, d_keys_double_buffer.current(), d_values_double_buffer.current()); current_bit += pass_bits; #pragma unroll 1 while (current_bit < end_bit) { pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit)); CTA_SYNC(); agent.ProcessIterative( current_bit, pass_bits, d_keys_double_buffer.current(), d_values_double_buffer.current(), d_keys_double_buffer.alternate(), d_values_double_buffer.alternate()); d_keys_double_buffer.swap(); d_values_double_buffer.swap(); current_bit += pass_bits; } } } /* * Continuation is called after the partitioning stage. It launches kernels * to sort large and small segments using the partitioning results. Separation * of this stage is required to eliminate device-side synchronization in * the CDP mode. 
*/ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortContinuation( LargeKernelT large_kernel, SmallKernelT small_kernel, int num_segments, KeyT* d_current_keys, KeyT* d_final_keys, detail::device_double_buffer d_keys_double_buffer, ValueT* d_current_values, ValueT* d_final_values, detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, unsigned int* group_sizes, unsigned int* large_and_medium_segments_indices, unsigned int* small_segments_indices, cudaStream_t stream) { cudaError error = cudaSuccess; const unsigned int large_segments = group_sizes[0]; if (large_segments > 0) { // One CTA per segment const unsigned int blocks_in_grid = large_segments; #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelLarge<<<%d, %d, 0, %lld>>>()\n", static_cast(blocks_in_grid), LargeSegmentPolicyT::BLOCK_THREADS, (long long) stream); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) .doit(large_kernel, large_and_medium_segments_indices, d_current_keys, d_final_keys, d_keys_double_buffer, d_current_values, d_final_values, d_values_double_buffer, d_begin_offsets, d_end_offsets); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } } const unsigned int small_segments = group_sizes[1]; const unsigned int medium_segments = static_cast(num_segments) - (large_segments + small_segments); const unsigned int small_blocks = DivideAndRoundUp(small_segments, SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK); const unsigned int medium_blocks = DivideAndRoundUp(medium_segments, SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK); const unsigned int small_and_medium_blocks_in_grid = small_blocks + medium_blocks; if (small_and_medium_blocks_in_grid) { #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking " "DeviceSegmentedSortKernelSmall<<<%d, %d, 0, %lld>>>()\n", static_cast(small_and_medium_blocks_in_grid), SmallAndMediumPolicyT::BLOCK_THREADS, (long long) stream); #endif THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) .doit(small_kernel, small_segments, medium_segments, medium_blocks, small_segments_indices, large_and_medium_segments_indices + num_segments - medium_segments, d_current_keys, d_final_keys, d_current_values, d_final_values, d_begin_offsets, d_end_offsets); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } } return error; } #ifdef CUB_RDC_ENABLED /* * Continuation kernel is used only in the CDP mode. It's used to * launch DeviceSegmentedSortContinuation as a separate kernel. 
*/ template __launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel( LargeKernelT large_kernel, SmallKernelT small_kernel, int num_segments, KeyT* d_current_keys, KeyT* d_final_keys, detail::device_double_buffer d_keys_double_buffer, ValueT* d_current_values, ValueT* d_final_values, detail::device_double_buffer d_values_double_buffer, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, unsigned int* group_sizes, unsigned int* large_and_medium_segments_indices, unsigned int* small_segments_indices) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; // In case of CDP: // 1. each CTA has a different main stream // 2. all streams are non-blocking // 3. child grid always completes before the parent grid // 4. streams can be used only from the CTA in which they were created // 5. streams created on the host cannot be used on the device // // Due to (4, 5), we can't pass the user-provided stream in the continuation. // Due to (1, 2, 3) it's safe to pass the main stream. cudaError_t error = DeviceSegmentedSortContinuation( large_kernel, small_kernel, num_segments, d_current_keys, d_final_keys, d_keys_double_buffer, d_current_values, d_final_values, d_values_double_buffer, d_begin_offsets, d_end_offsets, group_sizes, large_and_medium_segments_indices, small_segments_indices, 0); // always launching on the main stream (see motivation above) error = CubDebug(error); } #endif // CUB_RDC_ENABLED template struct DeviceSegmentedSortPolicy { using DominantT = cub::detail::conditional_t<(sizeof(ValueT) > sizeof(KeyT)), ValueT, KeyT>; static constexpr int KEYS_ONLY = std::is_same::value; //---------------------------------------------------------------------------- // Architecture-specific tuning policies //---------------------------------------------------------------------------- struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { static constexpr int BLOCK_THREADS = 128; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; static constexpr int PARTITIONING_THRESHOLD = 300; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 9, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, RADIX_BITS>; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(5); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(5); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy500 : ChainedPolicy<500, Policy500, Policy350> { static constexpr int BLOCK_THREADS = 256; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 
6 : 4; static constexpr int PARTITIONING_THRESHOLD = 300; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 16, DominantT, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_BITS>; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(7); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(7); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy600 : ChainedPolicy<600, Policy600, Policy500> { static constexpr int BLOCK_THREADS = 256; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; static constexpr int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 19, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, RADIX_BITS>; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(9); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy610 : ChainedPolicy<610, Policy610, Policy600> { static constexpr int BLOCK_THREADS = 256; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; static constexpr int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, RADIX_BITS>; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(9); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy620 : ChainedPolicy<620, Policy620, Policy610> { static constexpr int BLOCK_THREADS = 256; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 
5 : 4; static constexpr int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 16, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_BITS>; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(9); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<4, // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy700 : ChainedPolicy<700, Policy700, Policy620> { static constexpr int BLOCK_THREADS = 256; static constexpr int RADIX_BITS = sizeof(KeyT) > 1 ? 6 : 4; static constexpr int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, RADIX_BITS>; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(7); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(KEYS_ONLY ? 11 : 7); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<(KEYS_ONLY ? 4 : 8), // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_DIRECT, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy800 : ChainedPolicy<800, Policy800, Policy700> { static constexpr int BLOCK_THREADS = 256; static constexpr int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = cub::AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 23, DominantT, cub::BLOCK_LOAD_TRANSPOSE, cub::LOAD_DEFAULT, cub::RADIX_RANK_MEMOIZE, cub::BLOCK_SCAN_WARP_SCANS, (sizeof(KeyT) > 1) ? 6 : 4>; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(9); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(KEYS_ONLY ? 7 : 11); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<(KEYS_ONLY ? 4 : 2), // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_DEFAULT>, // Medium policy cub::AgentSubWarpMergeSortPolicy<32, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_DEFAULT>>; }; struct Policy860 : ChainedPolicy<860, Policy860, Policy800> { static constexpr int BLOCK_THREADS = 256; static constexpr int PARTITIONING_THRESHOLD = 500; using LargeSegmentPolicy = cub::AgentRadixSortDownsweepPolicy< BLOCK_THREADS, 23, DominantT, cub::BLOCK_LOAD_TRANSPOSE, cub::LOAD_DEFAULT, cub::RADIX_RANK_MEMOIZE, cub::BLOCK_SCAN_WARP_SCANS, (sizeof(KeyT) > 1) ? 6 : 4>; static constexpr bool LARGE_ITEMS = sizeof(DominantT) > 4; static constexpr int ITEMS_PER_SMALL_THREAD = Nominal4BItemsToItems(LARGE_ITEMS ? 7 : 9); static constexpr int ITEMS_PER_MEDIUM_THREAD = Nominal4BItemsToItems(LARGE_ITEMS ? 
9 : 7); using SmallAndMediumSegmentedSortPolicyT = AgentSmallAndMediumSegmentedSortPolicy< BLOCK_THREADS, // Small policy cub::AgentSubWarpMergeSortPolicy<(LARGE_ITEMS ? 8 : 2), // Threads per segment ITEMS_PER_SMALL_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_LDG>, // Medium policy cub::AgentSubWarpMergeSortPolicy<16, // Threads per segment ITEMS_PER_MEDIUM_THREAD, WarpLoadAlgorithm::WARP_LOAD_TRANSPOSE, CacheLoadModifier::LOAD_LDG>>; }; /// MaxPolicy using MaxPolicy = Policy860; }; template > struct DispatchSegmentedSort : SelectedPolicy { static constexpr int KEYS_ONLY = std::is_same::value; struct LargeSegmentsSelectorT { OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) : value(value) , d_offset_begin(d_offset_begin) , d_offset_end(d_offset_end) {} _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(unsigned int segment_id) const { const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; return segment_size > value; } }; struct SmallSegmentsSelectorT { OffsetT value{}; BeginOffsetIteratorT d_offset_begin{}; EndOffsetIteratorT d_offset_end{}; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end) : value(value) , d_offset_begin(d_offset_begin) , d_offset_end(d_offset_end) {} _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(unsigned int segment_id) const { const OffsetT segment_size = d_offset_end[segment_id] - d_offset_begin[segment_id]; return segment_size < value; } }; // Partition selects large and small groups. The middle group is not selected. static constexpr std::size_t num_selected_groups = 2; /** * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. */ void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation std::size_t& temp_storage_bytes; /** * Double-buffer whose current buffer contains the unsorted input keys and, * upon return, is updated to point to the sorted output keys */ DoubleBuffer& d_keys; /** * Double-buffer whose current buffer contains the unsorted input values and, * upon return, is updated to point to the sorted output values */ DoubleBuffer& d_values; /// Number of items to sort OffsetT num_items; /// The number of segments that comprise the sorting data int num_segments; /** * Random-access input iterator to the sequence of beginning offsets of length * `num_segments`, such that `d_begin_offsets[i]` is the first element of the * ith data segment in `d_keys_*` and `d_values_*` */ BeginOffsetIteratorT d_begin_offsets; /** * Random-access input iterator to the sequence of ending offsets of length * `num_segments`, such that d_end_offsets[i]-1 is the last element * of the ith data segment in `d_keys_*` and * `d_values_*`. If `d_end_offsets[i]-1 <= d_begin_offsets[i]`, * the ith is considered empty. */ EndOffsetIteratorT d_end_offsets; /// Whether is okay to overwrite source buffers bool is_overwrite_okay; /// CUDA stream to launch kernels within. 
cudaStream_t stream; CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedSort( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , is_overwrite_okay(is_overwrite_okay) , stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSegmentedSort( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys(d_keys) , d_values(d_values) , num_items(num_items) , num_segments(num_segments) , d_begin_offsets(d_begin_offsets) , d_end_offsets(d_end_offsets) , is_overwrite_okay(is_overwrite_okay) , stream(stream) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy; using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT; static_assert(LargeSegmentPolicyT::LOAD_MODIFIER != CacheLoadModifier::LOAD_LDG, "The memory consistency model does not apply to texture accesses"); static_assert(KEYS_ONLY || LargeSegmentPolicyT::LOAD_ALGORITHM != BLOCK_LOAD_STRIPED || SmallAndMediumPolicyT::MediumPolicyT::LOAD_ALGORITHM != WARP_LOAD_STRIPED || SmallAndMediumPolicyT::SmallPolicyT::LOAD_ALGORITHM != WARP_LOAD_STRIPED, "Striped load will make this algorithm unstable"); static_assert(SmallAndMediumPolicyT::MediumPolicyT::STORE_ALGORITHM != WARP_STORE_STRIPED || SmallAndMediumPolicyT::SmallPolicyT::STORE_ALGORITHM != WARP_STORE_STRIPED, "Striped stores will produce unsorted results"); constexpr int radix_bits = LargeSegmentPolicyT::RADIX_BITS; cudaError error = cudaSuccess; do { //------------------------------------------------------------------------ // Prepare temporary storage layout //------------------------------------------------------------------------ const bool partition_segments = num_segments > ActivePolicyT::PARTITIONING_THRESHOLD; cub::detail::temporary_storage::layout<5> temporary_storage_layout; auto keys_slot = temporary_storage_layout.get_slot(0); auto values_slot = temporary_storage_layout.get_slot(1); auto large_and_medium_partitioning_slot = temporary_storage_layout.get_slot(2); auto small_partitioning_slot = temporary_storage_layout.get_slot(3); auto group_sizes_slot = temporary_storage_layout.get_slot(4); auto keys_allocation = keys_slot->create_alias(); auto values_allocation = values_slot->create_alias(); if (!is_overwrite_okay) { keys_allocation.grow(num_items); if (!KEYS_ONLY) { values_allocation.grow(num_items); } } auto large_and_medium_segments_indices = large_and_medium_partitioning_slot->create_alias(); auto small_segments_indices = small_partitioning_slot->create_alias(); auto group_sizes = group_sizes_slot->create_alias(); std::size_t three_way_partition_temp_storage_bytes{}; 
LargeSegmentsSelectorT large_segments_selector( SmallAndMediumPolicyT::MediumPolicyT::ITEMS_PER_TILE, d_begin_offsets, d_end_offsets); SmallSegmentsSelectorT small_segments_selector( SmallAndMediumPolicyT::SmallPolicyT::ITEMS_PER_TILE + 1, d_begin_offsets, d_end_offsets); auto device_partition_temp_storage = keys_slot->create_alias(); if (partition_segments) { large_and_medium_segments_indices.grow(num_segments); small_segments_indices.grow(num_segments); group_sizes.grow(num_selected_groups); auto medium_indices_iterator = THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get()); cub::DevicePartition::IfNoNVTX( nullptr, three_way_partition_temp_storage_bytes, THRUST_NS_QUALIFIER::counting_iterator(0), large_and_medium_segments_indices.get(), small_segments_indices.get(), medium_indices_iterator, group_sizes.get(), num_segments, large_segments_selector, small_segments_selector, stream); device_partition_temp_storage.grow(three_way_partition_temp_storage_bytes); } if (d_temp_storage == nullptr) { temp_storage_bytes = temporary_storage_layout.get_size(); // Return if the caller is simply requesting the size of the storage // allocation break; } if (num_items == 0 || num_segments == 0) { break; } error = CubDebug(temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)); if (cudaSuccess != error) { break; } //------------------------------------------------------------------------ // Sort //------------------------------------------------------------------------ const bool is_num_passes_odd = GetNumPasses(radix_bits) & 1; /** * This algorithm sorts segments that don't fit into shared memory with * the in-global-memory radix sort. Radix sort splits key representation * into multiple "digits". Each digit is RADIX_BITS wide. The algorithm * iterates over these digits. Each of these iterations consists of a * couple of stages. The first stage computes a histogram for a current * digit in each segment key. This histogram helps to determine the * starting position of the keys group with a similar digit. * For example: * keys_digits = [ 1, 0, 0, 1 ] * digit_prefix = [ 0, 2 ] * The second stage checks the keys again and increments the prefix to * determine the final position of the key: * * expression | key | idx | result * ----------------------------------- | ----- | ------- | -------------- * result[prefix[keys[0]]++] = keys[0] | 1 | 2 | [ ?, ?, 1, ? ] * result[prefix[keys[1]]++] = keys[0] | 0 | 0 | [ 0, ?, 1, ? ] * result[prefix[keys[2]]++] = keys[0] | 0 | 1 | [ 0, 0, 1, ? ] * result[prefix[keys[3]]++] = keys[0] | 1 | 3 | [ 0, 0, 1, 1 ] * * If the resulting memory is aliased to the input one, we'll face the * following issues: * * input | key | idx | result/input | issue * -------------- | ----- | ------- | ---------------- | ---------------- * [ 1, 0, 0, 1 ] | 1 | 2 | [ 1, 0, 1, 1 ] | overwrite keys[2] * [ 1, 0, 1, 1 ] | 0 | 0 | [ 0, 0, 1, 1 ] | * [ 0, 0, 1, 1 ] | 1 | 3 | [ 0, 0, 1, 1 ] | extra key * [ 0, 0, 1, 1 ] | 1 | 4 | [ 0, 0, 1, 1 ] 1 | OOB access * * To avoid these issues, we have to use extra memory. The extra memory * holds temporary storage for writing intermediate results of each stage. * Since we iterate over digits in keys, we potentially need: * `sizeof(KeyT) * num_items * DivideAndRoundUp(sizeof(KeyT),RADIX_BITS)` * auxiliary memory bytes. To reduce the auxiliary memory storage * requirements, the algorithm relies on a double buffer facility. The * idea behind it is in swapping destination and source buffers at each * iteration. 
This way, we can use only two buffers. One of these buffers * can be the final algorithm output destination. Therefore, only one * auxiliary array is needed. Depending on the number of iterations, we * can initialize the double buffer so that the algorithm output array * will match the double buffer result one at the final iteration. * A user can provide this algorithm with a double buffer straightaway to * further reduce the auxiliary memory requirements. `is_overwrite_okay` * indicates this use case. */ detail::device_double_buffer d_keys_double_buffer( (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : keys_allocation.get(), (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? keys_allocation.get() : d_keys.Alternate()); detail::device_double_buffer d_values_double_buffer( (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : values_allocation.get(), (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? values_allocation.get() : d_values.Alternate()); if (partition_segments) { // Partition input segments into size groups and assign specialized // kernels for each of them. error = SortWithPartitioning( DeviceSegmentedSortKernelLarge, DeviceSegmentedSortKernelSmall, three_way_partition_temp_storage_bytes, d_keys_double_buffer, d_values_double_buffer, large_segments_selector, small_segments_selector, device_partition_temp_storage, large_and_medium_segments_indices, small_segments_indices, group_sizes); } else { // If there are not enough segments, there's no reason to spend time // on extra partitioning steps. error = SortWithoutPartitioning( DeviceSegmentedSortFallbackKernel, d_keys_double_buffer, d_values_double_buffer); } d_keys.selector = GetFinalSelector(d_keys.selector, radix_bits); d_values.selector = GetFinalSelector(d_values.selector, radix_bits); } while (false); return error; } CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream) { using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchSegmentedSort dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (false); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, DoubleBuffer& d_keys, DoubleBuffer& d_values, OffsetT num_items, int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, bool is_overwrite_okay, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_begin_offsets, d_end_offsets, is_overwrite_okay, stream); } private: CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int GetNumPasses(int radix_bits) { constexpr int byte_size = 8; constexpr int num_bits = sizeof(KeyT) * byte_size; const int num_passes = 
DivideAndRoundUp(num_bits, radix_bits); return num_passes; } CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE int GetFinalSelector(int selector, int radix_bits) { // Sorted data always ends up in the other vector if (!is_overwrite_okay) { return (selector + 1) & 1; } return (selector + GetNumPasses(radix_bits)) & 1; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE T* GetFinalOutput(int radix_bits, DoubleBuffer& buffer) { const int final_selector = GetFinalSelector(buffer.selector, radix_bits); return buffer.d_buffers[final_selector]; } template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t SortWithPartitioning( LargeKernelT large_kernel, SmallKernelT small_kernel, std::size_t three_way_partition_temp_storage_bytes, cub::detail::device_double_buffer& d_keys_double_buffer, cub::detail::device_double_buffer& d_values_double_buffer, LargeSegmentsSelectorT& large_segments_selector, SmallSegmentsSelectorT& small_segments_selector, cub::detail::temporary_storage::alias& device_partition_temp_storage, cub::detail::temporary_storage::alias& large_and_medium_segments_indices, cub::detail::temporary_storage::alias& small_segments_indices, cub::detail::temporary_storage::alias& group_sizes) { cudaError_t error = cudaSuccess; auto medium_indices_iterator = THRUST_NS_QUALIFIER::make_reverse_iterator(large_and_medium_segments_indices.get() + num_segments); error = CubDebug(cub::DevicePartition::IfNoNVTX( device_partition_temp_storage.get(), three_way_partition_temp_storage_bytes, THRUST_NS_QUALIFIER::counting_iterator(0), large_and_medium_segments_indices.get(), small_segments_indices.get(), medium_indices_iterator, group_sizes.get(), num_segments, large_segments_selector, small_segments_selector, stream)); if (cudaSuccess != error) { return error; } // The device path is only used (and only compiles) when CDP is enabled. // It's defined in a macro since we can't put `#ifdef`s inside of // `NV_IF_TARGET`. 
#ifndef CUB_RDC_ENABLED # define CUB_TEMP_DEVICE_CODE #else // CUB_RDC_ENABLED # define CUB_TEMP_DEVICE_CODE \ using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy; \ error = \ THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ .doit( \ DeviceSegmentedSortContinuationKernel< \ MaxPolicyT, \ LargeKernelT, \ SmallKernelT, \ KeyT, \ ValueT, \ BeginOffsetIteratorT, \ EndOffsetIteratorT>, \ large_kernel, \ small_kernel, \ num_segments, \ d_keys.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ d_keys_double_buffer, \ d_values.Current(), \ GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ d_values_double_buffer, \ d_begin_offsets, \ d_end_offsets, \ group_sizes.get(), \ large_and_medium_segments_indices.get(), \ small_segments_indices.get()); \ error = CubDebug(error); \ \ if (cudaSuccess != error) \ { \ return error; \ } \ \ error = CubDebug(detail::DebugSyncStream(stream)); \ if (cudaSuccess != error) \ { \ return error; \ } #endif // CUB_RDC_ENABLED // Clang format mangles some of this NV_IF_TARGET block // clang-format off NV_IF_TARGET( NV_IS_HOST, ( unsigned int h_group_sizes[num_selected_groups]; error = CubDebug(cudaMemcpyAsync(h_group_sizes, group_sizes.get(), num_selected_groups * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream)); if (cudaSuccess != error) { return error; } error = CubDebug(SyncStream(stream)); if (cudaSuccess != error) { return error; } error = DeviceSegmentedSortContinuation( large_kernel, small_kernel, num_segments, d_keys.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), d_keys_double_buffer, d_values.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), d_values_double_buffer, d_begin_offsets, d_end_offsets, h_group_sizes, large_and_medium_segments_indices.get(), small_segments_indices.get(), stream);), // NV_IS_DEVICE: (CUB_TEMP_DEVICE_CODE)); // clang-format on #undef CUB_TEMP_DEVICE_CODE return error; } template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t SortWithoutPartitioning( FallbackKernelT fallback_kernel, cub::detail::device_double_buffer& d_keys_double_buffer, cub::detail::device_double_buffer& d_values_double_buffer) { cudaError_t error = cudaSuccess; const auto blocks_in_grid = static_cast(num_segments); constexpr auto threads_in_block = static_cast(LargeSegmentPolicyT::BLOCK_THREADS); // Log kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceSegmentedSortFallbackKernel<<<%d, %d, " "0, %lld>>>(), %d items per thread, bit_grain %d\n", blocks_in_grid, threads_in_block, (long long) stream, LargeSegmentPolicyT::ITEMS_PER_THREAD, LargeSegmentPolicyT::RADIX_BITS); #endif // Invoke fallback kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(fallback_kernel, d_keys.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), d_keys_double_buffer, d_values.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), d_values_double_buffer, d_begin_offsets, d_end_offsets); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { return error; } return error; } }; CUB_NAMESPACE_END 
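// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the dispatch layer above).
// DispatchSegmentedSort follows the usual two-phase CUB convention: when
// d_temp_storage is nullptr the call only writes the required size to
// temp_storage_bytes, and a second call with the allocated buffer performs
// the sort. The sketch below shows that pattern through the public
// cub::DeviceSegmentedSort::SortKeys front end; the buffer names, the
// num_segments + 1 offset layout, and the error handling are assumptions made
// for this example, not code taken from this file.
#if 0 // example only, kept out of the build
#  include <cub/device/device_segmented_sort.cuh>

#  include <cuda_runtime.h>

cudaError_t example_segmented_sort_keys(
  const int* d_keys_in, // device array of num_items unsorted keys (assumed)
  int* d_keys_out, // device array of num_items sorted keys (assumed)
  const int* d_offsets, // device array of num_segments + 1 offsets (assumed)
  int num_items,
  int num_segments,
  cudaStream_t stream)
{
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;

  // Phase 1: query the required temporary storage size; no sorting happens.
  cudaError_t error = cub::DeviceSegmentedSort::SortKeys(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_keys_out,
    num_items,
    num_segments,
    d_offsets,
    d_offsets + 1,
    stream);
  if (error != cudaSuccess)
  {
    return error;
  }

  error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (error != cudaSuccess)
  {
    return error;
  }

  // Phase 2: run the segmented sort with the allocated temporary storage.
  error = cub::DeviceSegmentedSort::SortKeys(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_keys_out,
    num_items,
    num_segments,
    d_offsets,
    d_offsets + 1,
    stream);

  cudaFree(d_temp_storage);
  return error;
}
#endif // example only
// ---------------------------------------------------------------------------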
cccl-2.5.0/cub/cub/device/dispatch/dispatch_select_if.cuh000066400000000000000000000543551463375617100233500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences * of data items residing within device-accessible memory. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { /** * @brief Wrapper that partially specializes the `AgentSelectIf` on the non-type name parameter `KeepRejects`. 
*/ template struct agent_select_if_wrapper_t { // Using an explicit list of template parameters forwarded to AgentSelectIf, since MSVC complains about a template // argument following a parameter pack expansion like `AgentSelectIf` template struct agent_t : public AgentSelectIf { using AgentSelectIf::AgentSelectIf; }; }; } // namespace detail /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * Select kernel entry point (multi-block) * * Performs functor-based selection if SelectOpT functor type != NullType * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * * @tparam InputIteratorT * Random-access input iterator type for reading input items * * @tparam FlagsInputIteratorT * Random-access input iterator type for reading selection flags (NullType* if a selection functor * or discontinuity flagging is to be used for selection) * * @tparam SelectedOutputIteratorT * Random-access output iterator type for writing selected items * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @tparam ScanTileStateT * Tile status interface type * * @tparam SelectOpT * Selection operator type (NullType if selection flags or discontinuity flagging is * to be used for selection) * * @tparam EqualityOpT * Equality operator type (NullType if selection functor or selection flags is * to be used for selection) * * @tparam OffsetT * Signed integer type for global offsets * * @tparam KEEP_REJECTS * Whether or not we push rejected items to the back of the output * * @param[in] d_in * Pointer to the input sequence of data items * * @param[in] d_flags * Pointer to the input sequence of selection flags (if applicable) * * @param[out] d_selected_out * Pointer to the output sequence of selected data items * * @param[out] d_num_selected_out * Pointer to the total number of items selected (i.e., length of \p d_selected_out) * * @param[in] tile_status * Tile status interface * * @param[in] select_op * Selection operator * * @param[in] equality_op * Equality operator * * @param[in] num_items * Total number of input items (i.e., length of \p d_in) * * @param[in] num_tiles * Total number of tiles for the entire problem * * @param[in] vsmem * Memory to support virtual shared memory */ template __launch_bounds__(int( cub::detail::vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, detail::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, SelectOpT, EqualityOpT, OffsetT>::agent_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel( InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, int num_tiles, cub::detail::vsmem_t vsmem) { using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT, detail::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, SelectOpT, EqualityOpT, OffsetT>; using AgentSelectIfPolicyT = typename VsmemHelperT::agent_policy_t; // Thread block type for selecting data from input tiles using AgentSelectIfT = typename 
VsmemHelperT::agent_t; // Static shared memory allocation __shared__ typename VsmemHelperT::static_temp_storage_t static_temp_storage; // Get temporary storage typename AgentSelectIfT::TempStorage& temp_storage = VsmemHelperT::get_temp_storage(static_temp_storage, vsmem); // Process tiles AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items) .ConsumeRange(num_tiles, tile_status, d_num_selected_out); // If applicable, hints to discard modified cache lines for vsmem VsmemHelperT::discard_temp_storage(temp_storage); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect * * @tparam InputIteratorT * Random-access input iterator type for reading input items * * @tparam FlagsInputIteratorT * Random-access input iterator type for reading selection flags * (NullType* if a selection functor or discontinuity flagging is to be used for selection) * * @tparam SelectedOutputIteratorT * Random-access output iterator type for writing selected items * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @tparam SelectOpT * Selection operator type (NullType if selection flags or discontinuity flagging is * to be used for selection) * * @tparam EqualityOpT * Equality operator type (NullType if selection functor or selection flags is to * be used for selection) * * @tparam OffsetT * Signed integer type for global offsets * * @tparam KEEP_REJECTS * Whether or not we push rejected items to the back of the output */ template , cub::detail::value_t, OffsetT, MayAlias, KEEP_REJECTS>> struct DispatchSelectIf : SelectedPolicy { /****************************************************************************** * Types and constants ******************************************************************************/ // Tile status descriptor interface type using ScanTileStateT = ScanTileState; static constexpr int INIT_KERNEL_THREADS = 128; /// Device-accessible allocation of temporary storage. /// When `nullptr`, the required allocation size is written to `temp_storage_bytes` /// and no work is done. void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t& temp_storage_bytes; /// Pointer to the input sequence of data items InputIteratorT d_in; /// Pointer to the input sequence of selection flags (if applicable) FlagsInputIteratorT d_flags; /// Pointer to the output sequence of selected data items SelectedOutputIteratorT d_selected_out; /// Pointer to the total number of items selected (i.e., length of `d_selected_out`) NumSelectedIteratorT d_num_selected_out; /// Selection operator SelectOpT select_op; /// Equality operator EqualityOpT equality_op; /// Total number of input items (i.e., length of `d_in`) OffsetT num_items; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; int ptx_version; /** * @param d_temp_storage * Device-accessible allocation of temporary storage. * When `nullptr`, the required allocation size is written to `temp_storage_bytes` * and no work is done. 
* * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in * Pointer to the input sequence of data items * * @param d_flags * Pointer to the input sequence of selection flags (if applicable) * * @param d_selected_out * Pointer to the output sequence of selected data items * * @param d_num_selected_out * Pointer to the total number of items selected (i.e., length of `d_selected_out`) * * @param select_op * Selection operator * * @param equality_op * Equality operator * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param stream * CUDA stream to launch kernels within. Default is stream0. */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchSelectIf( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_flags(d_flags) , d_selected_out(d_selected_out) , d_num_selected_out(d_num_selected_out) , select_op(select_op) , equality_op(equality_op) , num_items(num_items) , stream(stream) , ptx_version(ptx_version) {} /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ /** * Internal dispatch routine for computing a device-wide selection using the * specified kernel functions. */ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ScanInitKernelPtrT scan_init_kernel, SelectIfKernelPtrT select_if_kernel) { using Policy = typename ActivePolicyT::SelectIfPolicyT; using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< Policy, detail::agent_select_if_wrapper_t::template agent_t, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, SelectOpT, EqualityOpT, OffsetT>; cudaError error = cudaSuccess; constexpr auto block_threads = VsmemHelperT::agent_policy_t::BLOCK_THREADS; constexpr auto items_per_thread = VsmemHelperT::agent_policy_t::ITEMS_PER_THREAD; constexpr int tile_size = block_threads * items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); const auto vsmem_size = num_tiles * VsmemHelperT::vsmem_per_block; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Specify temporary storage allocation requirements size_t allocation_sizes[2] = {0ULL, vsmem_size}; // bytes needed for tile status descriptors error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; } // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void* allocations[2] = {}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_status; error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { break; } // Log scan_init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef 
CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog( "Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke scan_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(scan_init_kernel, tile_status, num_tiles, d_num_selected_out); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Return if empty problem if (num_items == 0) { break; } // Get max x-dimension of grid int max_dim_x; error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); if (cudaSuccess != error) { break; } // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out select_if_kernel, block_threads)); if (cudaSuccess != error) { break; } _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, " "%lld>>>(), %d items per thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, block_threads, (long long) stream, items_per_thread, range_select_sm_occupancy); } #endif // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(select_if_kernel, d_in, d_flags, d_selected_out, d_num_selected_out, tile_status, select_op, equality_op, num_items, num_tiles, cub::detail::vsmem_t{allocations[1]}); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; return Invoke( DeviceCompactInitKernel, DeviceSelectSweepKernel); } /** * Internal dispatch routine * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When `nullptr`, the required allocation size is written to `temp_storage_bytes` * and no work is done. * * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in * Pointer to the input sequence of data items * * @param d_flags * Pointer to the input sequence of selection flags (if applicable) * * @param d_selected_out * Pointer to the output sequence of selected data items * * @param d_num_selected_out * Pointer to the total number of items selected (i.e., length of `d_selected_out`) * * @param select_op * Selection operator * * @param equality_op * Equality operator * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param stream * CUDA stream to launch kernels within. Default is stream0. 
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; int ptx_version = 0; if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) { return error; } DispatchSelectIf dispatch( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_selected_out, d_num_selected_out, select_op, equality_op, num_items, stream, ptx_version); return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, FlagsInputIteratorT d_flags, SelectedOutputIteratorT d_selected_out, NumSelectedIteratorT d_num_selected_out, SelectOpT select_op, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_flags, d_selected_out, d_num_selected_out, select_op, equality_op, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_spmv_orig.cuh000066400000000000000000001012431463375617100234050ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector * multiplication (SpMV). 
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * SpMV kernel entry points *****************************************************************************/ /** * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. * * @tparam AgentSpmvPolicyT * Parameterized SpmvPolicy tuning policy type * * @tparam ValueT * Matrix and vector value type * * @tparam OffsetT * Signed integer type for sequence offsets * * @param[in] spmv_params * SpMV input parameter bundle */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) { typedef CacheModifiedInputIterator VectorValueIteratorT; VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; if (row_idx < spmv_params.num_rows) { OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; ValueT value = 0.0; if (end_nonzero_idx != nonzero_idx) { value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; } spmv_params.d_vector_y[row_idx] = value; } } /** * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. * * @tparam SpmvPolicyT * Parameterized SpmvPolicy tuning policy type * * @tparam OffsetT * Signed integer type for sequence offsets * * @tparam CoordinateT * Merge path coordinate type * * @tparam SpmvParamsT * SpmvParams type * * @param[in] num_merge_tiles * Number of SpMV merge tiles (spmv grid size) * * @param[out] d_tile_coordinates * Pointer to the temporary array of tile starting coordinates * * @param[in] spmv_params * SpMV input parameter bundle */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel(int num_merge_tiles, CoordinateT* d_tile_coordinates, SpmvParamsT spmv_params) { /// Constants enum { BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, }; typedef CacheModifiedInputIterator RowOffsetsSearchIteratorT; // Find the starting coordinate for all tiles (plus the end coordinate of the last one) int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; if (tile_idx < num_merge_tiles + 1) { OffsetT diagonal = (tile_idx * TILE_ITEMS); CoordinateT tile_coordinate; CountingInputIterator nonzero_indices(0); // Search the merge path MergePathSearch( diagonal, RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), nonzero_indices, spmv_params.num_rows, spmv_params.num_nonzeros, tile_coordinate); // Output starting offset d_tile_coordinates[tile_idx] = tile_coordinate; } } /** * @brief Spmv agent entry point * * @tparam SpmvPolicyT * Parameterized SpmvPolicy tuning policy type * * @tparam ScanTileStateT * Tile status interface type * * @tparam ValueT * Matrix and vector value type * * @tparam OffsetT * Signed integer type for sequence offsets * * @tparam CoordinateT * Merge path coordinate type * * @tparam HAS_ALPHA * Whether the input parameter Alpha is 1 * * @tparam HAS_BETA * Whether the input parameter Beta is 0 * * @param[in] 
spmv_params * SpMV input parameter bundle * * @param[in] d_tile_coordinates * Pointer to the temporary array of tile starting coordinates * * @param[out] d_tile_carry_pairs * Pointer to the temporary array carry-out dot product row-ids, one per block * * @param[in] num_tiles * Number of merge tiles * * @param[in] tile_state * Tile status interface for fixup reduce-by-key kernel * * @param[in] num_segment_fixup_tiles * Number of reduce-by-key tiles (fixup grid size) */ template __launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( SpmvParams spmv_params, CoordinateT* d_tile_coordinates, KeyValuePair* d_tile_carry_pairs, int num_tiles, ScanTileStateT tile_state, int num_segment_fixup_tiles) { // Spmv agent type specialization typedef AgentSpmv AgentSpmvT; // Shared memory for AgentSpmv __shared__ typename AgentSpmvT::TempStorage temp_storage; AgentSpmvT(temp_storage, spmv_params).ConsumeTile(d_tile_coordinates, d_tile_carry_pairs, num_tiles); // Initialize fixup tile status tile_state.InitializeStatus(num_segment_fixup_tiles); } /** * @tparam ValueT * Matrix and vector value type * * @tparam OffsetT * Signed integer type for sequence offsets * * @tparam HAS_BETA * Whether the input parameter Beta is 0 */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) { const int row = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (row < spmv_params.num_rows) { ValueT result = 0.0; _CCCL_IF_CONSTEXPR (HAS_BETA) { result += spmv_params.beta * spmv_params.d_vector_y[row]; } spmv_params.d_vector_y[row] = result; } } /** * @brief Multi-block reduce-by-key sweep kernel entry point * * @tparam AgentSegmentFixupPolicyT * Parameterized AgentSegmentFixupPolicy tuning policy type * * @tparam PairsInputIteratorT * Random-access input iterator type for keys * * @tparam AggregatesOutputIteratorT * Random-access output iterator type for values * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ScanTileStateT * Tile status interface type * * @param[in] d_pairs_in * Pointer to the array carry-out dot product row-ids, one per spmv block * * @param[in,out] d_aggregates_out * Output value aggregates * * @param[in] num_items * Total number of items to select from * * @param[in] num_tiles * Total number of tiles for the entire problem * * @param[in] tile_state * Tile status interface */ template __launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( PairsInputIteratorT d_pairs_in, AggregatesOutputIteratorT d_aggregates_out, OffsetT num_items, int num_tiles, ScanTileStateT tile_state) { // Thread block type for reducing tiles of value segments typedef AgentSegmentFixup AgentSegmentFixupT; // Shared memory for AgentSegmentFixup __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; // Process tiles AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()) .ConsumeRange(num_items, num_tiles, tile_state); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv * * @tparam ValueT * Matrix and vector value type * * @tparam OffsetT * Signed integer type for global offsets */ template struct DispatchSpmv { //--------------------------------------------------------------------- // Constants and 
Types //--------------------------------------------------------------------- enum { INIT_KERNEL_THREADS = 128, EMPTY_MATRIX_KERNEL_THREADS = 128 }; // SpmvParams bundle type typedef SpmvParams SpmvParamsT; // 2D merge path coordinate type typedef typename CubVector::Type CoordinateT; // Tile status descriptor interface type typedef ReduceByKeyScanTileState ScanTileStateT; // Tuple type for scanning (pairs accumulated segment-value with segment-index) typedef KeyValuePair KeyValuePairT; //--------------------------------------------------------------------- // Tuning policies //--------------------------------------------------------------------- /// SM35 struct Policy350 { typedef AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 96 : 128, (sizeof(ValueT) > 4) ? 4 : 7, LOAD_LDG, LOAD_CA, LOAD_LDG, LOAD_LDG, LOAD_LDG, (sizeof(ValueT) > 4) ? true : false, BLOCK_SCAN_WARP_SCANS> SpmvPolicyT; typedef AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SegmentFixupPolicyT; }; /// SM37 struct Policy370 { typedef AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 128 : 128, (sizeof(ValueT) > 4) ? 9 : 14, LOAD_LDG, LOAD_CA, LOAD_LDG, LOAD_LDG, LOAD_LDG, false, BLOCK_SCAN_WARP_SCANS> SpmvPolicyT; typedef AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SegmentFixupPolicyT; }; /// SM50 struct Policy500 { typedef AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128, (sizeof(ValueT) > 4) ? 6 : 7, LOAD_LDG, LOAD_DEFAULT, (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, LOAD_LDG, (sizeof(ValueT) > 4) ? true : false, (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> SpmvPolicyT; typedef AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_VECTORIZE, LOAD_LDG, BLOCK_SCAN_RAKING_MEMOIZE> SegmentFixupPolicyT; }; /// SM60 struct Policy600 { typedef AgentSpmvPolicy<(sizeof(ValueT) > 4) ? 64 : 128, (sizeof(ValueT) > 4) ? 
5 : 7, LOAD_DEFAULT, LOAD_DEFAULT, LOAD_DEFAULT, LOAD_DEFAULT, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS> SpmvPolicyT; typedef AgentSegmentFixupPolicy<128, 3, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS> SegmentFixupPolicyT; }; //--------------------------------------------------------------------- // Tuning policies of current PTX compiler pass //--------------------------------------------------------------------- #if (CUB_PTX_ARCH >= 600) typedef Policy600 PtxPolicy; #elif (CUB_PTX_ARCH >= 500) typedef Policy500 PtxPolicy; #elif (CUB_PTX_ARCH >= 370) typedef Policy370 PtxPolicy; #else typedef Policy350 PtxPolicy; #endif // "Opaque" policies (whose parameterizations aren't reflected in the type signature) struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; //--------------------------------------------------------------------- // Utilities //--------------------------------------------------------------------- /** * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use */ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static void InitConfigs(int ptx_version, KernelConfig& spmv_config, KernelConfig& segment_fixup_config) { NV_IF_TARGET( NV_IS_DEVICE, ( // We're on the device, so initialize the kernel dispatch // configurations with the current PTX policy spmv_config.template Init(); segment_fixup_config.template Init();), ( // We're on the host, so lookup and initialize the kernel dispatch // configurations with the policies that match the device's PTX // version if (ptx_version >= 600) { spmv_config.template Init(); segment_fixup_config.template Init(); } else if (ptx_version >= 500) { spmv_config.template Init(); segment_fixup_config.template Init(); } else if (ptx_version >= 370) { spmv_config.template Init(); segment_fixup_config.template Init(); } else { spmv_config.template Init(); segment_fixup_config.template Init(); })); } /** * Kernel kernel dispatch configuration. */ struct KernelConfig { int block_threads; int items_per_thread; int tile_items; template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE void Init() { block_threads = PolicyT::BLOCK_THREADS; items_per_thread = PolicyT::ITEMS_PER_THREAD; tile_items = block_threads * items_per_thread; } }; //--------------------------------------------------------------------- // Dispatch entrypoints //--------------------------------------------------------------------- /** * Internal dispatch routine for computing a device-wide reduction using the * specified kernel functions. * * If the input is larger than a single tile, this method uses two-passes of * kernel invocations. * * @tparam Spmv1ColKernelT * Function type of cub::DeviceSpmv1ColKernel * * @tparam SpmvSearchKernelT * Function type of cub::AgentSpmvSearchKernel * * @tparam SpmvKernelT * Function type of cub::AgentSpmvKernel * * @tparam SegmentFixupKernelT * Function type of cub::DeviceSegmentFixupKernelT * * @tparam SpmvEmptyMatrixKernelT * Function type of cub::DeviceSpmvEmptyMatrixKernel * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * * @paramSpMV spmv_params * input parameter bundle * * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. 
* * @param[in] spmv_1col_kernel * Kernel function pointer to parameterization of DeviceSpmv1ColKernel * * @param[in] spmv_search_kernel * Kernel function pointer to parameterization of AgentSpmvSearchKernel * * @param[in] spmv_kernel * Kernel function pointer to parameterization of AgentSpmvKernel * * @param[in] segment_fixup_kernel * Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel * * @param[in] spmv_empty_matrix_kernel * Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel * * @param[in] spmv_config * Dispatch parameters that match the policy that @p spmv_kernel was compiled for * * @param[in] segment_fixup_config * Dispatch parameters that match the policy that @p segment_fixup_kernel was compiled for */ template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, SpmvParamsT& spmv_params, cudaStream_t stream, Spmv1ColKernelT spmv_1col_kernel, SpmvSearchKernelT spmv_search_kernel, SpmvKernelT spmv_kernel, SegmentFixupKernelT segment_fixup_kernel, SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, KernelConfig spmv_config, KernelConfig segment_fixup_config) { cudaError error = cudaSuccess; do { if (spmv_params.num_rows < 0 || spmv_params.num_cols < 0) { return cudaErrorInvalidValue; } if (spmv_params.num_rows == 0 || spmv_params.num_cols == 0) { // Empty problem, no-op. if (d_temp_storage == NULL) { temp_storage_bytes = 1; } break; } if (spmv_params.num_nonzeros == 0) { if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation temp_storage_bytes = 1; break; } constexpr int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS; const int blocks_in_grid = cub::DivideAndRoundUp(spmv_params.num_rows, threads_in_block); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n", blocks_in_grid, threads_in_block, (long long) stream); #endif error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(spmv_empty_matrix_kernel, spmv_params); if (CubDebug(error)) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } break; } if (spmv_params.num_cols == 1) { if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation temp_storage_bytes = 1; break; } // Get search/init grid dims int degen_col_kernel_block_size = INIT_KERNEL_THREADS; int degen_col_kernel_grid_size = cub::DivideAndRoundUp(spmv_params.num_rows, degen_col_kernel_block_size); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); #endif // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream) .doit(spmv_1col_kernel, spmv_params); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } break; } // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) { break; } // Get SM count int sm_count; if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) { break; } // Get 
max x-dimension of grid int max_dim_x; if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) { break; } // Total number of spmv work items int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; // Tile sizes of kernels int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; // Number of tiles for kernels int num_merge_tiles = cub::DivideAndRoundUp(num_merge_items, merge_tile_size); int num_segment_fixup_tiles = cub::DivideAndRoundUp(num_merge_tiles, segment_fixup_tile_size); // Get SM occupancy for kernels int spmv_sm_occupancy; if (CubDebug(error = MaxSmOccupancy(spmv_sm_occupancy, spmv_kernel, spmv_config.block_threads))) { break; } int segment_fixup_sm_occupancy; if (CubDebug(error = MaxSmOccupancy( segment_fixup_sm_occupancy, segment_fixup_kernel, segment_fixup_config.block_threads))) { break; } // Get grid dimensions dim3 spmv_grid_size(CUB_MIN(num_merge_tiles, max_dim_x), cub::DivideAndRoundUp(num_merge_tiles, max_dim_x), 1); dim3 segment_fixup_grid_size( CUB_MIN(num_segment_fixup_tiles, max_dim_x), cub::DivideAndRoundUp(num_segment_fixup_tiles, max_dim_x), 1); // Get the temporary storage allocation requirements size_t allocation_sizes[3]; if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) { break; // bytes needed for reduce-by-key tile status descriptors } allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) void* allocations[3] = {}; if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_state; if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) { break; } // Alias the other allocations KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates // Get search/init grid dims int search_block_size = INIT_KERNEL_THREADS; int search_grid_size = cub::DivideAndRoundUp(num_merge_tiles + 1, search_block_size); if (search_grid_size < sm_count) // if (num_merge_tiles < spmv_sm_occupancy * sm_count) { // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords d_tile_coordinates = NULL; } else { // Use separate search kernel if we have enough spmv tiles to saturate the device // Log spmv_search_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", search_grid_size, search_block_size, (long long) stream); #endif // Invoke spmv_search_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(search_grid_size, search_block_size, 0, stream) .doit(spmv_search_kernel, num_merge_tiles, d_tile_coordinates, spmv_params); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = 
detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } // Log spmv_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); #endif // Invoke spmv_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(spmv_grid_size, spmv_config.block_threads, 0, stream) .doit(spmv_kernel, spmv_params, d_tile_coordinates, d_tile_carry_pairs, num_merge_tiles, tile_state, num_segment_fixup_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Run reduce-by-key fixup if necessary if (num_merge_tiles > 1) { // Log segment_fixup_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); #endif // Invoke segment_fixup_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream) .doit(segment_fixup_kernel, d_tile_carry_pairs, spmv_params.d_vector_y, num_merge_tiles, num_segment_fixup_tiles, tile_state); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } } while (0); return error; } template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE static cudaError_t Dispatch(void* d_temp_storage, size_t& temp_storage_bytes, SpmvParamsT& spmv_params, cudaStream_t stream, bool debug_synchronous, Spmv1ColKernelT spmv_1col_kernel, SpmvSearchKernelT spmv_search_kernel, SpmvKernelT spmv_kernel, SegmentFixupKernelT segment_fixup_kernel, SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, KernelConfig spmv_config, KernelConfig segment_fixup_config) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, spmv_params, stream, spmv_1col_kernel, spmv_search_kernel, spmv_kernel, segment_fixup_kernel, spmv_empty_matrix_kernel, spmv_config, segment_fixup_config); } /** * @brief Internal dispatch routine for computing a device-wide reduction * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param SpMV spmv_params * input parameter bundle * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is stream0. 
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(void* d_temp_storage, size_t& temp_storage_bytes, SpmvParamsT& spmv_params, cudaStream_t stream = 0) { cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Get kernel kernel dispatch configurations KernelConfig spmv_config, segment_fixup_config; InitConfigs(ptx_version, spmv_config, segment_fixup_config); constexpr bool has_alpha = false; constexpr bool has_beta = false; if (CubDebug( error = Dispatch( d_temp_storage, temp_storage_bytes, spmv_params, stream, DeviceSpmv1ColKernel, DeviceSpmvSearchKernel, DeviceSpmvKernel, DeviceSegmentFixupKernel, DeviceSpmvEmptyMatrixKernel, spmv_config, segment_fixup_config))) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, SpmvParamsT& spmv_params, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_three_way_partition.cuh000066400000000000000000000370541463375617100254700ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel( InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_status, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, int num_tiles) { using AgentThreeWayPartitionPolicyT = typename ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy; // Thread block type for selecting data from input tiles using AgentThreeWayPartitionT = AgentThreeWayPartition< AgentThreeWayPartitionPolicyT, InputIteratorT, FirstOutputIteratorT, SecondOutputIteratorT, UnselectedOutputIteratorT, SelectFirstPartOp, SelectSecondPartOp, OffsetT>; // Shared memory for AgentThreeWayPartition __shared__ typename AgentThreeWayPartitionT::TempStorage temp_storage; // Process tiles AgentThreeWayPartitionT( temp_storage, d_in, d_first_part_out, d_second_part_out, d_unselected_out, select_first_part_op, select_second_part_op, num_items) .ConsumeRange(num_tiles, tile_status, d_num_selected_out); } /** * @brief Initialization kernel for tile status initialization (multi-block) * * @tparam ScanTileStateT * Tile status interface type * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @param[in] tile_state_1 * Tile status interface * * @param[in] tile_state_2 * Tile status interface * * @param[in] num_tiles * Number of tiles * * @param[out] d_num_selected_out * Pointer to the total number of items selected * (i.e., length of @p d_selected_out) */ template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, int num_tiles, NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); // Initialize d_num_selected_out if (blockIdx.x == 0) { if (threadIdx.x < 2) { d_num_selected_out[threadIdx.x] = 0; } } } /****************************************************************************** * Dispatch ******************************************************************************/ template , OffsetT>> struct DispatchThreeWayPartitionIf { /***************************************************************************** * Types and constants ****************************************************************************/ using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; using ScanTileStateT = cub::ScanTileState; static constexpr int INIT_KERNEL_THREADS = 256; void* d_temp_storage; std::size_t& temp_storage_bytes; InputIteratorT d_in; FirstOutputIteratorT d_first_part_out; SecondOutputIteratorT d_second_part_out; UnselectedOutputIteratorT 
d_unselected_out; NumSelectedIteratorT d_num_selected_out; SelectFirstPartOp select_first_part_op; SelectSecondPartOp select_second_part_op; OffsetT num_items; cudaStream_t stream; CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchThreeWayPartitionIf( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_first_part_out(d_first_part_out) , d_second_part_out(d_second_part_out) , d_unselected_out(d_unselected_out) , d_num_selected_out(d_num_selected_out) , select_first_part_op(select_first_part_op) , select_second_part_op(select_second_part_op) , num_items(num_items) , stream(stream) {} /***************************************************************************** * Dispatch entrypoints ****************************************************************************/ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ScanInitKernelPtrT three_way_partition_init_kernel, SelectIfKernelPtrT three_way_partition_kernel) { cudaError error = cudaSuccess; constexpr int block_threads = ActivePolicyT::ThreeWayPartitionPolicy::BLOCK_THREADS; constexpr int items_per_thread = ActivePolicyT::ThreeWayPartitionPolicy::ITEMS_PER_THREAD; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Number of input tiles int tile_size = block_threads * items_per_thread; int num_tiles = static_cast(DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; // bytes needed for tile status descriptors error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; } // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void* allocations[1] = {}; error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage // allocation break; } // Return if empty problem if (num_items == 0) { break; } // Construct the tile status interface ScanTileStateT tile_status; error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { break; } // Log three_way_partition_init_kernel configuration int init_grid_size = CUB_MAX(1, DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking three_way_partition_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, reinterpret_cast(stream)); #endif // Invoke three_way_partition_init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(three_way_partition_init_kernel, tile_status, num_tiles, d_num_selected_out); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Get 
max x-dimension of grid int max_dim_x; error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); if (cudaSuccess != error) { break; } // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out three_way_partition_kernel, block_threads)); if (cudaSuccess != error) { break; } _CubLog("Invoking three_way_partition_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d " "items per thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, block_threads, reinterpret_cast(stream), items_per_thread, range_select_sm_occupancy); } #endif // Invoke select_if_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(three_way_partition_kernel, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, tile_status, select_first_part_op, select_second_part_op, num_items, num_tiles); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; return Invoke( DeviceThreeWayPartitionInitKernel, DeviceThreeWayPartitionKernel< MaxPolicyT, InputIteratorT, FirstOutputIteratorT, SecondOutputIteratorT, UnselectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectFirstPartOp, SelectSecondPartOp, OffsetT>); } /** * Internal dispatch routine */ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(cub::PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } DispatchThreeWayPartitionIf dispatch( d_temp_storage, temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, select_first_part_op, select_second_part_op, num_items, stream); // Dispatch error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, std::size_t& temp_storage_bytes, InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, UnselectedOutputIteratorT d_unselected_out, NumSelectedIteratorT d_num_selected_out, SelectFirstPartOp select_first_part_op, SelectSecondPartOp select_second_part_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, 
temp_storage_bytes, d_in, d_first_part_out, d_second_part_out, d_unselected_out, d_num_selected_out, select_first_part_op, select_second_part_op, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/dispatch_unique_by_key.cuh000066400000000000000000000512111463375617100242470ustar00rootroot00000000000000 /****************************************************************************** * Copyright (c) NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique * items by key from sequences of data items residing within device-accessible memory. 
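 *
 * @par
 * The dispatch machinery in this file backs the public
 * `cub::DeviceSelect::UniqueByKey` entry point. A minimal usage sketch of that
 * entry point is shown below; the device pointers and `num_items` are assumed
 * to be allocated and populated by the caller and are not defined here.
 *
 * @code
 * #include <cub/device/device_select.cuh>
 *
 * // int* d_keys_in;  int* d_values_in;  int* d_keys_out;  int* d_values_out;
 * // int* d_num_selected_out;  int num_items;
 *
 * // First call: query the required temporary storage size
 * void*  d_temp_storage     = NULL;
 * size_t temp_storage_bytes = 0;
 * cub::DeviceSelect::UniqueByKey(
 *   d_temp_storage, temp_storage_bytes,
 *   d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items);
 *
 * // Allocate the temporary storage, then run the selection
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * cub::DeviceSelect::UniqueByKey(
 *   d_temp_storage, temp_storage_bytes,
 *   d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items);
 * @endcode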
*/ #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * @brief Unique by key kernel entry point (multi-block) * * @tparam KeyInputIteratorT * Random-access input iterator type for keys * * @tparam ValueInputIteratorT * Random-access input iterator type for values * * @tparam KeyOutputIteratorT * Random-access output iterator type for keys * * @tparam ValueOutputIteratorT * Random-access output iterator type for values * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @tparam ScanTileStateT * Tile status interface type * * @tparam EqualityOpT * Equality operator type * * @tparam OffsetT * Signed integer type for global offsets * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[in] d_values_in * Pointer to the input sequence of values * * @param[out] d_keys_out * Pointer to the output sequence of selected data items * * @param[out] d_values_out * Pointer to the output sequence of selected data items * * @param[out] d_num_selected_out * Pointer to the total number of items selected * (i.e., length of @p d_keys_out or @p d_values_out) * * @param[in] tile_state * Tile status interface * * @param[in] equality_op * Equality operator * * @param[in] num_items * Total number of input items * (i.e., length of @p d_keys_in or @p d_values_in) * * @param[in] num_tiles * Total number of tiles for the entire problem * * @param[in] vsmem * Memory to support virtual shared memory */ template __launch_bounds__(int( cub::detail::vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT, AgentUniqueByKey, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, EqualityOpT, OffsetT>::agent_policy_t::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, ScanTileStateT tile_state, EqualityOpT equality_op, OffsetT num_items, int num_tiles, cub::detail::vsmem_t vsmem) { using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT, AgentUniqueByKey, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, EqualityOpT, OffsetT>; using AgentUniqueByKeyPolicyT = typename VsmemHelperT::agent_policy_t; // Thread block type for selecting data from input tiles using AgentUniqueByKeyT = typename VsmemHelperT::agent_t; // Static shared memory allocation __shared__ typename VsmemHelperT::static_temp_storage_t static_temp_storage; // Get temporary storage typename AgentUniqueByKeyT::TempStorage& temp_storage = VsmemHelperT::get_temp_storage(static_temp_storage, vsmem, (blockIdx.x * gridDim.y) + blockIdx.y); // Process tiles AgentUniqueByKeyT(temp_storage, d_keys_in, d_values_in, d_keys_out, d_values_out, equality_op, num_items) .ConsumeRange(num_tiles, tile_state, d_num_selected_out); // If applicable, hints to discard 
modified cache lines for vsmem VsmemHelperT::discard_temp_storage(temp_storage); } /****************************************************************************** * Dispatch ******************************************************************************/ /** * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSelect * * @tparam KeyInputIteratorT * Random-access input iterator type for keys * * @tparam ValueInputIteratorT * Random-access input iterator type for values * * @tparam KeyOutputIteratorT * Random-access output iterator type for keys * * @tparam ValueOutputIteratorT * Random-access output iterator type for values * * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * * @tparam EqualityOpT * Equality operator type * * @tparam OffsetT * Signed integer type for global offsets */ template > struct DispatchUniqueByKey : SelectedPolicy { /****************************************************************************** * Types and constants ******************************************************************************/ enum { INIT_KERNEL_THREADS = 128, }; // The input key and value type using KeyT = typename std::iterator_traits::value_type; using ValueT = typename std::iterator_traits::value_type; // Tile status descriptor interface type using ScanTileStateT = ScanTileState; /// Device-accessible allocation of temporary storage. When NULL, the required allocation size /// is written to `temp_storage_bytes` and no work is done. void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation size_t& temp_storage_bytes; /// Pointer to the input sequence of keys KeyInputIteratorT d_keys_in; /// Pointer to the input sequence of values ValueInputIteratorT d_values_in; /// Pointer to the output sequence of selected data items KeyOutputIteratorT d_keys_out; /// Pointer to the output sequence of selected data items ValueOutputIteratorT d_values_out; /// Pointer to the total number of items selected /// (i.e., length of @p d_keys_out or @p d_values_out) NumSelectedIteratorT d_num_selected_out; /// Equality operator EqualityOpT equality_op; /// Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) OffsetT num_items; /// **[optional]** CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; /** * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @tparam temp_storage_bytes * [in,out] Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[in] d_values_in * Pointer to the input sequence of values * * @param[out] d_keys_out * Pointer to the output sequence of selected data items * * @param[out] d_values_out * Pointer to the output sequence of selected data items * * @param[out] d_num_selected_out * Pointer to the total number of items selected * (i.e., length of @p d_keys_out or @p d_values_out) * * @param[in] equality_op * Equality operator * * @param[in] num_items * Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
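 *
 * @par
 * Note that this constructor only records its arguments into the dispatch
 * functor's members; no temporary storage is inspected and no kernels are
 * launched until `Invoke()` runs (normally via the static `Dispatch()` entry
 * point further below, which selects the tuning policy for the detected PTX
 * version).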
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchUniqueByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys_in(d_keys_in) , d_values_in(d_values_in) , d_keys_out(d_keys_out) , d_values_out(d_values_out) , d_num_selected_out(d_num_selected_out) , equality_op(equality_op) , num_items(num_items) , stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchUniqueByKey( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_keys_in(d_keys_in) , d_values_in(d_values_in) , d_keys_out(d_keys_out) , d_values_out(d_values_out) , d_num_selected_out(d_num_selected_out) , equality_op(equality_op) , num_items(num_items) , stream(stream) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { using Policy = typename ActivePolicyT::UniqueByKeyPolicyT; using VsmemHelperT = cub::detail::vsmem_helper_default_fallback_policy_t< Policy, AgentUniqueByKey, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, EqualityOpT, OffsetT>; cudaError error = cudaSuccess; do { // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); if (cudaSuccess != error) { break; } // Number of input tiles constexpr auto block_threads = VsmemHelperT::agent_policy_t::BLOCK_THREADS; constexpr auto items_per_thread = VsmemHelperT::agent_policy_t::ITEMS_PER_THREAD; int tile_size = block_threads * items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); const auto vsmem_size = num_tiles * VsmemHelperT::vsmem_per_block; // Specify temporary storage allocation requirements size_t allocation_sizes[2] = {0, vsmem_size}; // Bytes needed for tile status descriptors error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); if (cudaSuccess != error) { break; } // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void* allocations[2] = {NULL, NULL}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); if (cudaSuccess != error) { break; } if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_state; error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); if (cudaSuccess != error) { break; } // Log init_kernel configuration num_tiles = CUB_MAX(1, num_tiles); int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking 
init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles, d_num_selected_out); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } // Return if empty problem if (num_items == 0) { break; } // Get max x-dimension of grid int max_dim_x; error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); if (cudaSuccess != error) { break; } // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log select_if_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG { // Get SM occupancy for unique_by_key_kernel int scan_sm_occupancy; error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out scan_kernel, block_threads)); if (cudaSuccess != error) { break; } _CubLog("Invoking unique_by_key_kernel<<<{%d,%d,%d}, %d, 0, " "%lld>>>(), %d items per thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, block_threads, (long long) stream, items_per_thread, scan_sm_occupancy); } #endif // Invoke select_if_kernel error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(scan_kernel, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, tile_state, equality_op, num_items, num_tiles, cub::detail::vsmem_t{allocations[1]}); // Check for failure to launch error = CubDebug(error); if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { break; } } while (0); return error; } template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t Invoke() { using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; // Ensure kernels are instantiated. return Invoke( DeviceCompactInitKernel, DeviceUniqueByKeySweepKernel< MaxPolicyT, KeyInputIteratorT, ValueInputIteratorT, KeyOutputIteratorT, ValueOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, EqualityOpT, OffsetT>); } /** * @brief Internal dispatch routine * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] &temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_keys_in * Pointer to the input sequence of keys * * @param[in] d_values_in * Pointer to the input sequence of values * * @param[out] d_keys_out * Pointer to the output sequence of selected data items * * @param[out] d_values_out * Pointer to the output sequence of selected data items * * @param[out] d_num_selected_out * Pointer to the total number of items selected * (i.e., length of @p d_keys_out or @p d_values_out) * * @param[in] equality_op * Equality operator * * @param[in] num_items * Total number of input items (i.e., the length of @p d_in) * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. 
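 *
 * @par
 * Like the other dispatch layers in CUB, this routine follows the two-phase
 * temporary-storage protocol described by the parameters above. A minimal
 * sketch, assuming a concrete instantiation `DispatchT` of this class template
 * and caller-provided device iterators (none of which are defined here):
 *
 * @code
 * void*  d_temp_storage     = NULL;
 * size_t temp_storage_bytes = 0;
 *
 * // Size query only: no work is done while d_temp_storage is NULL
 * DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
 *                     d_keys_in, d_values_in, d_keys_out, d_values_out,
 *                     d_num_selected_out, cub::Equality{}, num_items, stream);
 *
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 * // Second call performs the unique-by-key selection on `stream`
 * DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
 *                     d_keys_in, d_values_in, d_keys_out, d_values_out,
 *                     d_num_selected_out, cub::Equality{}, num_items, stream);
 * @endcode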
*/ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; cudaError_t error; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchUniqueByKey dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, equality_op, num_items, stream); // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); if (cudaSuccess != error) { break; } } while (0); return error; } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch( void* d_temp_storage, size_t& temp_storage_bytes, KeyInputIteratorT d_keys_in, ValueInputIteratorT d_values_in, KeyOutputIteratorT d_keys_out, ValueOutputIteratorT d_values_out, NumSelectedIteratorT d_num_selected_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch( d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, equality_op, num_items, stream); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/000077500000000000000000000000001463375617100203235ustar00rootroot00000000000000cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_for.cuh000066400000000000000000000045561463375617100232100ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace for_each { struct policy_hub_t { struct policy_350_t : ChainedPolicy<350, policy_350_t, policy_350_t> { using for_policy_t = policy_t<256, 2>; }; using MaxPolicy = policy_350_t; }; } // namespace for_each } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_histogram.cuh000066400000000000000000000147351463375617100244170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace histogram { enum class primitive_sample { no, yes }; enum class sample_size { _1, _2, unknown }; enum class counter_size { _4, unknown }; template constexpr primitive_sample is_primitive_sample() { return Traits::PRIMITIVE ? primitive_sample::yes : primitive_sample::no; } template constexpr counter_size classify_counter_size() { return sizeof(CounterT) == 4 ? counter_size::_4 : counter_size::unknown; } template constexpr sample_size classify_sample_size() { return sizeof(SampleT) == 1 ? sample_size::_1 : sizeof(SampleT) == 2 ? 
sample_size::_2 : sample_size::unknown; } template constexpr int v_scale() { return (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int); } template constexpr int t_scale() { return CUB_MAX((NominalItemsPerThread / NumActiveChannels / v_scale()), 1); } template (), sample_size SampleSize = classify_sample_size()> struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = t_scale(); static constexpr CacheLoadModifier load_modifier = LOAD_LDG; static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr bool rle_compress = true; static constexpr bool work_stealing = false; }; template struct sm90_tuning { static constexpr int threads = 768; static constexpr int items = 12; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr bool rle_compress = false; static constexpr bool work_stealing = false; }; template struct sm90_tuning { static constexpr int threads = 960; static constexpr int items = 10; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr bool rle_compress = true; static constexpr bool work_stealing = false; }; } // namespace histogram template struct device_histogram_policy_hub { template struct TScale { enum { V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NumActiveChannels / V_SCALE), 1) }; }; /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { // TODO This might be worth it to separate usual histogram and the multi one using AgentHistogramPolicyT = AgentHistogramPolicy<128, TScale<8>::VALUE, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLEND, true>; }; /// SM50 struct Policy500 : ChainedPolicy<500, Policy500, Policy350> { // TODO This might be worth it to separate usual histogram and the multi one using AgentHistogramPolicyT = AgentHistogramPolicy<384, TScale<16>::VALUE, cub::BLOCK_LOAD_DIRECT, LOAD_LDG, true, SMEM, false>; }; /// SM900 struct Policy900 : ChainedPolicy<900, Policy900, Policy500> { using tuning = detail::histogram:: sm90_tuning()>; using AgentHistogramPolicyT = AgentHistogramPolicy; }; using MaxPolicy = Policy900; }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh000066400000000000000000000661341463375617100252330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace reduce_by_key { enum class primitive_key { no, yes }; enum class primitive_accum { no, yes }; enum class primitive_op { no, yes }; enum class key_size { _1, _2, _4, _8, _16, unknown }; enum class accum_size { _1, _2, _4, _8, _16, unknown }; template constexpr primitive_key is_primitive_key() { return Traits::PRIMITIVE ? primitive_key::yes : primitive_key::no; } template constexpr primitive_accum is_primitive_accum() { return Traits::PRIMITIVE ? primitive_accum::yes : primitive_accum::no; } template constexpr primitive_op is_primitive_op() { return basic_binary_op_t::value ? primitive_op::yes : primitive_op::no; } template constexpr key_size classify_key_size() { return sizeof(KeyT) == 1 ? key_size::_1 : sizeof(KeyT) == 2 ? key_size::_2 : sizeof(KeyT) == 4 ? key_size::_4 : sizeof(KeyT) == 8 ? key_size::_8 : sizeof(KeyT) == 16 ? key_size::_16 : key_size::unknown; } template constexpr accum_size classify_accum_size() { return sizeof(AccumT) == 1 ? accum_size::_1 : sizeof(AccumT) == 2 ? accum_size::_2 : sizeof(AccumT) == 4 ? accum_size::_4 : sizeof(AccumT) == 8 ? accum_size::_8 : sizeof(AccumT) == 16 ? accum_size::_16 : accum_size::unknown; } template (), primitive_accum PrimitiveAccum = is_primitive_accum(), key_size KeySize = classify_key_size(), accum_size AccumSize = classify_accum_size()> struct sm90_tuning { static constexpr int max_input_bytes = CUB_MAX(sizeof(KeyT), sizeof(AccumT)); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); static constexpr int threads = 128; static constexpr int nominal_4b_items_per_thread = 6; static constexpr int items = (max_input_bytes <= 8) ? 
6 : CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, ((nominal_4b_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; // 8-bit key template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<720>; }; template struct sm90_tuning { static constexpr int threads = 320; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<865>; }; template struct sm90_tuning { static constexpr int threads = 192; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<735>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<580>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1100>; }; // 16-bit key template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<985>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<276, 650>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<240, 765>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 19; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1190>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1175>; }; // 32-bit key template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<404, 645>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1160>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1170>; }; template struct sm90_tuning { static constexpr 
int threads = 128; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1055>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1195>; }; // 64-bit key template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1170>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<236, 1030>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<152, 560>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1030>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1125>; }; // 128-bit key template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1080>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<320, 1005>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<232, 1100>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1195>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1150>; }; template (), primitive_accum PrimitiveAccum = is_primitive_accum(), key_size KeySize = classify_key_size(), accum_size AccumSize = classify_accum_size()> struct sm80_tuning { static constexpr int max_input_bytes = CUB_MAX(sizeof(KeyT), sizeof(AccumT)); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); static constexpr int threads = 128; static constexpr int nominal_4b_items_per_thread = 6; static constexpr int items = (max_input_bytes <= 8) ? 
6 : CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, ((nominal_4b_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; // 8-bit key template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<975>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<840>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<760>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1070>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1175>; }; // 16-bit key template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<620>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<640>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<905>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<810>; }; template struct sm80_tuning { static constexpr int threads = 160; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1115>; }; // 32-bit key template struct sm80_tuning { static constexpr int threads = 288; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1110>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1200>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1110>; }; template struct sm80_tuning { static constexpr int threads = 224; static 
constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1165>; }; template struct sm80_tuning { static constexpr int threads = 160; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1100>; }; // 64-bit key template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1175>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1075>; }; template struct sm80_tuning { static constexpr int threads = 384; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1040>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1080>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<430>; }; // 128-bit key template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1105>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<755>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<535>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1035>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1090>; }; } // namespace reduce_by_key template struct device_reduce_by_key_policy_hub { static constexpr int MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyT), sizeof(AccumT)); static constexpr int COMBINED_INPUT_BYTES = sizeof(KeyT) + sizeof(AccumT); struct DefaultTuning { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 6; static constexpr int ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS, detail::default_reduce_by_key_delay_constructor_t>; }; /// SM35 struct Policy350 : DefaultTuning , ChainedPolicy<350, Policy350, Policy350> {}; /// SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { using tuning = detail::reduce_by_key::sm80_tuning()>; using ReduceByKeyPolicyT = AgentReduceByKeyPolicy; }; /// SM86 struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; /// SM90 struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::reduce_by_key::sm90_tuning()>; using ReduceByKeyPolicyT = AgentReduceByKeyPolicy; }; using MaxPolicy = Policy900; }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh000066400000000000000000000510241463375617100260740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace rle { enum class primitive_key { no, yes }; enum class primitive_length { no, yes }; enum class key_size { _1, _2, _4, _8, _16, unknown }; enum class length_size { _4, unknown }; template constexpr primitive_key is_primitive_key() { return Traits::PRIMITIVE ? primitive_key::yes : primitive_key::no; } template constexpr primitive_length is_primitive_length() { return Traits::PRIMITIVE ? 
primitive_length::yes : primitive_length::no; } template constexpr key_size classify_key_size() { return sizeof(KeyT) == 1 ? key_size::_1 : sizeof(KeyT) == 2 ? key_size::_2 : sizeof(KeyT) == 4 ? key_size::_4 : sizeof(KeyT) == 8 ? key_size::_8 : sizeof(KeyT) == 16 ? key_size::_16 : key_size::unknown; } template constexpr length_size classify_length_size() { return sizeof(LengthT) == 4 ? length_size::_4 : length_size::unknown; } namespace encode { template (), primitive_key PrimitiveKey = is_primitive_key(), length_size LengthSize = classify_length_size(), key_size KeySize = classify_key_size()> struct sm90_tuning { static constexpr int max_input_bytes = CUB_MAX(sizeof(KeyT), sizeof(LengthT)); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); static constexpr int threads = 128; static constexpr int nominal_4b_items_per_thread = 6; static constexpr int items = (max_input_bytes <= 8) ? 6 : CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, ((nominal_4b_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<620>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 22; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<775>; }; template struct sm90_tuning { static constexpr int threads = 192; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<284, 480>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 19; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<515>; }; #if CUB_IS_INT128_ENABLED template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<428, 930>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<428, 930>; }; #endif template (), primitive_key PrimitiveKey = is_primitive_key(), length_size LengthSize = classify_length_size(), key_size KeySize = classify_key_size()> struct sm80_tuning { static constexpr int max_input_bytes = CUB_MAX(sizeof(KeyT), sizeof(LengthT)); static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); static constexpr int threads = 128; static constexpr int nominal_4b_items_per_thread = 6; static constexpr int items = (max_input_bytes <= 8) ? 
6 : CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, ((nominal_4b_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes)); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<640>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<900>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1080>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1075>; }; #if CUB_IS_INT128_ENABLED template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<630>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<630>; }; #endif } // namespace encode namespace non_trivial_runs { template (), primitive_key PrimitiveKey = is_primitive_key(), length_size LengthSize = classify_length_size(), key_size KeySize = classify_key_size()> struct sm90_tuning { static constexpr int threads = 96; static constexpr int nominal_4b_items_per_thread = 15; static constexpr int items = CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(KeyT)))); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = true; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<385>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<675>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<695>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using 
delay_constructor = detail::no_delay_constructor_t<840>; }; #if CUB_IS_INT128_ENABLED template struct sm90_tuning { static constexpr int threads = 288; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::fixed_delay_constructor_t<484, 1150>; }; template struct sm90_tuning { static constexpr int threads = 288; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::fixed_delay_constructor_t<484, 1150>; }; #endif template (), primitive_key PrimitiveKey = is_primitive_key(), length_size LengthSize = classify_length_size(), key_size KeySize = classify_key_size()> struct sm80_tuning { static constexpr int threads = 96; static constexpr int nominal_4b_items_per_thread = 15; static constexpr int items = CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(KeyT)))); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = true; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<630>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<1015>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<915>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<1065>; }; #if CUB_IS_INT128_ENABLED template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<1050>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = false; using delay_constructor = detail::no_delay_constructor_t<1050>; }; #endif } // namespace non_trivial_runs } // namespace rle template struct device_run_length_encode_policy_hub { static constexpr int MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyT), sizeof(LengthT)); static constexpr int COMBINED_INPUT_BYTES = sizeof(KeyT) + sizeof(LengthT); struct DefaultTuning { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 6; static constexpr int ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
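/* Illustrative usage sketch (added; not part of this header). This policy hub sits behind
   cub::DeviceRunLengthEncode::Encode, which reuses the reduce-by-key agent, hence the same
   DefaultTuning formula as reduce-by-key. A typical two-phase call, with d_in, d_unique_out,
   d_counts_out, d_num_runs_out and num_items being caller-provided names used here only for
   illustration, looks like:

     void*  d_temp_storage     = nullptr;
     size_t temp_storage_bytes = 0;
     cub::DeviceRunLengthEncode::Encode(
       d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
     cudaMalloc(&d_temp_storage, temp_storage_bytes);
     cub::DeviceRunLengthEncode::Encode(
       d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); */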
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)); using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, LOAD_LDG, BLOCK_SCAN_WARP_SCANS, detail::default_reduce_by_key_delay_constructor_t>; }; /// SM35 struct Policy350 : DefaultTuning , ChainedPolicy<350, Policy350, Policy350> {}; /// SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { using tuning = detail::rle::encode::sm80_tuning; using ReduceByKeyPolicyT = AgentReduceByKeyPolicy; }; // SM86 struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; /// SM90 struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::rle::encode::sm90_tuning; using ReduceByKeyPolicyT = AgentReduceByKeyPolicy; }; using MaxPolicy = Policy900; }; template struct device_non_trivial_runs_policy_hub { struct DefaultTuning { enum { NOMINAL_4B_ITEMS_PER_THREAD = 15, ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(KeyT)))), }; using RleSweepPolicyT = AgentRlePolicy<96, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS, detail::default_reduce_by_key_delay_constructor_t>; }; /// SM35 struct Policy350 : DefaultTuning , ChainedPolicy<350, Policy350, Policy350> {}; // SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { using tuning = detail::rle::non_trivial_runs::sm80_tuning; using RleSweepPolicyT = AgentRlePolicy; }; // SM86 struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; // SM90 struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::rle::non_trivial_runs::sm90_tuning; using RleSweepPolicyT = AgentRlePolicy; }; using MaxPolicy = Policy900; }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_scan.cuh000066400000000000000000000312601463375617100233360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace scan { enum class keep_rejects { no, yes }; enum class primitive_accum { no, yes }; enum class primitive_op { no, yes }; enum class offset_size { _4, _8, unknown }; enum class accum_size { _1, _2, _4, _8, _16, unknown }; template constexpr primitive_accum is_primitive_accum() { return Traits::PRIMITIVE ? primitive_accum::yes : primitive_accum::no; } template constexpr primitive_op is_primitive_op() { return basic_binary_op_t::value ? primitive_op::yes : primitive_op::no; } template constexpr accum_size classify_accum_size() { return sizeof(AccumT) == 1 ? accum_size::_1 : sizeof(AccumT) == 2 ? accum_size::_2 : sizeof(AccumT) == 4 ? accum_size::_4 : sizeof(AccumT) == 8 ? accum_size::_8 : sizeof(AccumT) == 16 ? accum_size::_16 : accum_size::unknown; } template struct tuning { static constexpr int threads = Threads; static constexpr int items = Items; using delay_constructor = detail::fixed_delay_constructor_t; }; template (), accum_size AccumSize = classify_accum_size()> struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 15; using delay_constructor = detail::default_delay_constructor_t; }; // clang-format off template struct sm90_tuning : tuning<192, 22, 168, 1140> {}; template struct sm90_tuning : tuning<512, 12, 376, 1125> {}; template struct sm90_tuning : tuning<128, 24, 648, 1245> {}; template struct sm90_tuning : tuning<224, 24, 632, 1290> {}; template <> struct sm90_tuning : tuning<128, 24, 688, 1140> {}; template <> struct sm90_tuning : tuning<224, 24, 576, 1215> {}; #if CUB_IS_INT128_ENABLED template <> struct sm90_tuning< __int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<576, 21, 860, 630> {}; template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<576, 21, 860, 630> {}; #endif // clang-format on template (), accum_size AccumSize = classify_accum_size()> struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 15; using delay_constructor = detail::default_delay_constructor_t; static constexpr bool LargeValues = sizeof(AccumT) > 128; static constexpr BlockLoadAlgorithm load_algorithm = // LargeValues ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = // LargeValues ? 
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; }; template struct sm80_tuning { static constexpr int threads = 320; static constexpr int items = 14; using delay_constructor = detail::fixed_delay_constructor_t<368, 725>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; }; template struct sm80_tuning { static constexpr int threads = 352; static constexpr int items = 16; using delay_constructor = detail::fixed_delay_constructor_t<488, 1040>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; }; template struct sm80_tuning { static constexpr int threads = 320; static constexpr int items = 12; using delay_constructor = detail::fixed_delay_constructor_t<268, 1180>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; }; template struct sm80_tuning { static constexpr int threads = 288; static constexpr int items = 22; using delay_constructor = detail::fixed_delay_constructor_t<716, 785>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; }; template <> struct sm80_tuning { static constexpr int threads = 288; static constexpr int items = 8; using delay_constructor = detail::fixed_delay_constructor_t<724, 1050>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; }; template <> struct sm80_tuning { static constexpr int threads = 384; static constexpr int items = 12; using delay_constructor = detail::fixed_delay_constructor_t<388, 1100>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; }; #if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> { static constexpr int threads = 640; static constexpr int items = 24; using delay_constructor = detail::no_delay_constructor_t<1200>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; }; template <> struct sm80_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> { static constexpr int threads = 640; static constexpr int items = 24; using delay_constructor = detail::no_delay_constructor_t<1200>; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; }; #endif } // namespace scan } // namespace detail template struct DeviceScanPolicy { // For large values, use timesliced loads/stores to fit shared memory. static constexpr bool LargeValues = sizeof(AccumT) > 128; static constexpr BlockLoadAlgorithm ScanTransposedLoad = LargeValues ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED : BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm ScanTransposedStore = LargeValues ? 
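/* Added explanatory note: the non-time-sliced warp-transpose load/store stages an entire tile
   in shared memory at once; for accumulators larger than 128 bytes that staging buffer
   (roughly threads * items * sizeof(AccumT) bytes) would not fit, so the TIMESLICED variants
   are selected instead, exchanging the data in smaller chunks at the cost of extra
   synchronization. This restates the "fit shared memory" comment above in rough quantitative
   terms and is illustrative only. */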
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED : BLOCK_STORE_WARP_TRANSPOSE; template using policy_t = AgentScanPolicy, DelayConstructorT>; /// SM350 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T using ScanPolicyT = policy_t<128, 12, ///< Threads per block, items per thread AccumT, BLOCK_LOAD_DIRECT, LOAD_CA, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, BLOCK_SCAN_RAKING, detail::default_delay_constructor_t>; }; /// SM520 struct Policy520 : ChainedPolicy<520, Policy520, Policy350> { // Titan X: 32.47B items/s @ 48M 32-bit T using ScanPolicyT = policy_t<128, 12, ///< Threads per block, items per thread AccumT, BLOCK_LOAD_DIRECT, LOAD_CA, ScanTransposedStore, BLOCK_SCAN_WARP_SCANS, detail::default_delay_constructor_t>; }; /// SM600 struct DefaultTuning { using ScanPolicyT = policy_t<128, 15, ///< Threads per block, items per thread AccumT, ScanTransposedLoad, LOAD_DEFAULT, ScanTransposedStore, BLOCK_SCAN_WARP_SCANS, detail::default_delay_constructor_t>; }; /// SM600 struct Policy600 : DefaultTuning , ChainedPolicy<600, Policy600, Policy520> {}; /// SM800 struct Policy800 : ChainedPolicy<800, Policy800, Policy600> { using tuning = detail::scan::sm80_tuning()>; using ScanPolicyT = policy_t; }; /// SM860 struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; /// SM900 struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::scan::sm90_tuning()>; using ScanPolicyT = policy_t; }; using MaxPolicy = Policy900; }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh000066400000000000000000001070201463375617100246760ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace scan_by_key { enum class primitive_accum { no, yes }; enum class primitive_op { no, yes }; enum class offset_size { _4, _8, unknown }; enum class val_size { _1, _2, _4, _8, _16, unknown }; enum class key_size { _1, _2, _4, _8, _16, unknown }; template constexpr primitive_accum is_primitive_accum() { return Traits::PRIMITIVE ? primitive_accum::yes : primitive_accum::no; } template constexpr primitive_op is_primitive_op() { return basic_binary_op_t::value ? primitive_op::yes : primitive_op::no; } template constexpr val_size classify_val_size() { return sizeof(ValueT) == 1 ? val_size::_1 : sizeof(ValueT) == 2 ? val_size::_2 : sizeof(ValueT) == 4 ? val_size::_4 : sizeof(ValueT) == 8 ? val_size::_8 : sizeof(ValueT) == 16 ? val_size::_16 : val_size::unknown; } template constexpr key_size classify_key_size() { return sizeof(KeyT) == 1 ? key_size::_1 : sizeof(KeyT) == 2 ? key_size::_2 : sizeof(KeyT) == 4 ? key_size::_4 : sizeof(KeyT) == 8 ? key_size::_8 : sizeof(KeyT) == 16 ? key_size::_16 : key_size::unknown; } template (), val_size AccumSize = classify_val_size(), primitive_accum PrimitiveAccumulator = is_primitive_accum()> struct sm90_tuning { static constexpr int nominal_4b_items_per_thread = 9; static constexpr int threads = 256; static constexpr size_t max_input_bytes = (cub::max)(sizeof(KeyT), sizeof(AccumT)); static constexpr size_t combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); static constexpr int items = ((max_input_bytes <= 8) ? 
9 : Nominal4BItemsToItemsCombined(nominal_4b_items_per_thread, combined_input_bytes)); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; using delay_constructor = detail::no_delay_constructor_t<650>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 16; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<124, 995>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<488, 545>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<488, 1070>; }; #if CUB_IS_INT128_ENABLED template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<936, 1105>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<936, 1105>; }; #endif template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<136, 785>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<445>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 22; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<312, 865>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr 
BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<352, 1170>; }; #if CUB_IS_INT128_ENABLED template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<504, 1190>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<504, 1190>; }; #endif template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; using delay_constructor = detail::no_delay_constructor_t<850>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<128, 965>; }; template struct sm90_tuning { static constexpr int threads = 288; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<700, 1005>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<556, 1195>; }; #if CUB_IS_INT128_ENABLED template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<512, 1030>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<512, 1030>; }; #endif template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<504, 1010>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<420, 970>; }; template struct 
sm90_tuning { static constexpr int threads = 192; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<500, 1125>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<600, 930>; }; #if CUB_IS_INT128_ENABLED template struct sm90_tuning { static constexpr int threads = 192; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<364, 1085>; }; template struct sm90_tuning { static constexpr int threads = 192; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<364, 1085>; }; #endif template struct sm90_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<500, 975>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<164, 1075>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<268, 1120>; }; template struct sm90_tuning { static constexpr int threads = 192; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<320, 1200>; }; #if CUB_IS_INT128_ENABLED template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<364, 1050>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<364, 1050>; }; #endif template (), val_size AccumSize = classify_val_size(), primitive_accum PrimitiveAccumulator = is_primitive_accum()> struct sm80_tuning { 
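// Added explanatory note for this generic SM80 fallback: the nominal figure of 9 items per
// thread assumes 4-byte values; Nominal4BItemsToItemsCombined scales it down as
// sizeof(KeyT) + sizeof(AccumT) grows (never below 1 item), so the per-thread tile stays
// roughly constant in bytes. The exact scaling is defined by Nominal4BItemsToItemsCombined.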
static constexpr int nominal_4b_items_per_thread = 9; static constexpr int threads = 256; static constexpr size_t max_input_bytes = (cub::max)(sizeof(KeyT), sizeof(AccumT)); static constexpr size_t combined_input_bytes = sizeof(KeyT) + sizeof(AccumT); static constexpr int items = ((max_input_bytes <= 8) ? 9 : Nominal4BItemsToItemsCombined(nominal_4b_items_per_thread, combined_input_bytes)); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; using delay_constructor = detail::no_delay_constructor_t<795>; }; template struct sm80_tuning { static constexpr int threads = 288; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<825>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<640>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<124, 1040>; }; #if CUB_IS_INT128_ENABLED template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 19; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1095>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 19; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1095>; }; #endif template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 8; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1070>; }; template struct sm80_tuning { static constexpr int threads = 320; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<625>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using 
delay_constructor = detail::no_delay_constructor_t<1055>; }; template struct sm80_tuning { static constexpr int threads = 160; static constexpr int items = 17; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<160, 695>; }; #if CUB_IS_INT128_ENABLED template struct sm80_tuning { static constexpr int threads = 160; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1105>; }; template struct sm80_tuning { static constexpr int threads = 160; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1105>; }; #endif template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1130>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1130>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1140>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<888, 635>; }; #if CUB_IS_INT128_ENABLED template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 17; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1100>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 17; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1100>; }; #endif template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1120>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 10; static constexpr BlockLoadAlgorithm 
load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1115>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 13; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<24, 1060>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1160>; }; #if CUB_IS_INT128_ENABLED template struct sm80_tuning { static constexpr int threads = 320; static constexpr int items = 8; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<220>; }; template struct sm80_tuning { static constexpr int threads = 320; static constexpr int items = 8; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<220>; }; #endif template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<144, 1120>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<364, 780>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1170>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1030>; }; #if CUB_IS_INT128_ENABLED template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1160>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr BlockStoreAlgorithm store_algorithm = BLOCK_STORE_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1160>; 
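// Added explanatory note: the integer parameters of no_delay_constructor_t /
// fixed_delay_constructor_t in these tunings are empirically chosen backoff constants used
// while a thread block waits on its predecessors' tile state in the single-pass, decoupled
// look-back scan; they influence latency and scheduling only and never change the results.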
}; #endif } // namespace scan_by_key } // namespace detail template struct DeviceScanByKeyPolicy { using KeyT = cub::detail::value_t; static constexpr size_t MaxInputBytes = (cub::max)(sizeof(KeyT), sizeof(AccumT)); static constexpr size_t CombinedInputBytes = sizeof(KeyT) + sizeof(AccumT); // SM350 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 6; static constexpr int ITEMS_PER_THREAD = ((MaxInputBytes <= 8) ? 6 : Nominal4BItemsToItemsCombined(NOMINAL_4B_ITEMS_PER_THREAD, CombinedInputBytes)); using ScanByKeyPolicyT = AgentScanByKeyPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE, detail::default_reduce_by_key_delay_constructor_t>; }; struct DefaultTuning { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 9; static constexpr int ITEMS_PER_THREAD = ((MaxInputBytes <= 8) ? 9 : Nominal4BItemsToItemsCombined(NOMINAL_4B_ITEMS_PER_THREAD, CombinedInputBytes)); using ScanByKeyPolicyT = AgentScanByKeyPolicy<256, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_CA, BLOCK_SCAN_WARP_SCANS, BLOCK_STORE_WARP_TRANSPOSE, detail::default_reduce_by_key_delay_constructor_t>; }; // SM520 struct Policy520 : DefaultTuning , ChainedPolicy<520, Policy520, Policy350> {}; // SM800 struct Policy800 : ChainedPolicy<800, Policy800, Policy520> { using tuning = detail::scan_by_key::sm80_tuning()>; using ScanByKeyPolicyT = AgentScanByKeyPolicy; }; // SM860 struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; // SM900 struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::scan_by_key::sm90_tuning()>; using ScanByKeyPolicyT = AgentScanByKeyPolicy; }; using MaxPolicy = Policy900; }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_select_if.cuh000066400000000000000000000601721463375617100243530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace select { enum class flagged { no, yes }; enum class keep_rejects { no, yes }; enum class primitive { no, yes }; enum class offset_size { _4, _8, unknown }; enum class input_size { _1, _2, _4, _8, _16, unknown }; template constexpr primitive is_primitive() { return Traits::PRIMITIVE ? primitive::yes : primitive::no; } template constexpr flagged is_flagged() { return std::is_same::value ? flagged::no : flagged::yes; } template constexpr keep_rejects are_rejects_kept() { return KeepRejects ? keep_rejects::yes : keep_rejects::no; } template constexpr input_size classify_input_size() { return sizeof(InputT) == 1 ? input_size::_1 : sizeof(InputT) == 2 ? input_size::_2 : sizeof(InputT) == 4 ? input_size::_4 : sizeof(InputT) == 8 ? input_size::_8 : sizeof(InputT) == 16 ? input_size::_16 : input_size::unknown; } template constexpr offset_size classify_offset_size() { return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? offset_size::_8 : offset_size::unknown; } template (), input_size InputSize = classify_input_size()> struct sm90_tuning { static constexpr int threads = 128; static constexpr int nominal_4b_items_per_thread = 10; static constexpr int items = CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(InputT)))); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<350, 450>; }; // select::if template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 22; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<580>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 22; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<320, 605>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 17; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<76, 1150>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<380, 1140>; }; #if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 512; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<460, 1145>; }; template <> struct sm90_tuning<__uint128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 512; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using 
delay_constructor = detail::fixed_delay_constructor_t<460, 1145>; }; #endif // select::flagged template struct sm90_tuning { static constexpr int threads = 448; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<715>; }; template struct sm90_tuning { static constexpr int threads = 448; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<504, 765>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<415, 1125>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<360, 1170>; }; #if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 512; static constexpr int items = 3; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<284, 1130>; }; template <> struct sm90_tuning<__uint128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 512; static constexpr int items = 3; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<284, 1130>; }; #endif // partition::if template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<908, 995>; }; template struct sm90_tuning { static constexpr int threads = 320; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<500, 560>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<536, 1055>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<512, 1075>; }; #if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 192; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<1616, 1115>; }; template <> struct sm90_tuning<__uint128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 192; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<1616, 1115>; }; #endif // partition::flagged 
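/* Illustrative usage sketch (added; not part of this header). The flagged + keep_rejects
   specializations that follow are the ones exercised by cub::DevicePartition::Flagged.
   In the sketch below, d_in, d_flags, d_out, d_num_selected_out and num_items are
   caller-provided names used only for illustration of the standard two-phase call:

     void*  d_temp_storage     = nullptr;
     size_t temp_storage_bytes = 0;
     cub::DevicePartition::Flagged(
       d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
     cudaMalloc(&d_temp_storage, temp_storage_bytes);
     cub::DevicePartition::Flagged(
       d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); */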
template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<580, 850>; }; template struct sm90_tuning { static constexpr int threads = 512; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<388, 1055>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<72, 1165>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 6; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<532, 1180>; }; #if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 160; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<720, 1105>; }; template <> struct sm90_tuning<__uint128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 160; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<720, 1105>; }; #endif template (), input_size InputSize = classify_input_size()> struct sm80_tuning { static constexpr int threads = 128; static constexpr int nominal_4b_items_per_thread = 10; static constexpr int items = CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(InputT)))); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<350, 450>; }; // select::if template struct sm80_tuning { static constexpr int threads = 992; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<395>; }; template struct sm80_tuning { static constexpr int threads = 576; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<870>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1130>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<832, 1165>; }; #if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 384; static constexpr int items = 4; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1140>; }; template <> struct sm80_tuning<__uint128_t, flagged::no, 
keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 384; static constexpr int items = 4; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1140>; }; #endif // select::flagged template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<735>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1155>; }; template struct sm80_tuning { static constexpr int threads = 320; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<124, 1115>; }; template struct sm80_tuning { static constexpr int threads = 384; static constexpr int items = 6; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1130>; }; #if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 256; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<464, 1025>; }; template <> struct sm80_tuning<__uint128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 256; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<464, 1025>; }; #endif // partition::if template struct sm80_tuning { static constexpr int threads = 512; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<510>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1045>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1040>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<68, 1160>; }; #if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 256; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; }; template <> struct sm80_tuning<__uint128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 256; static constexpr int items = 5; static 
constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; }; #endif // partition::flagged template struct sm80_tuning { static constexpr int threads = 512; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<595>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1105>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<912, 1025>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<884, 1130>; }; #if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 256; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; }; template <> struct sm80_tuning<__uint128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { static constexpr int threads = 256; static constexpr int items = 5; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<400, 1090>; }; #endif } // namespace select template struct device_select_policy_hub { struct DefaultTuning { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 10; static constexpr int ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT)))); using SelectIfPolicyT = AgentSelectIfPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, MayAlias ? LOAD_CA : LOAD_LDG, BLOCK_SCAN_WARP_SCANS, detail::fixed_delay_constructor_t<350, 450>>; }; struct Policy350 : DefaultTuning , ChainedPolicy<350, Policy350, Policy350> {}; struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { using tuning = detail::select::sm80_tuning(), select::are_rejects_kept(), select::classify_offset_size()>; using SelectIfPolicyT = AgentSelectIfPolicy; }; struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::select::sm90_tuning(), select::are_rejects_kept(), select::classify_offset_size()>; using SelectIfPolicyT = AgentSelectIfPolicy; }; using MaxPolicy = Policy900; }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh000066400000000000000000000252371463375617100265010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace three_way_partition { enum class input_size { _1, _2, _4, _8, _16, unknown }; enum class offset_size { _4, _8, unknown }; template constexpr input_size classify_input_size() { return sizeof(InputT) == 1 ? input_size::_1 : sizeof(InputT) == 2 ? input_size::_2 : sizeof(InputT) == 4 ? input_size::_4 : sizeof(InputT) == 8 ? input_size::_8 : sizeof(InputT) == 16 ? input_size::_16 : input_size::unknown; } template constexpr offset_size classify_offset_size() { return sizeof(OffsetT) == 4 ? offset_size::_4 : sizeof(OffsetT) == 8 ? 
offset_size::_8 : offset_size::unknown; } template (), offset_size OffsetSize = classify_offset_size()> struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = Nominal4BItemsToItems(9); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; using delay_constructor = detail::default_delay_constructor_t; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<445>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<104, 512>; }; template struct sm90_tuning { static constexpr int threads = 320; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::no_delay_constructor_t<1105>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<464, 1165>; }; template struct sm90_tuning { static constexpr int threads = 128; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1040>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 24; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using delay_constructor = detail::fixed_delay_constructor_t<4, 285>; }; template struct sm90_tuning { static constexpr int threads = 640; static constexpr int items = 24; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<245>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<910>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 18; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1145>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1050>; }; template (), offset_size OffsetSize = classify_offset_size()> struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = Nominal4BItemsToItems(9); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; using AccumPackHelperT = detail::three_way_partition::accumulator_pack_t; using AccumPackT = typename AccumPackHelperT::pack_t; using delay_constructor = detail::default_delay_constructor_t; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using 
delay_constructor = detail::no_delay_constructor_t<910>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::no_delay_constructor_t<1120>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<264, 1080>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; using delay_constructor = detail::fixed_delay_constructor_t<672, 1120>; }; } // namespace three_way_partition template struct device_three_way_partition_policy_hub { struct DefaultTuning { static constexpr int ITEMS_PER_THREAD = Nominal4BItemsToItems(9); using ThreeWayPartitionPolicy = cub::AgentThreeWayPartitionPolicy<256, ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, cub::BLOCK_SCAN_WARP_SCANS>; }; /// SM35 struct Policy350 : DefaultTuning , ChainedPolicy<350, Policy350, Policy350> {}; struct Policy800 : ChainedPolicy<800, Policy800, Policy350> { using tuning = detail::three_way_partition::sm80_tuning; using ThreeWayPartitionPolicy = AgentThreeWayPartitionPolicy; }; struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; /// SM90 struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::three_way_partition::sm90_tuning; using ThreeWayPartitionPolicy = AgentThreeWayPartitionPolicy; }; using MaxPolicy = Policy900; }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh000066400000000000000000000574451463375617100252770ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { namespace unique_by_key { enum class primitive_key { no, yes }; enum class primitive_val { no, yes }; enum class key_size { _1, _2, _4, _8, _16, unknown }; enum class val_size { _1, _2, _4, _8, _16, unknown }; template constexpr primitive_key is_primitive_key() { return Traits::PRIMITIVE ? primitive_key::yes : primitive_key::no; } template constexpr primitive_val is_primitive_val() { return Traits::PRIMITIVE ? primitive_val::yes : primitive_val::no; } template constexpr key_size classify_key_size() { return sizeof(KeyT) == 1 ? key_size::_1 : sizeof(KeyT) == 2 ? key_size::_2 : sizeof(KeyT) == 4 ? key_size::_4 : sizeof(KeyT) == 8 ? key_size::_8 : sizeof(KeyT) == 16 ? key_size::_16 : key_size::unknown; } template constexpr val_size classify_val_size() { return sizeof(ValueT) == 1 ? val_size::_1 : sizeof(ValueT) == 2 ? val_size::_2 : sizeof(ValueT) == 4 ? val_size::_4 : sizeof(ValueT) == 8 ? val_size::_8 : sizeof(ValueT) == 16 ? val_size::_16 : val_size::unknown; } template (), primitive_val PrimitiveAccum = is_primitive_val(), key_size KeySize = classify_key_size(), val_size AccumSize = classify_val_size()> struct sm90_tuning { static constexpr int threads = 64; static constexpr int nominal_4b_items_per_thread = 11; static constexpr int items = Nominal4BItemsToItems(nominal_4b_items_per_thread); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; using delay_constructor = detail::default_delay_constructor_t; }; // 8-bit key template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<550>; }; template struct sm90_tuning { static constexpr int threads = 448; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<725>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1130>; }; template struct sm90_tuning { static constexpr int threads = 512; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1100>; }; template struct sm90_tuning { static constexpr int threads = 288; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<344, 1165>; }; // 16-bit key template struct 
sm90_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<640>; }; template struct sm90_tuning { static constexpr int threads = 288; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<404, 710>; }; template struct sm90_tuning { static constexpr int threads = 512; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<525>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 23; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1200>; }; template struct sm90_tuning { static constexpr int threads = 224; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<424, 1055>; }; // 32-bit key template struct sm90_tuning { static constexpr int threads = 448; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<348, 580>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1060>; }; template struct sm90_tuning { static constexpr int threads = 512; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1045>; }; template struct sm90_tuning { static constexpr int threads = 512; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1120>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1025>; }; // 64-bit key template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1060>; }; template struct sm90_tuning { static constexpr int threads = 384; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = 
BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<964, 1125>; }; template struct sm90_tuning { static constexpr int threads = 640; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1070>; }; template struct sm90_tuning { static constexpr int threads = 448; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1190>; }; template struct sm90_tuning { static constexpr int threads = 256; static constexpr int items = 9; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1155>; }; template (), primitive_val PrimitiveAccum = is_primitive_val(), key_size KeySize = classify_key_size(), val_size AccumSize = classify_val_size()> struct sm80_tuning { static constexpr int threads = 64; static constexpr int nominal_4b_items_per_thread = 11; static constexpr int items = Nominal4BItemsToItems(nominal_4b_items_per_thread); static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; using delay_constructor = detail::default_delay_constructor_t; }; // 8-bit key template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<835>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<765>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1155>; }; template struct sm80_tuning { static constexpr int threads = 224; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1065>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 15; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<248, 1200>; }; // 16-bit key template struct sm80_tuning { static constexpr int threads = 320; static constexpr int items = 20; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1020>; }; template struct sm80_tuning { static constexpr int threads = 192; 
static constexpr int items = 22; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<328, 1080>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<535>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 10; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1055>; }; // 32-bit key template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 12; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1120>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 14; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1185>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 11; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::no_delay_constructor_t<1115>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<320, 1115>; }; // 64-bit key template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<24, 555>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<324, 1105>; }; template struct sm80_tuning { static constexpr int threads = 256; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<740, 1105>; }; template struct sm80_tuning { static constexpr int threads = 192; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<764, 1155>; }; template struct sm80_tuning { static constexpr int threads = 128; static constexpr int items = 7; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static 
constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; using delay_constructor = detail::fixed_delay_constructor_t<992, 1135>; }; } // namespace unique_by_key } // namespace detail template struct DeviceUniqueByKeyPolicy { using KeyT = typename std::iterator_traits::value_type; using ValueT = typename std::iterator_traits::value_type; // SM350 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { static constexpr int INPUT_SIZE = sizeof(KeyT); enum { NOMINAL_4B_ITEMS_PER_THREAD = 9, ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), }; using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<128, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS, detail::default_delay_constructor_t>; }; struct DefaultTuning { static constexpr int INPUT_SIZE = sizeof(KeyT); enum { NOMINAL_4B_ITEMS_PER_THREAD = 11, ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), }; using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<64, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, cub::LOAD_LDG, cub::BLOCK_SCAN_WARP_SCANS, detail::default_delay_constructor_t>; }; // SM520 struct Policy520 : DefaultTuning , ChainedPolicy<520, Policy520, Policy350> {}; /// SM80 struct Policy800 : ChainedPolicy<800, Policy800, Policy520> { using tuning = detail::unique_by_key::sm80_tuning; using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy; }; // SM860 struct Policy860 : DefaultTuning , ChainedPolicy<860, Policy860, Policy800> {}; /// SM90 struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::unique_by_key::sm90_tuning; using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy; }; using MaxPolicy = Policy900; }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/grid/000077500000000000000000000000001463375617100147065ustar00rootroot00000000000000cccl-2.5.0/cub/cub/grid/grid_barrier.cuh000066400000000000000000000130031463375617100200370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN /** * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid */ class GridBarrier { protected: typedef unsigned int SyncFlag; // Counters in global device memory SyncFlag* d_sync; public: /** * Constructor */ GridBarrier() : d_sync(NULL) {} /** * Synchronize */ _CCCL_DEVICE _CCCL_FORCEINLINE void Sync() const { volatile SyncFlag* d_vol_sync = d_sync; // Threadfence and syncthreads to make sure global writes are visible before // thread-0 reports in with its sync counter __threadfence(); CTA_SYNC(); if (blockIdx.x == 0) { // Report in ourselves if (threadIdx.x == 0) { d_vol_sync[blockIdx.x] = 1; } CTA_SYNC(); // Wait for everyone else to report in for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) { while (ThreadLoad(d_sync + peer_block) == 0) { __threadfence_block(); } } CTA_SYNC(); // Let everyone know it's safe to proceed for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) { d_vol_sync[peer_block] = 0; } } else { if (threadIdx.x == 0) { // Report in d_vol_sync[blockIdx.x] = 1; // Wait for acknowledgment while (ThreadLoad(d_sync + blockIdx.x) == 1) { __threadfence_block(); } } CTA_SYNC(); } } }; /** * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed * for cooperation. * * Uses RAII for lifetime, i.e., device resources are reclaimed when * the destructor is called. 
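 *
 * \par
 * A minimal usage sketch (illustrative only; the kernel name \p CooperativeKernel,
 * the \p Launch helper, and the launch configuration are assumptions, not part of
 * this header). The host-side GridBarrierLifetime allocates and zeroes the sync
 * counters via Setup(), and every thread block of the launched grid calls
 * GridBarrier::Sync() to rendezvous:
 *
 * \code
 * __global__ void CooperativeKernel(cub::GridBarrier barrier)
 * {
 *     // ... phase 1 work ...
 *     barrier.Sync(); // every thread block must reach this barrier
 *     // ... phase 2 work ...
 * }
 *
 * void Launch(int grid_size, int block_size)
 * {
 *     cub::GridBarrierLifetime barrier;
 *     barrier.Setup(grid_size);                              // lazily allocates/zeroes counters
 *     CooperativeKernel<<<grid_size, block_size>>>(barrier); // sliced to the GridBarrier base
 *     cudaDeviceSynchronize(); // finish before the counters are freed at end of scope
 * }
 * \endcode
 *
 * \par
 * A software barrier of this kind is only safe when all \p grid_size thread blocks
 * can be simultaneously resident on the device.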
*/ class GridBarrierLifetime : public GridBarrier { protected: // Number of bytes backed by d_sync size_t sync_bytes; public: /** * Constructor */ GridBarrierLifetime() : GridBarrier() , sync_bytes(0) {} /** * DeviceFrees and resets the progress counters */ cudaError_t HostReset() { cudaError_t retval = cudaSuccess; if (d_sync) { retval = CubDebug(cudaFree(d_sync)); d_sync = NULL; } sync_bytes = 0; return retval; } /** * Destructor */ virtual ~GridBarrierLifetime() { HostReset(); } /** * Sets up the progress counters for the next kernel launch (lazily * allocating and initializing them if necessary) */ cudaError_t Setup(int sweep_grid_size) { cudaError_t retval = cudaSuccess; do { size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); if (new_sync_bytes > sync_bytes) { if (d_sync) { retval = CubDebug(cudaFree(d_sync)); if (cudaSuccess != retval) { break; } } sync_bytes = new_sync_bytes; // Allocate and initialize to zero retval = CubDebug(cudaMalloc((void**) &d_sync, sync_bytes)); if (cudaSuccess != retval) { break; } retval = CubDebug(cudaMemset(d_sync, 0, new_sync_bytes)); if (cudaSuccess != retval) { break; } } } while (0); return retval; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/grid/grid_even_share.cuh000066400000000000000000000176341463375617100205460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an * "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units * (grains). 
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN /** * @brief GridEvenShare is a descriptor utility for distributing input among * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly * the same number of input tiles. * * @par Overview * Each thread block is assigned a consecutive sequence of input tiles. To help * preserve alignment and eliminate the overhead of guarded loads for all but the * last thread block, to GridEvenShare assigns one of three different amounts of * work to a given thread block: "big", "normal", or "last". The "big" workloads * are one scheduling grain larger than "normal". The "last" work unit for the * last thread block may be partially-full if the input is not an even multiple of * the scheduling grain size. * * @par * Before invoking a child grid, a parent thread will typically construct an * instance of GridEvenShare. The instance can be passed to child thread blocks * which can initialize their per-thread block offsets using \p BlockInit(). */ template struct GridEvenShare { private: int total_tiles; int big_shares; OffsetT big_share_items; OffsetT normal_share_items; OffsetT normal_base_offset; public: /// Total number of input items OffsetT num_items; /// Grid size in thread blocks int grid_size; /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles OffsetT block_offset; /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles OffsetT block_end; /// Stride between input tiles OffsetT block_stride; /** * \brief Constructor. */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE GridEvenShare() : total_tiles(0) , big_shares(0) , big_share_items(0) , normal_share_items(0) , normal_base_offset(0) , num_items(0) , grid_size(0) , block_offset(0) , block_end(0) , block_stride(0) {} /** * @brief Dispatch initializer. To be called prior prior to kernel launch. * * @param num_items_ * Total number of input items * * @param max_grid_size * Maximum grid size allowable (actual grid size may be less if not warranted by the the * number of input items) * * @param tile_items * Number of data items per input tile */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void DispatchInit(OffsetT num_items_, int max_grid_size, int tile_items) { this->block_offset = num_items_; // Initialize past-the-end this->block_end = num_items_; // Initialize past-the-end this->num_items = num_items_; this->total_tiles = static_cast(cub::DivideAndRoundUp(num_items_, tile_items)); this->grid_size = CUB_MIN(total_tiles, max_grid_size); int avg_tiles_per_block = total_tiles / grid_size; // leftover grains go to big blocks: this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); this->normal_share_items = avg_tiles_per_block * tile_items; this->normal_base_offset = big_shares * tile_items; this->big_share_items = normal_share_items + tile_items; } /** * @brief Initializes ranges for the specified thread block index. Specialized * for a "raking" access pattern in which each thread block is assigned a * consecutive sequence of input tiles. 
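 *
 * @par
 * A hedged end-to-end sketch (the kernel name, the @p TILE_ITEMS value, and the
 * commented-out tile-processing step are illustrative assumptions, not part of
 * this header). The host fills in the descriptor with DispatchInit(), and each
 * thread block computes its tile range with BlockInit():
 *
 * @code
 * constexpr int TILE_ITEMS = 128 * 4; // items per tile (assumed)
 *
 * __global__ void ConsumeKernel(cub::GridEvenShare<int> even_share)
 * {
 *     // Compute this block's [block_offset, block_end) range of the input
 *     even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
 *
 *     for (int tile_offset = even_share.block_offset;
 *          tile_offset < even_share.block_end;
 *          tile_offset += even_share.block_stride)
 *     {
 *         // ... process the tile beginning at tile_offset (guarding the last,
 *         //     possibly partial tile against num_items) ...
 *     }
 * }
 *
 * // Host side (sketch):
 * //   cub::GridEvenShare<int> even_share;
 * //   even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
 * //   ConsumeKernel<<<even_share.grid_size, 128>>>(even_share);
 * @endcode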
*/ template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockInit(int block_id, Int2Type /*strategy_tag*/) { block_stride = TILE_ITEMS; if (block_id < big_shares) { // This thread block gets a big share of grains (avg_tiles_per_block + 1) block_offset = (block_id * big_share_items); block_end = block_offset + big_share_items; } else if (block_id < total_tiles) { // This thread block gets a normal share of grains (avg_tiles_per_block) block_offset = normal_base_offset + (block_id * normal_share_items); // Avoid generating values greater than num_items, as it may cause overflow block_end = block_offset + CUB_MIN(num_items - block_offset, normal_share_items); } // Else default past-the-end } /** * @brief Block-initialization, specialized for a "raking" access * pattern in which each thread block is assigned a consecutive sequence * of input tiles. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockInit(int block_id, Int2Type /*strategy_tag*/) { block_stride = grid_size * TILE_ITEMS; block_offset = (block_id * TILE_ITEMS); block_end = num_items; } /** * @brief Block-initialization, specialized for "strip mining" access * pattern in which the input tiles assigned to each thread block are * separated by a stride equal to the the extent of the grid. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockInit() { BlockInit(blockIdx.x, Int2Type()); } /** * @brief Block-initialization, specialized for a "raking" access * pattern in which each thread block is assigned a consecutive sequence * of input tiles. * * @param[in] block_offset * Threadblock begin offset (inclusive) * * @param[in] block_end * Threadblock end offset (exclusive) */ template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockInit(OffsetT block_offset, OffsetT block_end) { this->block_offset = block_offset; this->block_end = block_end; this->block_stride = TILE_ITEMS; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/grid/grid_mapping.cuh000066400000000000000000000112611463375617100200500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto * a grid of CUDA thread blocks. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header CUB_NAMESPACE_BEGIN /****************************************************************************** * Mapping policies *****************************************************************************/ /** * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide * data onto a grid of CUDA thread blocks. */ enum GridMappingStrategy { /** * \brief An a "raking" access pattern in which each thread block is * assigned a consecutive sequence of input tiles * * \par Overview * The input is evenly partitioned into \p p segments, where \p p is * constant and corresponds loosely to the number of thread blocks that may * actively reside on the target device. Each segment is comprised of * consecutive tiles, where a tile is a small, constant-sized unit of input * to be processed to completion before the thread block terminates or * obtains more work. The kernel invokes \p p thread blocks, each * of which iteratively consumes a segment of n/p elements * in tile-size increments. */ GRID_MAPPING_RAKE, /** * \brief An a "strip mining" access pattern in which the input tiles assigned * to each thread block are separated by a stride equal to the the extent of * the grid. * * \par Overview * The input is evenly partitioned into \p p sets, where \p p is * constant and corresponds loosely to the number of thread blocks that may * actively reside on the target device. Each set is comprised of * data tiles separated by stride \p tiles, where a tile is a small, * constant-sized unit of input to be processed to completion before the * thread block terminates or obtains more work. The kernel invokes \p p * thread blocks, each of which iteratively consumes a segment of * n/p elements in tile-size increments. */ GRID_MAPPING_STRIP_MINE, /** * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. * * \par Overview * The input is treated as a queue to be dynamically consumed by a grid of * thread blocks. Work is atomically dequeued in tiles, where a tile is a * unit of input to be processed to completion before the thread block * terminates or obtains more work. The grid size \p p is constant, * loosely corresponding to the number of thread blocks that may actively * reside on the target device. 
*/ GRID_MAPPING_DYNAMIC, }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/grid/grid_queue.cuh000066400000000000000000000167701463375617100175530ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::GridQueue is a descriptor utility for dynamic queue management. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN /** * @brief GridQueue is a descriptor utility for dynamic queue management. * * @par Overview * GridQueue descriptors provides abstractions for "filling" or * "draining" globally-shared vectors. * * @par * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, * returning a unique offset for the calling thread to write its items. * The GridQueue maintains the total "fill-size". The fill counter must be reset * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that * will be filling. * * @par * Similarly, a "draining" GridQueue works by works by atomically-incrementing a * zero-initialized counter, returning a unique offset for the calling thread to * read its items. Threads can safely drain until the array's logical fill-size is * exceeded. The drain counter must be reset using GridQueue::ResetDrain or * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size * is simply the number of elements in the array.) 
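 *
 * @par
 * A minimal drain-side sketch (illustrative; the kernel name, the @p TILE_ITEMS
 * value, and the item-processing step are assumptions, not part of this header).
 * The host resets the descriptor with FillAndResetDrain(num_items), then each
 * thread block repeatedly reserves tiles with Drain():
 *
 * @code
 * constexpr int TILE_ITEMS = 512; // tile size (assumed)
 *
 * __global__ void DrainKernel(cub::GridQueue<int> queue, int num_items)
 * {
 *     __shared__ int tile_offset;
 *     while (true)
 *     {
 *         if (threadIdx.x == 0)
 *         {
 *             tile_offset = queue.Drain(TILE_ITEMS); // atomically reserve one tile
 *         }
 *         __syncthreads();
 *         if (tile_offset >= num_items)
 *         {
 *             break; // queue exhausted; all threads see the same offset
 *         }
 *         // ... process items in [tile_offset, min(tile_offset + TILE_ITEMS, num_items)) ...
 *         __syncthreads(); // done with tile_offset before thread 0 overwrites it
 *     }
 * }
 * @endcode
 *
 * @par
 * On the host (sketch): allocate GridQueue<int>::AllocationSize() bytes of device
 * storage, construct a cub::GridQueue<int> around that storage, and call
 * FillAndResetDrain(num_items, stream) before launching the kernel above.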
* * @par * Iterative work management can be implemented simply with a pair of flip-flopping * work buffers, each with an associated set of fill and drain GridQueue descriptors. * * @tparam OffsetT Signed integer type for global offsets */ template class GridQueue { private: /// Counter indices enum { FILL = 0, DRAIN = 1, }; /// Pair of counters OffsetT* d_counters; public: /// Returns the device allocation size in bytes needed to construct a GridQueue instance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE static size_t AllocationSize() { return sizeof(OffsetT) * 2; } /// Constructs an invalid GridQueue descriptor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE GridQueue() : d_counters(NULL) {} /** * @brief Constructs a GridQueue descriptor around the device storage allocation * * @param d_storage * Device allocation to back the GridQueue. Must be at least as big as * AllocationSize(). */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE GridQueue(void* d_storage) : d_counters((OffsetT*) d_storage) {} /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for /// draining in the next kernel instance. To be called by the host or by a kernel prior to that /// which will be draining. _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t FillAndResetDrain(OffsetT fill_size, cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET( NV_IS_DEVICE, ((void) stream; d_counters[FILL] = fill_size; d_counters[DRAIN] = 0; result = cudaSuccess;), (OffsetT counters[2]; counters[FILL] = fill_size; counters[DRAIN] = 0; result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));)); return result; } /// This operation resets the drain so that it may advance to meet the existing fill-size. /// To be called by the host or by a kernel prior to that which will be draining. _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t ResetDrain(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET(NV_IS_DEVICE, ((void) stream; d_counters[DRAIN] = 0; result = cudaSuccess;), (result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));)); return result; } /// This operation resets the fill counter. /// To be called by the host or by a kernel prior to that which will be filling. _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t ResetFill(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET(NV_IS_DEVICE, ((void) stream; d_counters[FILL] = 0; result = cudaSuccess;), (result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));)); return result; } /// Returns the fill-size established by the parent or by the previous kernel. _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t FillSize(OffsetT& fill_size, cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET(NV_IS_DEVICE, ((void) stream; fill_size = d_counters[FILL]; result = cudaSuccess;), (result = CubDebug( cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));)); return result; } /// Drain @p num_items from the queue. Returns offset from which to read items. /// To be called from CUDA kernel. _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT Drain(OffsetT num_items) { return atomicAdd(d_counters + DRAIN, num_items); } /// Fill @p num_items into the queue. Returns offset from which to write items. /// To be called from CUDA kernel. 
_CCCL_DEVICE _CCCL_FORCEINLINE OffsetT Fill(OffsetT num_items) { return atomicAdd(d_counters + FILL, num_items); } }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Reset grid queue (call with 1 block of 1 thread) */ template __global__ void FillAndResetDrainKernel(GridQueue grid_queue, OffsetT num_items) { grid_queue.FillAndResetDrain(num_items); } #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/host/000077500000000000000000000000001463375617100147365ustar00rootroot00000000000000cccl-2.5.0/cub/cub/host/mutex.cuh000066400000000000000000000047641463375617100166140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Simple portable mutex */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN /** * Wraps std::mutex * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed * in a future release. Use `std::mutex` instead. */ struct CUB_DEPRECATED Mutex { std::mutex mtx; void Lock() { mtx.lock(); } void Unlock() { mtx.unlock(); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/000077500000000000000000000000001463375617100156125ustar00rootroot00000000000000cccl-2.5.0/cub/cub/iterator/arg_index_input_iterator.cuh000066400000000000000000000201361463375617100234050ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A random-access input wrapper for pairing dereferenced values with their corresponding * indices (forming \p KeyValuePair tuples). * * @par Overview * - ArgIndexInputIteratorTwraps a random access input iterator @p itr of type @p InputIteratorT. * Dereferencing an ArgIndexInputIteratorTat offset @p i produces a @p KeyValuePair value whose * @p key field is @p i and whose @p value field is itr[i]. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. 
* * @par Snippet * The code snippet below illustrates the use of @p ArgIndexInputIteratorTto * dereference an array of doubles * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::ArgIndexInputIterator itr(d_in); * * // Within device code: * typedef typename cub::ArgIndexInputIterator::value_type Tuple; * Tuple item_offset_pair.key = *itr; * printf("%f @ %d\n", * item_offset_pair.value, * item_offset_pair.key); // 8.0 @ 0 * * itr = itr + 6; * item_offset_pair.key = *itr; * printf("%f @ %d\n", * item_offset_pair.value, * item_offset_pair.key); // 9.0 @ 6 * * @endcode * * @tparam InputIteratorT * The value type of the wrapped input iterator * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) * * @tparam OutputValueT * The paired value type of the tuple (Default: value type of input iterator) */ template > class ArgIndexInputIterator { public: // Required iterator traits /// My own type typedef ArgIndexInputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef KeyValuePair value_type; /// The type of a pointer to an element the iterator can point to typedef value_type* pointer; /// The type of a reference to an element the iterator can point to typedef value_type reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type iterator_category; #else /// The iterator category typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: InputIteratorT itr; difference_type offset; public: /** * @param itr * Input iterator to wrap * * @param offset * OffsetT (in items) from @p itr denoting the position of the iterator */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ArgIndexInputIterator(InputIteratorT itr, difference_type offset = 0) : itr(itr) , offset(offset) {} /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; offset++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { offset++; return *this; } /// Indirection _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const { value_type retval; retval.value = itr[offset]; retval.key = offset; return retval; } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval(itr, offset + n); return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { offset += n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval(itr, offset - n); return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { offset -= n; return *this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_type other) const { return offset - other.offset; } /// Array subscript template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](Distance n) const { self_type 
offset = (*this) + n; return *offset; } /// Structure dereference _CCCL_HOST_DEVICE _CCCL_FORCEINLINE pointer operator->() { return &(*(*this)); } /// Equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_type& rhs) { return ((itr == rhs.itr) && (offset == rhs.offset)); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) { return ((itr != rhs.itr) || (offset != rhs.offset)); } /// Normalize _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void normalize() { itr += offset; offset = 0; } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) { return os; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/cache_modified_input_iterator.cuh000066400000000000000000000174041463375617100243540ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #if !defined(_CCCL_COMPILER_NVRTC) # include # include #else # include #endif #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A random-access input wrapper for dereferencing array values using a PTX cache load * modifier. * * @par Overview * - CacheModifiedInputIterator is a random-access input iterator that wraps a native * device pointer of type ValueType*. @p ValueType references are * made by reading @p ValueType values through loads modified by @p MODIFIER. 
* - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * * @par Snippet * The code snippet below illustrates the use of @p CacheModifiedInputIterator to * dereference a device array of double using the "ldg" PTX load modifier * (i.e., load values through texture cache). * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::CacheModifiedInputIterator itr(d_in); * * // Within device code: * printf("%f\n", itr[0]); // 8.0 * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * * @endcode * * @tparam CacheLoadModifier * The cub::CacheLoadModifier to use when accessing data * * @tparam ValueType * The value type of this iterator * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) */ template class CacheModifiedInputIterator { public: // Required iterator traits /// My own type typedef CacheModifiedInputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef ValueType value_type; /// The type of a pointer to an element the iterator can point to typedef ValueType* pointer; /// The type of a reference to an element the iterator can point to typedef ValueType reference; #if !defined(_CCCL_COMPILER_NVRTC) # if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods using iterator_category = typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type; # else // THRUST_VERSION < 100700 using iterator_category = std::random_access_iterator_tag; # endif // THRUST_VERSION #else // defined(_CCCL_COMPILER_NVRTC) using iterator_category = ::cuda::std::random_access_iterator_tag; #endif // defined(_CCCL_COMPILER_NVRTC) public: /// Wrapped native pointer ValueType* ptr; /// Constructor template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CacheModifiedInputIterator(QualifiedValueType* ptr) ///< Native pointer to wrap : ptr(const_cast::type*>(ptr)) {} /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; ptr++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { ptr++; return *this; } /// Indirection _CCCL_DEVICE _CCCL_FORCEINLINE reference operator*() const { return ThreadLoad(ptr); } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval(ptr + n); return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { ptr += n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval(ptr - n); return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { ptr -= n; return *this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_type other) 
const { return ptr - other.ptr; } /// Array subscript template _CCCL_DEVICE _CCCL_FORCEINLINE reference operator[](Distance n) const { return ThreadLoad(ptr + n); } /// Structure dereference _CCCL_DEVICE _CCCL_FORCEINLINE pointer operator->() { return &ThreadLoad(ptr); } /// Equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_type& rhs) const { return (ptr == rhs.ptr); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) const { return (ptr != rhs.ptr); } /// ostream operator #if !defined(_CCCL_COMPILER_NVRTC) friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) { return os; } #endif }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/cache_modified_output_iterator.cuh000066400000000000000000000172071463375617100245560ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A random-access output wrapper for storing array values using a PTX cache-modifier. * * @par Overview * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native * device pointer of type ValueType*. @p ValueType references are * made by writing @p ValueType values through stores modified by @p MODIFIER. 
* - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", * "STORE_CG", "STORE_CS", "STORE_WT", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * * @par Snippet * The code snippet below illustrates the use of @p CacheModifiedOutputIterator to * dereference a device array of doubles using the "wt" PTX load modifier * (i.e., write-through to system memory). * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array * double *d_out; // e.g., [, , , , , , ] * * // Create an iterator wrapper * cub::CacheModifiedOutputIterator itr(d_out); * * // Within device code: * itr[0] = 8.0; * itr[1] = 66.0; * itr[55] = 24.0; * * @endcode * * @par Usage Considerations * - Can only be dereferenced within device code * * @tparam CacheStoreModifier * The cub::CacheStoreModifier to use when accessing data * * @tparam ValueType * The value type of this iterator * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) */ template class CacheModifiedOutputIterator { private: // Proxy object struct Reference { ValueType* ptr; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Reference(ValueType* ptr) : ptr(ptr) {} /// Assignment _CCCL_DEVICE _CCCL_FORCEINLINE ValueType operator=(ValueType val) { ThreadStore(ptr, val); return val; } }; public: // Required iterator traits /// My own type typedef CacheModifiedOutputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef void value_type; /// The type of a pointer to an element the iterator can point to typedef void pointer; /// The type of a reference to an element the iterator can point to typedef Reference reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type iterator_category; #else /// The iterator category typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: ValueType* ptr; public: /** * @param ptr * Native pointer to wrap */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CacheModifiedOutputIterator(QualifiedValueType* ptr) : ptr(const_cast::type*>(ptr)) {} /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; ptr++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { ptr++; return *this; } /// Indirection _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const { return Reference(ptr); } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval(ptr + n); return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { ptr += n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval(ptr - n); return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { ptr -= n; return 
*this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_type other) const { return ptr - other.ptr; } /// Array subscript template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](Distance n) const { return Reference(ptr + n); } /// Equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_type& rhs) { return (ptr == rhs.ptr); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) { return (ptr != rhs.ptr); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { return os; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/constant_input_iterator.cuh000066400000000000000000000160621463375617100233010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A random-access input generator for dereferencing a sequence of homogeneous values * * @par Overview * - Read references to a ConstantInputIteratorTiterator always return the supplied constant * of type @p ValueType. * - Can be used with any data type. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. 
* * @par Snippet * The code snippet below illustrates the use of @p ConstantInputIteratorTto * dereference a sequence of homogeneous doubles. * @par * @code * #include // or equivalently * * cub::ConstantInputIterator itr(5.0); * * printf("%f\n", itr[0]); // 5.0 * printf("%f\n", itr[1]); // 5.0 * printf("%f\n", itr[2]); // 5.0 * printf("%f\n", itr[50]); // 5.0 * * @endcode * * @tparam ValueType * The value type of this iterator * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) */ template class ConstantInputIterator { public: // Required iterator traits /// My own type typedef ConstantInputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef ValueType value_type; /// The type of a pointer to an element the iterator can point to typedef ValueType* pointer; /// The type of a reference to an element the iterator can point to typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type iterator_category; #else /// The iterator category typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: ValueType val; OffsetT offset; #ifdef _WIN32 // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; #endif public: /** * @param val * Starting value for the iterator instance to report * * @param offset * Base offset */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ConstantInputIterator(ValueType val, OffsetT offset = 0) : val(val) , offset(offset) {} /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; offset++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { offset++; return *this; } /// Indirection _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const { return val; } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval(val, offset + n); return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { offset += n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval(val, offset - n); return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { offset -= n; return *this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_type other) const { return offset - other.offset; } /// Array subscript template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](Distance /*n*/) const { return val; } /// Structure dereference _CCCL_HOST_DEVICE _CCCL_FORCEINLINE pointer operator->() { return &val; } /// Equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_type& rhs) const { return (offset == rhs.offset) && ((val == rhs.val)); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) const { return (offset != rhs.offset) || (val != rhs.val); } /// ostream operator friend 
std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.val << "," << itr.offset << "]"; return os; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/counting_input_iterator.cuh000066400000000000000000000161031463375617100232720ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #if !defined(_CCCL_COMPILER_NVRTC) # include # include #else # include #endif #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A random-access input generator for dereferencing a sequence of incrementing integer values. * * @par Overview * - After initializing a CountingInputIteratorTto a certain integer @p base, read references * at @p offset will return the value @p base + @p offset. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * * @par Snippet * The code snippet below illustrates the use of @p CountingInputIteratorTto * dereference a sequence of incrementing integers. 
* @par * @code * #include // or equivalently * * cub::CountingInputIterator itr(5); * * printf("%d\n", itr[0]); // 5 * printf("%d\n", itr[1]); // 6 * printf("%d\n", itr[2]); // 7 * printf("%d\n", itr[50]); // 55 * * @endcode * * @tparam ValueType * The value type of this iterator * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) */ template class CountingInputIterator { public: // Required iterator traits /// My own type typedef CountingInputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef ValueType value_type; /// The type of a pointer to an element the iterator can point to typedef ValueType* pointer; /// The type of a reference to an element the iterator can point to typedef ValueType reference; #if !defined(_CCCL_COMPILER_NVRTC) # if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods using iterator_category = typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type; # else // THRUST_VERSION < 100700 using iterator_category = std::random_access_iterator_tag; # endif // THRUST_VERSION #else // defined(_CCCL_COMPILER_NVRTC) using iterator_category = ::cuda::std::random_access_iterator_tag; #endif // defined(_CCCL_COMPILER_NVRTC) private: ValueType val; public: /** * @param val * Starting value for the iterator instance to report */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CountingInputIterator(const ValueType& val) : val(val) {} /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; val++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { val++; return *this; } /// Indirection _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const { return val; } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval(val + (ValueType) n); return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { val += (ValueType) n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval(val - (ValueType) n); return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { val -= n; return *this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_type other) const { return (difference_type) (val - other.val); } /// Array subscript template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](Distance n) const { return val + (ValueType) n; } /// Structure dereference _CCCL_HOST_DEVICE _CCCL_FORCEINLINE pointer operator->() { return &val; } /// Equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_type& rhs) const { return (val == rhs.val); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) const { return (val != rhs.val); } /// ostream operator #if !defined(_CCCL_COMPILER_NVRTC) friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.val << "]"; return os; } #endif }; CUB_NAMESPACE_END 
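// Usage note (an illustrative sketch): a CountingInputIterator is typically handed to a
// CUB device algorithm in place of a materialized index buffer. The example below uses the
// standard two-phase cub::DeviceReduce::Sum calling convention; the buffer names d_out and
// d_temp_storage are assumptions made for this sketch.
//
// #include <cub/device/device_reduce.cuh>
// #include <cub/iterator/counting_input_iterator.cuh>
//
// int num_items = 1024;
// int* d_out = nullptr; // device pointer to a single int holding the result
// cudaMalloc(&d_out, sizeof(int));
//
// cub::CountingInputIterator<int> itr(0); // yields 0, 1, 2, ...
//
// // First call computes the required temporary storage size; second call runs the reduction
// void* d_temp_storage = nullptr;
// size_t temp_storage_bytes = 0;
// cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_out, num_items);
// cudaMalloc(&d_temp_storage, temp_storage_bytes);
// cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_out, num_items);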
cccl-2.5.0/cub/cub/iterator/discard_output_iterator.cuh000066400000000000000000000141621463375617100232610ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A discard iterator */ template class DiscardOutputIterator { public: // Required iterator traits /// My own type typedef DiscardOutputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef void value_type; /// The type of a pointer to an element the iterator can point to typedef void pointer; /// The type of a reference to an element the iterator can point to typedef void reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type iterator_category; #else /// The iterator category typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: OffsetT offset; #if defined(_WIN32) || !defined(_WIN64) // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))] = {}; #endif public: /** * @param offset * Base offset */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE DiscardOutputIterator(OffsetT offset = 0) : offset(offset) {} /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; offset++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { offset++; return *this; } /// Indirection _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator*() { // return self reference, which can be assigned to anything return *this; } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval(offset + n); return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { offset += n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval(offset - n); return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { offset -= n; return *this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_type other) const { return offset - other.offset; } /// Array subscript template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator[](Distance n) { // return self reference, which can be assigned to anything return *this; } /// Structure dereference _CCCL_HOST_DEVICE _CCCL_FORCEINLINE pointer operator->() { return; } /// Assignment to anything else (no-op) template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void operator=(T const&) {} /// Cast to void* operator _CCCL_HOST_DEVICE _CCCL_FORCEINLINE operator void*() const { return NULL; } /// Equal to _CCCL_HOST_DEVICE 
_CCCL_FORCEINLINE bool operator==(const self_type& rhs) { return (offset == rhs.offset); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) { return (offset != rhs.offset); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.offset << "]"; return os; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/tex_obj_input_iterator.cuh000066400000000000000000000240071463375617100231000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A random-access input wrapper for dereferencing array values through texture cache. * Uses newer Kepler-style texture objects. * * @par Overview * - TexObjInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be * created by the host thread, but can be used by any descendant kernel. * - Compatible with Thrust API v1.7 or newer. 
* * @par Snippet * The code snippet below illustrates the use of @p TexObjInputIterator to * dereference a device array of doubles through texture cache. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array * int num_items; // e.g., 7 * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::TexObjInputIterator itr; * itr.BindTexture(d_in, sizeof(double) * num_items); * ... * * // Within device code: * printf("%f\n", itr[0]); // 8.0 * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * * ... * itr.UnbindTexture(); * * @endcode * * @tparam T * The value type of this iterator * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) */ template class TexObjInputIterator { public: // Required iterator traits /// My own type typedef TexObjInputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef T value_type; /// The type of a pointer to an element the iterator can point to typedef T* pointer; /// The type of a reference to an element the iterator can point to typedef T reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type iterator_category; #else /// The iterator category typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: // Largest texture word we can use in device typedef typename UnitWord::TextureWord TextureWord; // Number of texture words per T enum { TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) }; private: T* ptr; difference_type tex_offset; cudaTextureObject_t tex_obj; public: /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TexObjInputIterator() : ptr(NULL) , tex_offset(0) , tex_obj(0) {} /** * @brief Use this iterator to bind @p ptr with a texture reference * * @param ptr * Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment * * @param bytes * Number of bytes in the range * * @param tex_offset * OffsetT (in items) from @p ptr denoting the position of the iterator */ template cudaError_t BindTexture(QualifiedT* ptr, size_t bytes, size_t tex_offset = 0) { this->ptr = const_cast::type*>(ptr); this->tex_offset = static_cast(tex_offset); cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); cudaResourceDesc res_desc; cudaTextureDesc tex_desc; memset(&res_desc, 0, sizeof(cudaResourceDesc)); memset(&tex_desc, 0, sizeof(cudaTextureDesc)); res_desc.resType = cudaResourceTypeLinear; res_desc.res.linear.devPtr = this->ptr; res_desc.res.linear.desc = channel_desc; res_desc.res.linear.sizeInBytes = bytes; tex_desc.readMode = cudaReadModeElementType; return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL)); } /// Unbind this iterator from its texture reference cudaError_t UnbindTexture() { return CubDebug(cudaDestroyTextureObject(tex_obj)); } /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; tex_offset++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { tex_offset++; return *this; } /// Indirection _CCCL_HOST_DEVICE 
_CCCL_FORCEINLINE reference operator*() const { NV_IF_TARGET(NV_IS_HOST, (return ptr[tex_offset];), (return this->device_deref();)); } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval; retval.ptr = ptr; retval.tex_obj = tex_obj; retval.tex_offset = tex_offset + n; return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { tex_offset += n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval; retval.ptr = ptr; retval.tex_obj = tex_obj; retval.tex_offset = tex_offset - n; return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { tex_offset -= n; return *this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_type other) const { return tex_offset - other.tex_offset; } /// Array subscript template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](Distance n) const { self_type offset = (*this) + n; return *offset; } /// Structure dereference _CCCL_HOST_DEVICE _CCCL_FORCEINLINE pointer operator->() { return &(*(*this)); } /// Equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_type& rhs) const { return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) const { return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "cub::TexObjInputIterator( ptr=" << itr.ptr << ", offset=" << itr.tex_offset << ", tex_obj=" << itr.tex_obj << " )"; return os; } private: // This is hoisted out of operator* because #pragma can't be used inside of // NV_IF_TARGET _CCCL_DEVICE _CCCL_FORCEINLINE reference device_deref() const { // Move array of uninitialized words, then alias and assign to return // value TextureWord words[TEXTURE_MULTIPLE]; const auto tex_idx_base = tex_offset * TEXTURE_MULTIPLE; #pragma unroll for (int i = 0; i < TEXTURE_MULTIPLE; ++i) { words[i] = tex1Dfetch(tex_obj, tex_idx_base + i); } // Load from words return *reinterpret_cast(words); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/tex_ref_input_iterator.cuh000066400000000000000000000111361463375617100231010ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN /** * @brief A random-access input wrapper for dereferencing array values through texture cache. * * @deprecated [Since 1.13.0] The CUDA texture management APIs used by * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead. * * @par Overview * - TexRefInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. * - The @p UNIQUE_ID template parameter is used to statically name the underlying texture * reference. Only one TexRefInputIterator instance can be bound at any given time for a * specific combination of (1) data type @p T, (2) @p UNIQUE_ID, (3) host * thread, and (4) compilation .o unit. * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be * created by the host thread and used by a top-level kernel (i.e. the one which is launched * from the host). * - Compatible with Thrust API v1.7 or newer. * * @par Snippet * The code snippet below illustrates the use of @p TexRefInputIterator to * dereference a device array of doubles through texture cache. * @par * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array * int num_items; // e.g., 7 * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] * * // Create an iterator wrapper * cub::TexRefInputIterator itr; * itr.BindTexture(d_in, sizeof(double) * num_items); * ... * * // Within device code: * printf("%f\n", itr[0]); // 8.0 * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * * ... 
* itr.UnbindTexture(); * * @endcode * * @tparam T * The value type of this iterator * * @tparam UNIQUE_ID * A globally-unique identifier (within the compilation unit) to name the underlying texture reference * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) */ template using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/iterator/transform_input_iterator.cuh000066400000000000000000000175511463375617100234670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Random-access iterator types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer # include # include #endif // THRUST_VERSION CUB_NAMESPACE_BEGIN /** * @brief A random-access input wrapper for transforming dereferenced values. * * @par Overview * - TransformInputIteratorTwraps a unary conversion functor of type * @p ConversionOp and a random-access input iterator of type InputIteratorT, * using the former to produce references of type @p ValueType from the latter. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. 
* * @par Snippet * The code snippet below illustrates the use of @p TransformInputIteratorTto * dereference an array of integers, tripling the values and converting them to doubles. * @par * @code * #include // or equivalently * * // Functor for tripling integer values and converting to doubles * struct TripleDoubler * { * __host__ __device__ __forceinline__ * double operator()(const int &a) const { * return double(a * 3); * } * }; * * // Declare, allocate, and initialize a device array * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * TripleDoubler conversion_op; * * // Create an iterator wrapper * cub::TransformInputIterator itr(d_in, conversion_op); * * // Within device code: * printf("%f\n", itr[0]); // 24.0 * printf("%f\n", itr[1]); // 18.0 * printf("%f\n", itr[6]); // 27.0 * * @endcode * * @tparam ValueType * The value type of this iterator * * @tparam ConversionOp * Unary functor type for mapping objects of type @p InputType to type @p ValueType. * Must have member ValueType operator()(const InputType &datum). * * @tparam InputIteratorT * The type of the wrapped input iterator * * @tparam OffsetT * The difference type of this iterator (Default: @p ptrdiff_t) */ template class TransformInputIterator { public: // Required iterator traits /// My own type typedef TransformInputIterator self_type; /// Type to express the result of subtracting one iterator from another typedef OffsetT difference_type; /// The type of the element the iterator can point to typedef ValueType value_type; /// The type of a pointer to an element the iterator can point to typedef ValueType* pointer; /// The type of a reference to an element the iterator can point to typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference>::type iterator_category; #else /// The iterator category typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: ConversionOp conversion_op; InputIteratorT input_itr; public: /** * @param input_itr * Input iterator to wrap * * @param conversion_op * Conversion functor to wrap */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE TransformInputIterator(InputIteratorT input_itr, ConversionOp conversion_op) : conversion_op(conversion_op) , input_itr(input_itr) {} /// Postfix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int) { self_type retval = *this; input_itr++; return retval; } /// Prefix increment _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++() { input_itr++; return *this; } /// Indirection _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const { return conversion_op(*input_itr); } /// Addition template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator+(Distance n) const { self_type retval(input_itr + n, conversion_op); return retval; } /// Addition assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator+=(Distance n) { input_itr += n; return *this; } /// Subtraction template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator-(Distance n) const { self_type retval(input_itr - n, conversion_op); return retval; } /// Subtraction assignment template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type& operator-=(Distance n) { input_itr -= n; return *this; } /// Distance _CCCL_HOST_DEVICE _CCCL_FORCEINLINE 
difference_type operator-(self_type other) const { return input_itr - other.input_itr; } /// Array subscript template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](Distance n) const { return conversion_op(input_itr[n]); } /// Equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_type& rhs) const { return (input_itr == rhs.input_itr); } /// Not equal to _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_type& rhs) const { return (input_itr != rhs.input_itr); } /// ostream operator friend std::ostream& operator<<(std::ostream& os, const self_type& /* itr */) { return os; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/thread/000077500000000000000000000000001463375617100152305ustar00rootroot00000000000000cccl-2.5.0/cub/cub/thread/thread_load.cuh000066400000000000000000000417121463375617100202040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Thread utilities for reading memory using PTX cache modifiers. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- /** * @brief Enumeration of cache modifiers for memory load operations. 
*/ enum CacheLoadModifier { LOAD_DEFAULT, ///< Default (no modifier) LOAD_CA, ///< Cache at all levels LOAD_CG, ///< Cache at global level LOAD_CS, ///< Cache streaming (likely to be accessed once) LOAD_CV, ///< Cache as volatile (including cached system lines) LOAD_LDG, ///< Cache as texture LOAD_VOLATILE, ///< Volatile (any memory space) }; /** * @name Thread I/O (cache modified) * @{ */ /** * @brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. * Can be used to load any data type. * * @par Example * @code * #include // or equivalently * * // 32-bit load using cache-global modifier: * int *d_in; * int val = cub::ThreadLoad(d_in + threadIdx.x); * * // 16-bit load using default modifier * short *d_in; * short val = cub::ThreadLoad(d_in + threadIdx.x); * * // 256-bit load using cache-volatile modifier * double4 *d_in; * double4 val = cub::ThreadLoad(d_in + threadIdx.x); * * // 96-bit load using cache-streaming modifier * struct TestFoo { bool a; short b; }; * TestFoo *d_struct; * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); * \endcode * * @tparam MODIFIER * [inferred] CacheLoadModifier enumeration * * @tparam InputIteratorT * [inferred] Input iterator type \iterator */ template _CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr); //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Helper structure for templated load iteration (inductive case) template struct IterateThreadLoad { template static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* ptr, T* vals) { vals[COUNT] = ThreadLoad(ptr + COUNT); IterateThreadLoad::template Load(ptr, vals); } template static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT itr, T* vals) { vals[COUNT] = itr[COUNT]; IterateThreadLoad::Dereference(itr, vals); } }; /// Helper structure for templated load iteration (termination case) template struct IterateThreadLoad { template static _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T const* /*ptr*/, T* /*vals*/) {} template static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(InputIteratorT /*itr*/, T* /*vals*/) {} }; /** * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier */ # define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE uint4 ThreadLoad(uint4 const* ptr) \ { \ uint4 retval; \ asm volatile("ld." #ptx_modifier ".v4.u32 {%0, %1, %2, %3}, [%4];" \ : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) \ : _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE ulonglong2 ThreadLoad(ulonglong2 const* ptr) \ { \ ulonglong2 retval; \ asm volatile("ld." #ptx_modifier ".v2.u64 {%0, %1}, [%2];" \ : "=l"(retval.x), "=l"(retval.y) \ : _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier */ # define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE ushort4 ThreadLoad(ushort4 const* ptr) \ { \ ushort4 retval; \ asm volatile("ld." #ptx_modifier ".v4.u16 {%0, %1, %2, %3}, [%4];" \ : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) \ : _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE uint2 ThreadLoad(uint2 const* ptr) \ { \ uint2 retval; \ asm volatile("ld." 
#ptx_modifier ".v2.u32 {%0, %1}, [%2];" \ : "=r"(retval.x), "=r"(retval.y) \ : _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long ThreadLoad( \ unsigned long long const* ptr) \ { \ unsigned long long retval; \ asm volatile("ld." #ptx_modifier ".u64 %0, [%1];" : "=l"(retval) : _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier */ # define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int ThreadLoad(unsigned int const* ptr) \ { \ unsigned int retval; \ asm volatile("ld." #ptx_modifier ".u32 %0, [%1];" : "=r"(retval) : _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier */ # define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned short ThreadLoad( \ unsigned short const* ptr) \ { \ unsigned short retval; \ asm volatile("ld." #ptx_modifier ".u16 %0, [%1];" : "=h"(retval) : _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier */ # define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned char ThreadLoad( \ unsigned char const* ptr) \ { \ unsigned short retval; \ asm volatile( \ "{" \ " .reg .u8 datum;" \ " ld." #ptx_modifier ".u8 datum, [%1];" \ " cvt.u16.u8 %0, datum;" \ "}" \ : "=h"(retval) \ : _CUB_ASM_PTR_(ptr)); \ return (unsigned char) retval; \ } /** * Define powers-of-two ThreadLoad specializations for the given Cache load modifier */ # define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ _CUB_LOAD_16(cub_modifier, ptx_modifier) \ _CUB_LOAD_8(cub_modifier, ptx_modifier) \ _CUB_LOAD_4(cub_modifier, ptx_modifier) \ _CUB_LOAD_2(cub_modifier, ptx_modifier) \ _CUB_LOAD_1(cub_modifier, ptx_modifier) /** * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers */ _CUB_LOAD_ALL(LOAD_CA, ca) _CUB_LOAD_ALL(LOAD_CG, cg) _CUB_LOAD_ALL(LOAD_CS, cs) _CUB_LOAD_ALL(LOAD_CV, cv) _CUB_LOAD_ALL(LOAD_LDG, global.nc) // Macro cleanup # undef _CUB_LOAD_ALL # undef _CUB_LOAD_1 # undef _CUB_LOAD_2 # undef _CUB_LOAD_4 # undef _CUB_LOAD_8 # undef _CUB_LOAD_16 /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ template _CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *itr; } /** * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *ptr; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) { T retval = *reinterpret_cast(ptr); return retval; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoadVolatilePointer(T* ptr, Int2Type /*is_primitive*/) { // Word type for memcopying typedef typename UnitWord::VolatileWord VolatileWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); T retval; VolatileWord* words = reinterpret_cast(&retval); IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(reinterpret_cast(ptr), words); return retval; } /** * ThreadLoad definition for 
LOAD_VOLATILE modifier on pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); } /** * ThreadLoad definition for generic modifiers on pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadLoad(T const* ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { typedef typename UnitWord::DeviceWord DeviceWord; constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); DeviceWord words[DEVICE_MULTIPLE]; IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( reinterpret_cast(const_cast(ptr)), words); return *reinterpret_cast(words); } /** * ThreadLoad definition for generic modifiers */ template _CCCL_DEVICE _CCCL_FORCEINLINE cub::detail::value_t ThreadLoad(InputIteratorT itr) { // Apply tags for partial-specialization return ThreadLoad(itr, Int2Type(), Int2Type<::cuda::std::is_pointer::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/thread/thread_operators.cuh000066400000000000000000000273621463375617100213100ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * Simple binary operator functor types */ /****************************************************************************** * Simple functor operators ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include _CCCL_SUPPRESS_DEPRECATED_PUSH #include _CCCL_SUPPRESS_DEPRECATED_POP #include #include CUB_NAMESPACE_BEGIN /// @brief Inequality functor (wraps equality functor) template struct InequalityWrapper { /// Wrapped equality operator EqualityOp op; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE InequalityWrapper(EqualityOp op) : op(op) {} /// Boolean inequality operator, returns `t != u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(T&& t, U&& u) { return !op(::cuda::std::forward(t), ::cuda::std::forward(u)); } }; #if _CCCL_STD_VER > 2011 using Equality = ::cuda::std::equal_to<>; using Inequality = ::cuda::std::not_equal_to<>; using Sum = ::cuda::std::plus<>; using Difference = ::cuda::std::minus<>; using Division = ::cuda::std::divides<>; #else /// @brief Default equality functor struct Equality { /// Boolean equality operator, returns `t == u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(T&& t, U&& u) const { return ::cuda::std::forward(t) == ::cuda::std::forward(u); } }; /// @brief Default inequality functor struct Inequality { /// Boolean inequality operator, returns `t != u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator()(T&& t, U&& u) const { return ::cuda::std::forward(t) != ::cuda::std::forward(u); } }; /// @brief Default sum functor struct Sum { /// Binary sum operator, returns `t + u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto operator()(T&& t, U&& u) const -> decltype(::cuda::std::forward(t) + ::cuda::std::forward(u)) { return ::cuda::std::forward(t) + ::cuda::std::forward(u); } }; /// @brief Default difference functor struct Difference { /// Binary difference operator, returns `t - u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto operator()(T&& t, U&& u) const -> decltype(::cuda::std::forward(t) - ::cuda::std::forward(u)) { return ::cuda::std::forward(t) - ::cuda::std::forward(u); } }; /// @brief Default division functor struct Division { /// Binary division operator, returns `t / u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto operator()(T&& t, U&& u) const -> decltype(::cuda::std::forward(t) / ::cuda::std::forward(u)) { return ::cuda::std::forward(t) / ::cuda::std::forward(u); } }; #endif /// @brief Default max functor struct Max { /// Boolean max operator, returns `(t > u) ? t : u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename ::cuda::std::common_type::type operator()(T&& t, U&& u) const { return CUB_MAX(t, u); } }; /// @brief Arg max functor (keeps the value and offset of the first occurrence /// of the larger item) struct ArgMax { /// Boolean max operator, preferring the item having the smaller offset in /// case of ties template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair operator()(const KeyValuePair& a, const KeyValuePair& b) const { // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) // return ((b.value > a.value) || // ((a.value == b.value) && (b.key < a.key))) // ? 
b : a; if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) { return b; } return a; } }; /// @brief Default min functor struct Min { /// Boolean min operator, returns `(t < u) ? t : u` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename ::cuda::std::common_type::type operator()(T&& t, U&& u) const { return CUB_MIN(t, u); } }; /// @brief Arg min functor (keeps the value and offset of the first occurrence /// of the smallest item) struct ArgMin { /// Boolean min operator, preferring the item having the smaller offset in /// case of ties template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair operator()(const KeyValuePair& a, const KeyValuePair& b) const { // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) // return ((b.value < a.value) || // ((a.value == b.value) && (b.key < a.key))) // ? b : a; if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) { return b; } return a; } }; namespace detail { template struct basic_binary_op_t { static constexpr bool value = false; }; template <> struct basic_binary_op_t { static constexpr bool value = true; }; template <> struct basic_binary_op_t { static constexpr bool value = true; }; template <> struct basic_binary_op_t { static constexpr bool value = true; }; } // namespace detail /// @brief Default cast functor template struct CastOp { /// Cast operator, returns `(B) a` template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE B operator()(A&& a) const { return (B) a; } }; /// @brief Binary operator wrapper for switching non-commutative scan arguments template class SwizzleScanOp { private: /// Wrapped scan operator ScanOp scan_op; public: /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} /// Switch the scan arguments template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T operator()(const T& a, const T& b) { T _a(a); T _b(b); return scan_op(_b, _a); } }; /** * @brief Reduce-by-segment functor. * * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative * combining operator `f(const T &x, const T &y)`, an instance of this functor * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose * `value` field is either `b.value` if `b.key` is non-zero, or * `f(a.value, b.value)` otherwise. * * ReduceBySegmentOp is an associative, non-commutative binary combining * operator for input sequences of cub::KeyValuePair pairings. Such sequences * are typically used to represent a segmented set of values to be reduced * and a corresponding set of {0,1}-valued integer "head flags" demarcating the * first value of each segment. 
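 *
 * @par Snippet
 * A minimal sketch of the operator's behavior, assuming hypothetical
 * head-flag/value pairings:
 * @code
 * // Two partial reductions of {head flag, value} pairs
 * cub::KeyValuePair<int, int> first;   // key = 0: spans no segment reset
 * cub::KeyValuePair<int, int> second;  // key = 1: spans one segment reset
 * first.key  = 0;  first.value  = 4;
 * second.key = 1;  second.value = 7;
 *
 * cub::ReduceBySegmentOp<cub::Sum> op;
 * cub::KeyValuePair<int, int> result = op(first, second);
 * // result.key   == 1  (head flags are summed)
 * // result.value == 7  (second spans a reset, so its value becomes the running aggregate)
 * @endcode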
* * @tparam ReductionOpT Binary reduction operator to apply to values */ template struct ReduceBySegmentOp { /// Wrapped reduction operator ReductionOpT op; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceBySegmentOp() {} /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceBySegmentOp(ReductionOpT op) : op(op) {} /** * @brief Scan operator * * @tparam KeyValuePairT * KeyValuePair pairing of T (value) and OffsetT (head flag) * * @param[in] first * First partial reduction * * @param[in] second * Second partial reduction */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePairT operator()(const KeyValuePairT& first, const KeyValuePairT& second) { KeyValuePairT retval; retval.key = first.key + second.key; #ifdef _NVHPC_CUDA // WAR bug on nvc++ if (second.key) { retval.value = second.value; } else { // If second.value isn't copied into a temporary here, nvc++ will // crash while compiling the TestScanByKeyWithLargeTypes test in // thrust/testing/scan_by_key.cu: auto v2 = second.value; retval.value = op(first.value, v2); } #else // not nvc++: // if (second.key) { // The second partial reduction spans a segment reset, so it's value // aggregate becomes the running aggregate // else { // The second partial reduction does not span a reset, so accumulate both // into the running aggregate // } retval.value = (second.key) ? second.value : op(first.value, second.value); #endif return retval; } }; /** * @tparam ReductionOpT Binary reduction operator to apply to values */ template struct ReduceByKeyOp { /// Wrapped reduction operator ReductionOpT op; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceByKeyOp() {} /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceByKeyOp(ReductionOpT op) : op(op) {} /** * @brief Scan operator * * @param[in] first First partial reduction * @param[in] second Second partial reduction */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePairT operator()(const KeyValuePairT& first, const KeyValuePairT& second) { KeyValuePairT retval = second; if (first.key == second.key) { retval.value = op(first.value, retval.value); } return retval; } }; template struct BinaryFlip { BinaryOpT binary_op; _CCCL_HOST_DEVICE explicit BinaryFlip(BinaryOpT binary_op) : binary_op(binary_op) {} template _CCCL_DEVICE auto operator()(T&& t, U&& u) -> decltype(binary_op(::cuda::std::forward(u), ::cuda::std::forward(t))) { return binary_op(::cuda::std::forward(u), ::cuda::std::forward(t)); } }; template _CCCL_HOST_DEVICE BinaryFlip MakeBinaryFlip(BinaryOpT binary_op) { return BinaryFlip(binary_op); } CUB_NAMESPACE_END cccl-2.5.0/cub/cub/thread/thread_reduce.cuh000066400000000000000000000145401463375617100205330ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Thread utilities for sequential reduction over statically-sized array types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) namespace internal { /** * @brief Sequential reduction over statically-sized array types * * @param[in] input * Input array * * @param[in] reduction_op * Binary reduction operator * * @param[in] prefix * Prefix to seed reduction with */ template > _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type /*length*/) { AccumT retval = prefix; #pragma unroll for (int i = 0; i < LENGTH; ++i) { retval = reduction_op(retval, input[i]); } return retval; } /** * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array, * seeded with the specified @p prefix. The aggregate is returned. * * @tparam LENGTH * LengthT of input array * * @tparam T * [inferred] The data type to be reduced. * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[in] reduction_op * Binary reduction operator * * @param[in] prefix * Prefix to seed reduction with */ template > _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } /** * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array. * The aggregate is returned. * * @tparam LENGTH * LengthT of input array * * @tparam T * [inferred] The data type to be reduced. * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T* input, ReductionOp reduction_op) { T prefix = input[0]; return ThreadReduce(input + 1, reduction_op, prefix); } /** * @brief Perform a sequential reduction over the statically-sized @p input array, * seeded with the specified @p prefix. The aggregate is returned. 
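 *
 * @par
 * A minimal sketch of seeding the reduction with a prefix from device code
 * (hypothetical values; these helpers live in the internal namespace):
 * @code
 * // Per-thread array of items
 * int items[4] = {1, 2, 3, 4};
 *
 * // Sum the items, seeded with a prefix of 10; aggregate == 20
 * int aggregate = cub::internal::ThreadReduce(items, cub::Sum(), 10);
 * @endcode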
* * @tparam LENGTH * [inferred] LengthT of @p input array * * @tparam T * [inferred] The data type to be reduced. * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[in] reduction_op * Binary reduction operator * * @param[in] prefix * Prefix to seed reduction with */ template > _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } /** * @brief Serial reduction with the specified operator * * @tparam LENGTH * [inferred] LengthT of @p input array * * @tparam T * [inferred] The data type to be reduced. * * @tparam ReductionOp * [inferred] Binary reduction operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op) { return ThreadReduce((T*) input, reduction_op); } } // namespace internal CUB_NAMESPACE_END cccl-2.5.0/cub/cub/thread/thread_scan.cuh000066400000000000000000000237011463375617100202070ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * Thread utilities for sequential prefix scan over statically-sized array types */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include CUB_NAMESPACE_BEGIN /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) namespace internal { /** * @name Sequential prefix scan over statically-sized array types * @{ */ /** * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanExclusive(T inclusive, T exclusive, T* input, T* output, ScanOp scan_op, Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) { inclusive = scan_op(exclusive, input[i]); output[i] = exclusive; exclusive = inclusive; } return inclusive; } /** * @brief Perform a sequential exclusive prefix scan over @p LENGTH elements of * the @p input array, seeded with the specified @p prefix. The aggregate is returned. * * @tparam LENGTH * LengthT of @p input and @p output arrays * * @tparam T * [inferred] The data type to be scanned. * * @tparam ScanOp * [inferred] Binary scan operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator * * @param[in] prefix * Prefix to seed scan with * * @param[in] apply_prefix * Whether or not the calling thread should apply its prefix. * If not, the first output element is undefined. * (Handy for preventing thread-0 from applying a prefix.) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanExclusive(T* input, T* output, ScanOp scan_op, T prefix, bool apply_prefix = true) { T inclusive = input[0]; if (apply_prefix) { inclusive = scan_op(prefix, inclusive); } output[0] = prefix; T exclusive = inclusive; return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); } /** * @brief Perform a sequential exclusive prefix scan over the statically-sized * @p input array, seeded with the specified @p prefix. The aggregate is returned. * * @tparam LENGTH * [inferred] LengthT of @p input and @p output arrays * * @tparam T * [inferred] The data type to be scanned. * * @tparam ScanOp * [inferred] Binary scan operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator * * @param[in] prefix * Prefix to seed scan with * * @param[in] apply_prefix * Whether or not the calling thread should apply its prefix. * (Handy for preventing thread-0 from applying a prefix.) 
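 *
 * @par
 * A minimal sketch of an exclusive prefix sum from device code (hypothetical
 * values; these helpers live in the internal namespace):
 * @code
 * int input[4] = {1, 2, 3, 4};
 * int output[4];
 *
 * // output becomes [0, 1, 3, 6]; the returned aggregate is 10
 * int aggregate = cub::internal::ThreadScanExclusive(input, output, cub::Sum(), 0);
 * @endcode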
*/ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanExclusive(T (&input)[LENGTH], T (&output)[LENGTH], ScanOp scan_op, T prefix, bool apply_prefix = true) { return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); } /** * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanInclusive(T inclusive, T* input, T* output, ScanOp scan_op, Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) { inclusive = scan_op(inclusive, input[i]); output[i] = inclusive; } return inclusive; } /** * @brief Perform a sequential inclusive prefix scan over * @p LENGTH elements of the @p input array. The aggregate is returned. * * @tparam LENGTH * LengthT of @p input and @p output arrays * * @tparam T * [inferred] The data type to be scanned. * * @tparam ScanOp * [inferred] Binary scan operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanInclusive(T* input, T* output, ScanOp scan_op) { T inclusive = input[0]; output[0] = inclusive; // Continue scan return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } /** * @brief Perform a sequential inclusive prefix scan over the * statically-sized @p input array. The aggregate is returned. * * @tparam LENGTH * [inferred] LengthT of @p input and @p output arrays * * @tparam T * [inferred] The data type to be scanned. * * @tparam ScanOp * [inferred] Binary scan operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanInclusive(T (&input)[LENGTH], T (&output)[LENGTH], ScanOp scan_op) { return ThreadScanInclusive((T*) input, (T*) output, scan_op); } /** * @brief Perform a sequential inclusive prefix scan over * @p LENGTH elements of the @p input array, seeded with the * specified @p prefix. The aggregate is returned. * * @tparam LENGTH * LengthT of @p input and @p output arrays * * @tparam T * [inferred] The data type to be scanned. * * @tparam ScanOp * [inferred] Binary scan operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator * * @param[in] prefix * Prefix to seed scan with * * @param[in] apply_prefix * Whether or not the calling thread should apply its prefix. * (Handy for preventing thread-0 from applying a prefix.) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanInclusive(T* input, T* output, ScanOp scan_op, T prefix, bool apply_prefix = true) { T inclusive = input[0]; if (apply_prefix) { inclusive = scan_op(prefix, inclusive); } output[0] = inclusive; // Continue scan return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } /** * @brief Perform a sequential inclusive prefix scan over the * statically-sized @p input array, seeded with the specified @p prefix. * The aggregate is returned. * * @tparam LENGTH * [inferred] LengthT of @p input and @p output arrays * * @tparam T * [inferred] The data type to be scanned. 
* * @tparam ScanOp * [inferred] Binary scan operator type having member * T operator()(const T &a, const T &b) * * @param[in] input * Input array * * @param[out] output * Output array (may be aliased to @p input) * * @param[in] scan_op * Binary scan operator * * @param[in] prefix * Prefix to seed scan with * * @param[in] apply_prefix * Whether or not the calling thread should apply its prefix. * (Handy for preventing thread-0 from applying a prefix.) */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadScanInclusive(T (&input)[LENGTH], T (&output)[LENGTH], ScanOp scan_op, T prefix, bool apply_prefix = true) { return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); } //@} end member group } // namespace internal CUB_NAMESPACE_END cccl-2.5.0/cub/cub/thread/thread_search.cuh000066400000000000000000000126121463375617100205270ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /** * @file * Thread utilities for sequential search */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN /** * Computes the begin offsets into A and B for the specific diagonal */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE void MergePathSearch( OffsetT diagonal, AIteratorT a, BIteratorT b, OffsetT a_len, OffsetT b_len, CoordinateT& path_coordinate) { /// The value type of the input iterator using T = cub::detail::value_t; OffsetT split_min = CUB_MAX(diagonal - b_len, 0); OffsetT split_max = CUB_MIN(diagonal, a_len); while (split_min < split_max) { OffsetT split_pivot = (split_min + split_max) >> 1; if (a[split_pivot] <= b[diagonal - split_pivot - 1]) { // Move candidate split range up A, down B split_min = split_pivot + 1; } else { // Move candidate split range up B, down A split_max = split_pivot; } } path_coordinate.x = CUB_MIN(split_min, a_len); path_coordinate.y = diagonal - split_min; } /** * @brief Returns the offset of the first value within @p input which does not compare * less than @p val * * @param[in] input * Input sequence * * @param[in] num_items * Input sequence length * * @param[in] val * Search key */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT LowerBound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; while (num_items > 0) { OffsetT half = num_items >> 1; if (input[retval + half] < val) { retval = retval + (half + 1); num_items = num_items - (half + 1); } else { num_items = half; } } return retval; } /** * @brief Returns the offset of the first value within @p input which compares * greater than @p val * * @param[in] input * Input sequence * * @param[in] num_items * Input sequence length * * @param[in] val * Search key */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT UpperBound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; while (num_items > 0) { OffsetT half = num_items >> 1; if (val < input[retval + half]) { num_items = half; } else { retval = retval + (half + 1); num_items = num_items - (half + 1); } } return retval; } #if defined(__CUDA_FP16_TYPES_EXIST__) /** * @param[in] input * Input sequence * * @param[in] num_items * Input sequence length * * @param[in] val * Search key */ template _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT UpperBound(InputIteratorT input, OffsetT num_items, __half val) { OffsetT retval = 0; while (num_items > 0) { OffsetT half = num_items >> 1; bool lt; NV_IF_TARGET(NV_PROVIDES_SM_53, (lt = __hlt(val, input[retval + half]);), (lt = __half2float(val) < __half2float(input[retval + half]);)); if (lt) { num_items = half; } else { retval = retval + (half + 1); num_items = num_items - (half + 1); } } return retval; } #endif // __CUDA_FP16_TYPES_EXIST__ CUB_NAMESPACE_END cccl-2.5.0/cub/cub/thread/thread_sort.cuh000066400000000000000000000073001463375617100202470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN template _CCCL_DEVICE _CCCL_FORCEINLINE void Swap(T& lhs, T& rhs) { T temp = lhs; lhs = rhs; rhs = temp; } /** * @brief Sorts data using odd-even sort method * * The sorting method is stable. Further details can be found in: * A. Nico Habermann. Parallel neighbor sort (or the glory of the induction * principle). Technical Report AD-759 248, Carnegie Mellon University, 1972. * * @tparam KeyT * Key type * * @tparam ValueT * Value type. If `cub::NullType` is used as `ValueT`, only keys are sorted. * * @tparam CompareOp * functor type having member `bool operator()(KeyT lhs, KeyT rhs)` * * @tparam ITEMS_PER_THREAD * The number of items per thread * * @param[in,out] keys * Keys to sort * * @param[in,out] items * Values to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second */ template _CCCL_DEVICE _CCCL_FORCEINLINE void StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THREAD], CompareOp compare_op) { constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD; ++i) { #pragma unroll for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { if (compare_op(keys[j + 1], keys[j])) { Swap(keys[j], keys[j + 1]); if (!KEYS_ONLY) { Swap(items[j], items[j + 1]); } } } // inner loop } // outer loop } CUB_NAMESPACE_END cccl-2.5.0/cub/cub/thread/thread_store.cuh000066400000000000000000000371431463375617100204240ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * Thread utilities for writing memory using PTX cache modifiers. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- /** * @brief Enumeration of cache modifiers for memory store operations. */ enum CacheStoreModifier { STORE_DEFAULT, ///< Default (no modifier) STORE_WB, ///< Cache write-back all coherent levels STORE_CG, ///< Cache at global level STORE_CS, ///< Cache streaming (likely to be accessed once) STORE_WT, ///< Cache write-through (to system memory) STORE_VOLATILE, ///< Volatile shared (any memory space) }; /** * @name Thread I/O (cache modified) * @{ */ /** * @brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. * Can be used to store any data type. 
* * @par Example * @code * #include // or equivalently * * // 32-bit store using cache-global modifier: * int *d_out; * int val; * cub::ThreadStore(d_out + threadIdx.x, val); * * // 16-bit store using default modifier * short *d_out; * short val; * cub::ThreadStore(d_out + threadIdx.x, val); * * // 256-bit store using write-through modifier * double4 *d_out; * double4 val; * cub::ThreadStore(d_out + threadIdx.x, val); * * // 96-bit store using cache-streaming cache modifier * struct TestFoo { bool a; short b; }; * TestFoo *d_struct; * TestFoo val; * cub::ThreadStore(d_out + threadIdx.x, val); * @endcode * * @tparam MODIFIER * [inferred] CacheStoreModifier enumeration * * @tparam InputIteratorT * [inferred] Output iterator type \iterator * * @tparam T * [inferred] Data type of output value */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val); //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Helper structure for templated store iteration (inductive case) template struct IterateThreadStore { template static _CCCL_DEVICE _CCCL_FORCEINLINE void Store(T* ptr, T* vals) { ThreadStore(ptr + COUNT, vals[COUNT]); IterateThreadStore::template Store(ptr, vals); } template static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(OutputIteratorT ptr, T* vals) { ptr[COUNT] = vals[COUNT]; IterateThreadStore::Dereference(ptr, vals); } }; /// Helper structure for templated store iteration (termination case) template struct IterateThreadStore { template static _CCCL_DEVICE _CCCL_FORCEINLINE void Store(T* /*ptr*/, T* /*vals*/) {} template static _CCCL_DEVICE _CCCL_FORCEINLINE void Dereference(OutputIteratorT /*ptr*/, T* /*vals*/) {} }; /** * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier */ # define _CUB_STORE_16(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(uint4 * ptr, uint4 val) \ { \ asm volatile("st." #ptx_modifier ".v4.u32 [%0], {%1, %2, %3, %4};" \ : \ : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)); \ } \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore( \ ulonglong2 * ptr, ulonglong2 val) \ { \ asm volatile("st." #ptx_modifier ".v2.u64 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y)); \ } /** * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier */ # define _CUB_STORE_8(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(ushort4 * ptr, ushort4 val) \ { \ asm volatile("st." #ptx_modifier ".v4.u16 [%0], {%1, %2, %3, %4};" \ : \ : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)); \ } \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(uint2 * ptr, uint2 val) \ { \ asm volatile("st." #ptx_modifier ".v2.u32 [%0], {%1, %2};" : : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y)); \ } \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore( \ unsigned long long* ptr, unsigned long long val) \ { \ asm volatile("st." #ptx_modifier ".u64 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "l"(val)); \ } /** * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier */ # define _CUB_STORE_4(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore( \ unsigned int* ptr, unsigned int val) \ { \ asm volatile("st." 
#ptx_modifier ".u32 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "r"(val)); \ } /** * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier */ # define _CUB_STORE_2(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore( \ unsigned short* ptr, unsigned short val) \ { \ asm volatile("st." #ptx_modifier ".u16 [%0], %1;" : : _CUB_ASM_PTR_(ptr), "h"(val)); \ } /** * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier */ # define _CUB_STORE_1(cub_modifier, ptx_modifier) \ template <> \ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore( \ unsigned char* ptr, unsigned char val) \ { \ asm volatile( \ "{" \ " .reg .u8 datum;" \ " cvt.u8.u16 datum, %1;" \ " st." #ptx_modifier ".u8 [%0], datum;" \ "}" \ : \ : _CUB_ASM_PTR_(ptr), "h"((unsigned short) val)); \ } /** * Define powers-of-two ThreadStore specializations for the given Cache load modifier */ # define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ _CUB_STORE_16(cub_modifier, ptx_modifier) \ _CUB_STORE_8(cub_modifier, ptx_modifier) \ _CUB_STORE_4(cub_modifier, ptx_modifier) \ _CUB_STORE_2(cub_modifier, ptx_modifier) \ _CUB_STORE_1(cub_modifier, ptx_modifier) /** * Define ThreadStore specializations for the various Cache load modifiers */ _CUB_STORE_ALL(STORE_WB, wb) _CUB_STORE_ALL(STORE_CG, cg) _CUB_STORE_ALL(STORE_CS, cs) _CUB_STORE_ALL(STORE_WT, wt) // Macro cleanup # undef _CUB_STORE_ALL # undef _CUB_STORE_1 # undef _CUB_STORE_2 # undef _CUB_STORE_4 # undef _CUB_STORE_8 # undef _CUB_STORE_16 /** * ThreadStore definition for STORE_DEFAULT modifier on iterator types */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { *itr = val; } /** * ThreadStore definition for STORE_DEFAULT modifier on pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(T* ptr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { *ptr = val; } /** * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStoreVolatilePtr(T* ptr, T val, Int2Type /*is_primitive*/) { *reinterpret_cast(ptr) = val; } /** * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStoreVolatilePtr(T* ptr, T val, Int2Type /*is_primitive*/) { // Create a temporary using shuffle-words, then store using volatile-words typedef typename UnitWord::VolatileWord VolatileWord; typedef typename UnitWord::ShuffleWord ShuffleWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); constexpr int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); VolatileWord words[VOLATILE_MULTIPLE]; # pragma unroll for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) { reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; } IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(reinterpret_cast(ptr), words); } /** * ThreadStore definition for STORE_VOLATILE modifier on pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(T* ptr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); } /** * ThreadStore definition for generic modifiers on pointer types */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(T* ptr, T val, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { // Create a temporary using shuffle-words, then store using device-words typedef typename 
UnitWord::DeviceWord DeviceWord; typedef typename UnitWord::ShuffleWord ShuffleWord; constexpr int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); constexpr int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); DeviceWord words[DEVICE_MULTIPLE]; # pragma unroll for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) { reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; } IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( reinterpret_cast(ptr), words); } /** * ThreadStore definition for generic modifiers */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadStore(OutputIteratorT itr, T val) { ThreadStore(itr, val, Int2Type(), Int2Type::value>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_allocator.cuh000066400000000000000000000725611463375617100175120ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple caching allocator for device memory allocations. The allocator is * thread-safe and capable of managing device allocations on multiple devices. ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * CachingDeviceAllocator (host use) ******************************************************************************/ /** * @brief A simple caching allocator for device memory allocations. 
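 *
 * @par
 * A minimal usage sketch (illustrative only; kernel launches are elided, and
 * the size and stream shown are arbitrary):
 * @code
 * #include <cub/util_allocator.cuh>
 *
 * cub::CachingDeviceAllocator allocator;   // default bin configuration
 *
 * // Allocate at least 1 MB on the current device, associated with `stream`
 * void *d_ptr = nullptr;
 * cudaStream_t stream = 0;
 * CubDebugExit(allocator.DeviceAllocate(&d_ptr, 1024 * 1024, stream));
 *
 * // ... launch kernels that use d_ptr on `stream` ...
 *
 * // Return the block to the allocator's cache so it can be reused
 * CubDebugExit(allocator.DeviceFree(d_ptr));
 *
 * // Optionally release all cached blocks back to the CUDA runtime
 * CubDebugExit(allocator.FreeAllCached());
 * @endcode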
* * @par Overview * The allocator is thread-safe and stream-safe and is capable of managing cached * device allocations on multiple devices. It behaves as follows: * * @par * - Allocations from the allocator are associated with an @p active_stream. Once freed, * the allocation becomes available immediately for reuse within the @p active_stream * with which it was associated with during allocation, and it becomes available for * reuse within other streams when all prior work submitted to @p active_stream has completed. * - Allocations are categorized and cached by bin size. A new allocation request of * a given size will only consider cached allocations within the corresponding bin. * - Bin limits progress geometrically in accordance with the growth factor * @p bin_growth provided during construction. Unused device allocations within * a larger bin cache are not reused for allocation requests that categorize to * smaller bin sizes. * - Allocation requests below ( @p bin_growth ^ @p min_bin ) are rounded up to * ( @p bin_growth ^ @p min_bin ). * - Allocations above ( @p bin_growth ^ @p max_bin ) are not rounded up to the nearest * bin and are simply freed when they are deallocated instead of being returned * to a bin-cache. * - If the total storage of cached allocations on a given device will exceed * @p max_cached_bytes, allocations for that device are simply freed when they are * deallocated instead of being returned to their bin-cache. * * @par * For example, the default-constructed CachingDeviceAllocator is configured with: * - @p bin_growth = 8 * - @p min_bin = 3 * - @p max_bin = 7 * - @p max_cached_bytes = 6MB - 1B * * @par * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB * and sets a maximum of 6,291,455 cached bytes per device * */ struct CachingDeviceAllocator { //--------------------------------------------------------------------- // Constants //--------------------------------------------------------------------- /// Out-of-bounds bin static constexpr unsigned int INVALID_BIN = (unsigned int) -1; /// Invalid size static constexpr size_t INVALID_SIZE = (size_t) -1; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Invalid device ordinal static constexpr int INVALID_DEVICE_ORDINAL = -1; //--------------------------------------------------------------------- // Type definitions and helper types //--------------------------------------------------------------------- /** * Descriptor for device memory allocations */ struct BlockDescriptor { // Device pointer void* d_ptr; // Size of allocation in bytes size_t bytes; // Bin enumeration unsigned int bin; // device ordinal int device; // Associated associated_stream cudaStream_t associated_stream; // Signal when associated stream has run to the point at which this block was freed cudaEvent_t ready_event; // Constructor (suitable for searching maps for a specific block, given its pointer and // device) BlockDescriptor(void* d_ptr, int device) : d_ptr(d_ptr) , bytes(0) , bin(INVALID_BIN) , device(device) , associated_stream(0) , ready_event(0) {} // Constructor (suitable for searching maps for a range of suitable blocks, given a device) BlockDescriptor(int device) : d_ptr(NULL) , bytes(0) , bin(INVALID_BIN) , device(device) , associated_stream(0) , ready_event(0) {} // Comparison functor for comparing device pointers static bool PtrCompare(const BlockDescriptor& a, const BlockDescriptor& b) { if (a.device == b.device) { return (a.d_ptr < b.d_ptr); } else { return (a.device < b.device); } } // Comparison functor for 
comparing allocation sizes static bool SizeCompare(const BlockDescriptor& a, const BlockDescriptor& b) { if (a.device == b.device) { return (a.bytes < b.bytes); } else { return (a.device < b.device); } } }; /// BlockDescriptor comparator function interface typedef bool (*Compare)(const BlockDescriptor&, const BlockDescriptor&); class TotalBytes { public: size_t free; size_t live; TotalBytes() { free = live = 0; } }; /// Set type for cached blocks (ordered by size) typedef std::multiset CachedBlocks; /// Set type for live blocks (ordered by ptr) typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device typedef std::map GpuCachedBytes; //--------------------------------------------------------------------- // Utility functions //--------------------------------------------------------------------- /** * Integer pow function for unsigned base and exponent */ static unsigned int IntPow(unsigned int base, unsigned int exp) { unsigned int retval = 1; while (exp > 0) { if (exp & 1) { retval = retval * base; // multiply the result by the current base } base = base * base; // square the base exp = exp >> 1; // divide the exponent in half } return retval; } /** * Round up to the nearest power-of */ void NearestPowerOf(unsigned int& power, size_t& rounded_bytes, unsigned int base, size_t value) { power = 0; rounded_bytes = 1; if (value * base < value) { // Overflow power = sizeof(size_t) * 8; rounded_bytes = size_t(0) - 1; return; } while (rounded_bytes < value) { rounded_bytes *= base; power++; } } //--------------------------------------------------------------------- // Fields //--------------------------------------------------------------------- /// Mutex for thread-safety std::mutex mutex; /// Geometric growth factor for bin-sizes unsigned int bin_growth; /// Minimum bin enumeration unsigned int min_bin; /// Maximum bin enumeration unsigned int max_bin; /// Minimum bin size size_t min_bin_bytes; /// Maximum bin size size_t max_bin_bytes; /// Maximum aggregate cached bytes per device size_t max_cached_bytes; /// Whether or not to skip a call to FreeAllCached() when destructor is called. /// (The CUDA runtime may have already shut down for statically declared allocators) const bool skip_cleanup; /// Whether or not to print (de)allocation events to stdout bool debug; /// Map of device ordinal to aggregate cached bytes on that device GpuCachedBytes cached_bytes; /// Set of cached device allocations available for reuse CachedBlocks cached_blocks; /// Set of live device allocations currently in use BusyBlocks live_blocks; #endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------- // Methods //--------------------------------------------------------------------- /** * @brief Constructor. 
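 *
 * @par
 * For illustration only (the values shown are arbitrary): constructing
 * @code
 * cub::CachingDeviceAllocator allocator(4, 2, 10, 1 << 22);
 * @endcode
 * configures a growth factor of 4 with bin sizes ranging from 4^2 = 16 bytes
 * up to 4^10 = 1 MB, caching at most 4 MB (1 << 22 bytes) per device.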
* * @param bin_growth * Geometric growth factor for bin-sizes * * @param min_bin * Minimum bin (default is bin_growth ^ 1) * * @param max_bin * Maximum bin (default is no max bin) * * @param max_cached_bytes * Maximum aggregate cached bytes per device (default is no limit) * * @param skip_cleanup * Whether or not to skip a call to @p FreeAllCached() when the destructor is called (default * is to deallocate) * * @param debug * Whether or not to print (de)allocation events to stdout (default is no stderr output) */ CachingDeviceAllocator( unsigned int bin_growth, unsigned int min_bin = 1, unsigned int max_bin = INVALID_BIN, size_t max_cached_bytes = INVALID_SIZE, bool skip_cleanup = false) : bin_growth(bin_growth) , min_bin(min_bin) , max_bin(max_bin) , min_bin_bytes(IntPow(bin_growth, min_bin)) , max_bin_bytes(IntPow(bin_growth, max_bin)) , max_cached_bytes(max_cached_bytes) , skip_cleanup(skip_cleanup) , debug(false) , cached_blocks(BlockDescriptor::SizeCompare) , live_blocks(BlockDescriptor::PtrCompare) {} /** * @brief Constructor. * * @param bin_growth * Geometric growth factor for bin-sizes * * @param min_bin * Minimum bin (default is bin_growth ^ 1) * * @param max_bin * Maximum bin (default is no max bin) * * @param max_cached_bytes * Maximum aggregate cached bytes per device (default is no limit) * * @param skip_cleanup * Whether or not to skip a call to @p FreeAllCached() when the destructor is called (default * is to deallocate) * * @param debug * Whether or not to print (de)allocation events to stdout (default is no stderr output) */ CUB_DEPRECATED_BECAUSE("CUB no longer accepts `debug` parameter. " "Define CUB_DEBUG_LOG instead, or silence this message with " "CUB_IGNORE_DEPRECATED_API.") CachingDeviceAllocator( unsigned int bin_growth, unsigned int min_bin, unsigned int max_bin, size_t max_cached_bytes, bool skip_cleanup, bool /* debug */) : CachingDeviceAllocator(bin_growth, min_bin, max_bin, max_cached_bytes, skip_cleanup) {} /** * @brief Default constructor. * * Configured with: * @par * - @p bin_growth = 8 * - @p min_bin = 3 * - @p max_bin = 7 * - @p max_cached_bytes = ( @p bin_growth ^ @p max_bin) * 3 ) - 1 = 6,291,455 bytes * * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and * sets a maximum of 6,291,455 cached bytes per device */ CachingDeviceAllocator(bool skip_cleanup = false, bool debug = false) : bin_growth(8) , min_bin(3) , max_bin(7) , min_bin_bytes(IntPow(bin_growth, min_bin)) , max_bin_bytes(IntPow(bin_growth, max_bin)) , max_cached_bytes((max_bin_bytes * 3) - 1) , skip_cleanup(skip_cleanup) , debug(debug) , cached_blocks(BlockDescriptor::SizeCompare) , live_blocks(BlockDescriptor::PtrCompare) {} /** * @brief Sets the limit on the number bytes this allocator is allowed to cache per device. * * Changing the ceiling of cached bytes does not cause any allocations (in-use or * cached-in-reserve) to be freed. See \p FreeAllCached(). */ cudaError_t SetMaxCachedBytes(size_t max_cached_bytes_) { // Lock mutex.lock(); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog( "Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes_); #endif this->max_cached_bytes = max_cached_bytes_; // Unlock mutex.unlock(); return cudaSuccess; } /** * @brief Provides a suitable allocation of device memory for the given size on the specified * device. 
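 *
 * @par
 * Illustrative sketch (assumes @p allocator is an existing
 * CachingDeviceAllocator; the device ordinal, size, and stream are arbitrary):
 * @code
 * void *d_ptr = nullptr;
 * cudaStream_t stream = 0;
 * // Allocate at least 4 KB on device 1, associated with `stream`
 * CubDebugExit(allocator.DeviceAllocate(1, &d_ptr, 4096, stream));
 * // ... use d_ptr in kernels launched on device 1 within `stream` ...
 * CubDebugExit(allocator.DeviceFree(1, d_ptr));
 * @endcode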
* * Once freed, the allocation becomes available immediately for reuse within the @p * active_stream with which it was associated with during allocation, and it becomes available * for reuse within other streams when all prior work submitted to @p active_stream has * completed. * * @param[in] device * Device on which to place the allocation * * @param[out] d_ptr * Reference to pointer to the allocation * * @param[in] bytes * Minimum number of bytes for the allocation * * @param[in] active_stream * The stream to be associated with this allocation */ cudaError_t DeviceAllocate(int device, void** d_ptr, size_t bytes, cudaStream_t active_stream = 0) { *d_ptr = NULL; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; if (device == INVALID_DEVICE_ORDINAL) { error = CubDebug(cudaGetDevice(&entrypoint_device)); if (cudaSuccess != error) { return error; } device = entrypoint_device; } // Create a block descriptor for the requested allocation bool found = false; BlockDescriptor search_key(device); search_key.associated_stream = active_stream; NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); if (search_key.bin > max_bin) { // Bin is greater than our maximum bin: allocate the request // exactly and give out-of-bounds bin. It will not be cached // for reuse when returned. search_key.bin = INVALID_BIN; search_key.bytes = bytes; } else { // Search for a suitable cached allocation: lock mutex.lock(); if (search_key.bin < min_bin) { // Bin is less than minimum bin: round up search_key.bin = min_bin; search_key.bytes = min_bin_bytes; } // Iterate through the range of cached blocks on the same device in the same bin CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == device) && (block_itr->bin == search_key.bin)) { // To prevent races with reusing blocks returned by the host but still // in use by the device, only consider cached blocks that are // either (from the active stream) or (from an idle stream) bool is_reusable = false; if (active_stream == block_itr->associated_stream) { is_reusable = true; } else { const cudaError_t event_status = cudaEventQuery(block_itr->ready_event); if (event_status != cudaErrorNotReady) { CubDebug(event_status); is_reusable = true; } } if (is_reusable) { // Reuse existing cache block. Insert into live blocks. 
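          // The cached block satisfies this request: adopt its descriptor,
          // re-associate it with the caller's stream, move its byte count from
          // the device's "free" tally to its "live" tally, and remove it from
          // the pool of reusable blocks.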
found = true; search_key = *block_itr; search_key.associated_stream = active_stream; live_blocks.insert(search_key); // Remove from free blocks cached_bytes[device].free -= search_key.bytes; cached_bytes[device].live += search_key.bytes; #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with " "stream %lld).\n", device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); #endif cached_blocks.erase(block_itr); break; } block_itr++; } // Done searching: unlock mutex.unlock(); } // Allocate the block if necessary if (!found) { // Set runtime's current device to specified device (entrypoint may not be set) if (device != entrypoint_device) { error = CubDebug(cudaGetDevice(&entrypoint_device)); if (cudaSuccess != error) { return error; } error = CubDebug(cudaSetDevice(device)); if (cudaSuccess != error) { return error; } } // Attempt to allocate error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes)); if (error == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", device, (long long) search_key.bytes, (long long) search_key.associated_stream); #endif error = cudaSuccess; // Reset the error we will return cudaGetLastError(); // Reset CUDART's error // Lock mutex.lock(); // Iterate the range of free blocks on the same device BlockDescriptor free_key(device); CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) { // No need to worry about synchronization with the device: cudaFree is // blocking and will synchronize across all kernels executing // on the current device // Free device memory and destroy stream event. 
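          // Each cached block belonging to this device is released back to the
          // CUDA runtime and its bookkeeping erased; once the cache has been
          // drained, the allocation is retried below.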
error = CubDebug(cudaFree(block_itr->d_ptr)); if (cudaSuccess != error) { break; } error = CubDebug(cudaEventDestroy(block_itr->ready_event)); if (cudaSuccess != error) { break; } // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " "(%lld bytes) outstanding.\n", device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); #endif block_itr = cached_blocks.erase(block_itr); } // Unlock mutex.unlock(); // Return under error if (error) { return error; } // Try to allocate again error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes)); if (cudaSuccess != error) { return error; } } // Create ready event error = CubDebug(cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); if (cudaSuccess != error) { return error; } // Insert into live blocks mutex.lock(); live_blocks.insert(search_key); cached_bytes[device].live += search_key.bytes; mutex.unlock(); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); #endif // Attempt to revert back to previous device if necessary if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { error = CubDebug(cudaSetDevice(entrypoint_device)); if (cudaSuccess != error) { return error; } } } // Copy device pointer to output parameter *d_ptr = search_key.d_ptr; #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG if (debug) { _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); } #endif return error; } /** * @brief Provides a suitable allocation of device memory for the given size on the current * device. * * Once freed, the allocation becomes available immediately for reuse within the @p * active_stream with which it was associated with during allocation, and it becomes available * for reuse within other streams when all prior work submitted to @p active_stream has * completed. * * @param[out] d_ptr * Reference to pointer to the allocation * * @param[in] bytes * Minimum number of bytes for the allocation * * @param[in] active_stream * The stream to be associated with this allocation */ cudaError_t DeviceAllocate(void** d_ptr, size_t bytes, cudaStream_t active_stream = 0) { return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); } /** * @brief Frees a live allocation of device memory on the specified device, returning it to the * allocator. * * Once freed, the allocation becomes available immediately for reuse within the * @p active_stream with which it was associated with during allocation, and it becomes * available for reuse within other streams when all prior work submitted to @p active_stream * has completed. 
*/ cudaError_t DeviceFree(int device, void* d_ptr) { int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; if (device == INVALID_DEVICE_ORDINAL) { error = CubDebug(cudaGetDevice(&entrypoint_device)); if (cudaSuccess != error) { return error; } device = entrypoint_device; } // Lock mutex.lock(); // Find corresponding block descriptor bool recached = false; BlockDescriptor search_key(d_ptr, device); BusyBlocks::iterator block_itr = live_blocks.find(search_key); if (block_itr != live_blocks.end()) { // Remove from live blocks search_key = *block_itr; live_blocks.erase(block_itr); cached_bytes[device].live -= search_key.bytes; // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { // Insert returned allocation into free blocks recached = true; cached_blocks.insert(search_key); cached_bytes[device].free += search_key.bytes; #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks outstanding. (%lld bytes)\n", device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); #endif } } // Unlock mutex.unlock(); // First set to specified device (entrypoint may not be set) if (device != entrypoint_device) { error = CubDebug(cudaGetDevice(&entrypoint_device)); if (cudaSuccess != error) { return error; } error = CubDebug(cudaSetDevice(device)); if (cudaSuccess != error) { return error; } } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) error = CubDebug(cudaEventRecord(search_key.ready_event, search_key.associated_stream)); if (cudaSuccess != error) { return error; } } if (!recached) { // Free the allocation from the runtime and cleanup the event. error = CubDebug(cudaFree(d_ptr)); if (cudaSuccess != error) { return error; } error = CubDebug(cudaEventDestroy(search_key.ready_event)); if (cudaSuccess != error) { return error; } #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld " "bytes), %lld live blocks (%lld bytes) outstanding.\n", device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); #endif } // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { error = CubDebug(cudaSetDevice(entrypoint_device)); if (cudaSuccess != error) { return error; } } return error; } /** * @brief Frees a live allocation of device memory on the current device, returning it to the * allocator. * * Once freed, the allocation becomes available immediately for reuse within the @p * active_stream with which it was associated with during allocation, and it becomes available * for reuse within other streams when all prior work submitted to @p active_stream has * completed. 
*/ cudaError_t DeviceFree(void* d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); } /** * @brief Frees all cached device allocations on all devices */ cudaError_t FreeAllCached() { cudaError_t error = cudaSuccess; int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; mutex.lock(); while (!cached_blocks.empty()) { // Get first block CachedBlocks::iterator begin = cached_blocks.begin(); // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { error = CubDebug(cudaGetDevice(&entrypoint_device)); if (cudaSuccess != error) { break; } } // Set current device ordinal if necessary if (begin->device != current_device) { error = CubDebug(cudaSetDevice(begin->device)); if (cudaSuccess != error) { break; } current_device = begin->device; } // Free device memory error = CubDebug(cudaFree(begin->d_ptr)); if (cudaSuccess != error) { break; } error = CubDebug(cudaEventDestroy(begin->ready_event)); if (cudaSuccess != error) { break; } // Reduce balance and erase entry const size_t block_bytes = begin->bytes; cached_bytes[current_device].free -= block_bytes; cached_blocks.erase(begin); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " "bytes) outstanding.\n", current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); #endif } mutex.unlock(); // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { error = CubDebug(cudaSetDevice(entrypoint_device)); if (cudaSuccess != error) { return error; } } return error; } /** * @brief Destructor */ virtual ~CachingDeviceAllocator() { if (!skip_cleanup) { FreeAllCached(); } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_arch.cuh000066400000000000000000000141571463375617100164440ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Static architectural properties by SM version. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include // Legacy include; this functionality used to be defined in here. #include CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document // \deprecated [Since 2.1.0] # define CUB_USE_COOPERATIVE_GROUPS /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. # ifndef CUB_PTX_ARCH # if defined(_NVHPC_CUDA) // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined // when compiling both host code and device code. Currently, only one // PTX version can be targeted. # define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__ # elif !defined(__CUDA_ARCH__) # define CUB_PTX_ARCH 0 # else # define CUB_PTX_ARCH __CUDA_ARCH__ # endif # endif // These definitions were intended for internal use only and are now obsolete. // If you relied on them, consider porting your code to use the functionality // in libcu++'s header. // For a temporary workaround, define CUB_PROVIDE_LEGACY_ARCH_MACROS to make // them available again. These should be considered deprecated and will be // fully removed in a future version. # ifdef CUB_PROVIDE_LEGACY_ARCH_MACROS # ifndef CUB_IS_DEVICE_CODE # if defined(_NVHPC_CUDA) # define CUB_IS_DEVICE_CODE __builtin_is_device_code() # define CUB_IS_HOST_CODE (!__builtin_is_device_code()) # define CUB_INCLUDE_DEVICE_CODE 1 # define CUB_INCLUDE_HOST_CODE 1 # elif CUB_PTX_ARCH > 0 # define CUB_IS_DEVICE_CODE 1 # define CUB_IS_HOST_CODE 0 # define CUB_INCLUDE_DEVICE_CODE 1 # define CUB_INCLUDE_HOST_CODE 0 # else # define CUB_IS_DEVICE_CODE 0 # define CUB_IS_HOST_CODE 1 # define CUB_INCLUDE_DEVICE_CODE 0 # define CUB_INCLUDE_HOST_CODE 1 # endif # endif # endif // CUB_PROVIDE_LEGACY_ARCH_MACROS /// Maximum number of devices supported. 
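/// Because the definition below is guarded by #ifndef, the limit can be raised
/// by defining CUB_MAX_DEVICES before including any CUB header (for example,
/// by passing -DCUB_MAX_DEVICES=256 to the compiler).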
# ifndef CUB_MAX_DEVICES # define CUB_MAX_DEVICES (128) # endif static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); /// Number of threads per warp # ifndef CUB_LOG_WARP_THREADS # define CUB_LOG_WARP_THREADS(unused) (5) # define CUB_WARP_THREADS(unused) (1 << CUB_LOG_WARP_THREADS(0)) # define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(0) # define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(0) # endif /// Number of smem banks # ifndef CUB_LOG_SMEM_BANKS # define CUB_LOG_SMEM_BANKS(unused) (5) # define CUB_SMEM_BANKS(unused) (1 << CUB_LOG_SMEM_BANKS(0)) # define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(0) # define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS # endif /// Oversubscription factor # ifndef CUB_SUBSCRIPTION_FACTOR # define CUB_SUBSCRIPTION_FACTOR(unused) (5) # define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(0) # endif /// Prefer padding overhead vs X-way conflicts greater than this threshold # ifndef CUB_PREFER_CONFLICT_OVER_PADDING # define CUB_PREFER_CONFLICT_OVER_PADDING(unused) (1) # define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0) # endif template struct RegBoundScaling { enum { ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; template struct MemBoundScaling { enum { ITEMS_PER_THREAD = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), }; }; #endif // Do not document CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_compiler.cuh000066400000000000000000000073701463375617100173400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Detect compiler information. 
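 *
 * The macros defined here can be used to branch on the detected toolchain,
 * for example (illustrative only):
 * \code
 * #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
 * // MSVC-specific workaround
 * #endif
 * \endcode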
*/ #pragma once // For _CCCL_IMPLICIT_SYSTEM_HEADER #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header // enumerate host compilers we know about #define CUB_HOST_COMPILER_UNKNOWN 0 #define CUB_HOST_COMPILER_MSVC 1 #define CUB_HOST_COMPILER_GCC 2 #define CUB_HOST_COMPILER_CLANG 3 // enumerate device compilers we know about #define CUB_DEVICE_COMPILER_UNKNOWN 0 #define CUB_DEVICE_COMPILER_MSVC 1 #define CUB_DEVICE_COMPILER_GCC 2 #define CUB_DEVICE_COMPILER_NVCC 3 #define CUB_DEVICE_COMPILER_CLANG 4 // figure out which host compiler we're using #if defined(_CCCL_COMPILER_MSVC) # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC # define CUB_MSVC_VERSION _MSC_VER # define CUB_MSVC_VERSION_FULL _MSC_FULL_VER #elif defined(_CCCL_COMPILER_CLANG) # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG # define CUB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) #elif defined(_CCCL_COMPILER_GCC) # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC # define CUB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #endif // figure out which device compiler we're using #if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_CUDA_COMPILER_NVHPC) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC #elif defined(_CCCL_COMPILER_MSVC) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC #elif defined(_CCCL_COMPILER_GCC) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC #elif defined(_CCCL_COMPILER_CLANG) // CUDA-capable clang should behave similar to NVCC. # if defined(_CCCL_CUDA_COMPILER_NVCC) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC # else # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG # endif #else # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN #endif cccl-2.5.0/cub/cub/util_cpp_dialect.cuh000066400000000000000000000132031463375617100177650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file Detect the version of the C++ standard used by the compiler. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document // Deprecation warnings may be silenced by defining the following macros. These // may be combined. // - CUB_IGNORE_DEPRECATED_CPP_DIALECT: // Ignore all deprecated C++ dialects and outdated compilers. // - CUB_IGNORE_DEPRECATED_CPP_11: // Ignore deprecation warnings when compiling with C++11. C++03 and outdated // compilers will still issue warnings. // - CUB_IGNORE_DEPRECATED_COMPILER // Ignore deprecation warnings when using deprecated compilers. Compiling // with C++03 and C++11 will still issue warnings. // Check for the thrust opt-outs as well: # if !defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT) && defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) # define CUB_IGNORE_DEPRECATED_CPP_DIALECT # endif # if !defined(CUB_IGNORE_DEPRECATED_CPP_11) && defined(THRUST_IGNORE_DEPRECATED_CPP_11) # define CUB_IGNORE_DEPRECATED_CPP_11 # endif # if !defined(CUB_IGNORE_DEPRECATED_COMPILER) && defined(THRUST_IGNORE_DEPRECATED_COMPILER) # define CUB_IGNORE_DEPRECATED_COMPILER # endif # ifdef CUB_IGNORE_DEPRECATED_CPP_DIALECT # define CUB_IGNORE_DEPRECATED_CPP_11 # ifndef CUB_IGNORE_DEPRECATED_COMPILER # define CUB_IGNORE_DEPRECATED_COMPILER # endif # endif # define CUB_CPP_DIALECT _CCCL_STD_VER // Define CUB_COMPILER_DEPRECATION macro: # if defined(_CCCL_COMPILER_MSVC) # define CUB_COMP_DEPR_IMPL(msg) __pragma(message(__FILE__ ":" CUB_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg)) # define CUB_COMP_DEPR_IMPL0(x) CUB_COMP_DEPR_IMPL1(x) # define CUB_COMP_DEPR_IMPL1(x) #x # else // clang / gcc: # define CUB_COMP_DEPR_IMPL(msg) CUB_COMP_DEPR_IMPL0(GCC warning #msg) # define CUB_COMP_DEPR_IMPL0(expr) _Pragma(#expr) # define CUB_COMP_DEPR_IMPL1 /* intentionally blank */ # endif # define CUB_COMPILER_DEPRECATION(REQ) \ CUB_COMP_DEPR_IMPL(CUB requires at least REQ.Define CUB_IGNORE_DEPRECATED_COMPILER to suppress this message.) # define CUB_COMPILER_DEPRECATION_SOFT(REQ, CUR) \ CUB_COMP_DEPR_IMPL( \ CUB requires at least REQ.CUR is deprecated but still supported.CUR support will be removed in a future \ release.Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) # ifndef CUB_IGNORE_DEPRECATED_COMPILER // Compiler checks: # if defined(_CCCL_COMPILER_GCC) && CUB_GCC_VERSION < 50000 CUB_COMPILER_DEPRECATION(GCC 5.0); # elif defined(_CCCL_COMPILER_CLANG) && CUB_CLANG_VERSION < 70000 CUB_COMPILER_DEPRECATION(Clang 7.0); # elif defined(_CCCL_COMPILER_MSVC) && CUB_MSVC_VERSION < 1910 // <2017. 
Hard upgrade message: CUB_COMPILER_DEPRECATION(MSVC 2019(19.20 / 16.0 / 14.20)); # elif defined(_CCCL_COMPILER_MSVC) && CUB_MSVC_VERSION < 1920 // >=2017, <2019. Soft deprecation message: CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019(19.20 / 16.0 / 14.20), MSVC 2017); # endif # endif // CUB_IGNORE_DEPRECATED_COMPILER # ifndef CUB_IGNORE_DEPRECATED_DIALECT // Dialect checks: # if _CCCL_STD_VER < 2011 // #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: /** * @def CUB_DEBUG_LOG * * Causes kernel launch configurations to be printed to the console */ # define CUB_DEBUG_LOG /** * @def CUB_DEBUG_SYNC * * Causes synchronization of the stream after every kernel launch to check * for errors. Also causes kernel launch configurations to be printed to the * console. */ # define CUB_DEBUG_SYNC /** * @def CUB_DEBUG_HOST_ASSERTIONS * * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition * assertions. */ # define CUB_DEBUG_HOST_ASSERTIONS /** * @def CUB_DEBUG_DEVICE_ASSERTIONS * * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side * precondition assertions. */ # define CUB_DEBUG_DEVICE_ASSERTIONS /** * @def CUB_DEBUG_ALL * * Causes host and device-side precondition assertions to be checked. Apart * from that, causes synchronization of the stream after every kernel launch to * check for errors. Also causes kernel launch configurations to be printed to * the console. */ # define CUB_DEBUG_ALL #endif // DOXYGEN_SHOULD_SKIP_THIS // `CUB_DETAIL_DEBUG_LEVEL_*`: Implementation details, internal use only: #define CUB_DETAIL_DEBUG_LEVEL_NONE 0 #define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY 1 #define CUB_DETAIL_DEBUG_LEVEL_LOG 2 #define CUB_DETAIL_DEBUG_LEVEL_SYNC 3 #define CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS 4 #define CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS 5 #define CUB_DETAIL_DEBUG_LEVEL_ALL 1000 // `CUB_DEBUG_*`: User interfaces: // Extra logging, no syncs #ifdef CUB_DEBUG_LOG # define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_LOG #endif // Logging + syncs #ifdef CUB_DEBUG_SYNC # define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_SYNC #endif // Logging + syncs + host assertions #ifdef CUB_DEBUG_HOST_ASSERTIONS # define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS #endif // Logging + syncs + host assertions + device assertions #ifdef CUB_DEBUG_DEVICE_ASSERTIONS # define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS #endif // All #ifdef CUB_DEBUG_ALL # define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL #endif // Default case, no extra debugging: #ifndef CUB_DETAIL_DEBUG_LEVEL # ifdef NDEBUG # define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_NONE # else # define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY # endif #endif /* * `CUB_DETAIL_DEBUG_ENABLE_*`: * Internal implementation details, used for testing enabled debug features: */ #if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_LOG # define CUB_DETAIL_DEBUG_ENABLE_LOG #endif #if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_SYNC # define CUB_DETAIL_DEBUG_ENABLE_SYNC #endif #if (CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS) \ || (CUB_DETAIL_DEBUG_LEVEL == CUB_DETAIL_DEBUG_LEVEL_HOST_ASSERTIONS_ONLY) # define CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS 
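// Note that the CUB_DETAIL_DEBUG_ENABLE_* macros compose: each one is gated on
// a minimum debug level, so building with (for example) -DCUB_DEBUG_SYNC
// defines both CUB_DETAIL_DEBUG_ENABLE_LOG and CUB_DETAIL_DEBUG_ENABLE_SYNC.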
#endif #if CUB_DETAIL_DEBUG_LEVEL >= CUB_DETAIL_DEBUG_LEVEL_DEVICE_ASSERTIONS # define CUB_DETAIL_DEBUG_ENABLE_DEVICE_ASSERTIONS #endif /// CUB error reporting macro (prints error messages to stderr) #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) # define CUB_STDERR #endif #if defined(CUB_STDERR) || defined(CUB_DETAIL_DEBUG_ENABLE_LOG) # include #endif CUB_NAMESPACE_BEGIN /** * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the * corresponding error message is printed to \p stderr (or \p stdout in device * code) along with the supplied source context. * * \return The CUDA error. */ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t Debug(cudaError_t error, const char* filename, int line) { // Clear the global CUDA error state which may have been set by the last // call. Otherwise, errors may "leak" to unrelated kernel launches. // clang-format off #ifndef CUB_RDC_ENABLED #define CUB_TEMP_DEVICE_CODE #else #define CUB_TEMP_DEVICE_CODE last_error = cudaGetLastError() #endif cudaError_t last_error = cudaSuccess; NV_IF_TARGET( NV_IS_HOST, (last_error = cudaGetLastError();), (CUB_TEMP_DEVICE_CODE;) ); #undef CUB_TEMP_DEVICE_CODE // clang-format on if (error == cudaSuccess && last_error != cudaSuccess) { error = last_error; } #ifdef CUB_STDERR if (error) { NV_IF_TARGET( NV_IS_HOST, (fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); fflush(stderr);), (printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);)); } #else (void) filename; (void) line; #endif return error; } /** * \brief Debug macro */ #ifndef CubDebug # define CubDebug(e) CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__) #endif /** * \brief Debug macro with exit */ #ifndef CubDebugExit # define CubDebugExit(e) \ if (CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)) \ { \ exit(1); \ } #endif /** * \brief Log macro for printf statements. */ #if !defined(_CubLog) # if defined(_NVHPC_CUDA) || !(defined(__clang__) && defined(__CUDA__)) // NVCC / NVC++ # define _CubLog(format, ...) \ do \ { \ NV_IF_TARGET( \ NV_IS_HOST, \ (printf(format, __VA_ARGS__);), \ (printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ blockIdx.z, \ blockIdx.y, \ blockIdx.x, \ threadIdx.z, \ threadIdx.y, \ threadIdx.x, \ __VA_ARGS__);)); \ } while (false) # else // Clang: // XXX shameless hack for clang around variadic printf... // Compilies w/o supplying -std=c++11 but shows warning, // so we silence them :) # pragma clang diagnostic ignored "-Wc++11-extensions" # pragma clang diagnostic ignored "-Wunnamed-type-template-args" # ifdef CUB_STDERR template inline _CCCL_HOST_DEVICE void va_printf(char const* format, Args const&... args) { # ifdef __CUDA_ARCH__ printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); # else printf(format, args...); # endif } # else // !defined(CUB_STDERR) template inline _CCCL_HOST_DEVICE void va_printf(char const*, Args const&...) {} # endif // !defined(CUB_STDERR) # ifndef __CUDA_ARCH__ # define _CubLog(format, ...) CUB_NS_QUALIFIER::va_printf(format, __VA_ARGS__); # else # define _CubLog(format, ...) 
\ CUB_NS_QUALIFIER::va_printf("[block (%d,%d,%d), thread " \ "(%d,%d,%d)]: " format, \ __VA_ARGS__); # endif # endif #endif CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_deprecated.cuh000066400000000000000000000073721463375617100176300ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Define CUB_DEPRECATED macro. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API) # define CUB_IGNORE_DEPRECATED_API #endif #ifdef CUB_IGNORE_DEPRECATED_API # define CUB_DEPRECATED # define CUB_DEPRECATED_BECAUSE(MSG) #elif _CCCL_STD_VER >= 2014 # define CUB_DEPRECATED [[deprecated]] # define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] #elif defined(_CCCL_COMPILER_MSVC) # define CUB_DEPRECATED __declspec(deprecated) # define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) #elif defined(_CCCL_COMPILER_CLANG) # define CUB_DEPRECATED __attribute__((deprecated)) # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) #elif defined(_CCCL_COMPILER_GCC) # define CUB_DEPRECATED __attribute__((deprecated)) # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) #else # define CUB_DEPRECATED # define CUB_DEPRECATED_BECAUSE(MSG) #endif #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED \ CUB_DEPRECATED_BECAUSE( \ "CUB no longer accepts `debug_synchronous` parameter. " \ "Define CUB_DEBUG_SYNC instead, or silence this message with " \ "CUB_IGNORE_DEPRECATED_API.") #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG \ if (debug_synchronous) \ { \ _CubLog("%s\n", \ "CUB no longer accepts `debug_synchronous` parameter. 
" \ "Define CUB_DEBUG_SYNC instead."); \ } cccl-2.5.0/cub/cub/util_device.cuh000066400000000000000000000516611463375617100167670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Properties of a given CUDA device and the corresponding PTX bundle. * * \note * This file contains __host__ only functions and utilities, and should not be * included in code paths that could be online-compiled (ex: using NVRTC). */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include // for backward compatibility #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { /** * @brief Helper class template that allows overwriting the `BLOCK_THREAD` and `ITEMS_PER_THREAD` * configurations of a given policy. */ template struct policy_wrapper_t : PolicyT { static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_; static constexpr int BLOCK_THREADS = BLOCK_THREADS_; static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD; }; } // namespace detail /** * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device */ template CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel() {} #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Returns the current device or -1 if an error occurred. 
*/ CUB_RUNTIME_FUNCTION inline int CurrentDevice() { int device = -1; if (CubDebug(cudaGetDevice(&device))) { return -1; } return device; } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * \brief RAII helper which saves the current device and switches to the * specified device on construction and switches to the saved device on * destruction. */ struct SwitchDevice { private: int const old_device; bool const needs_reset; public: _CCCL_HOST inline SwitchDevice(int new_device) : old_device(CurrentDevice()) , needs_reset(old_device != new_device) { if (needs_reset) { CubDebug(cudaSetDevice(new_device)); } } _CCCL_HOST inline ~SwitchDevice() { if (needs_reset) { CubDebug(cudaSetDevice(old_device)); } } }; #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Returns the number of CUDA devices available or -1 if an error * occurred. */ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() { int count = -1; if (CubDebug(cudaGetDeviceCount(&count))) { // CUDA makes no guarantees about the state of the output parameter if // `cudaGetDeviceCount` fails; in practice, they don't, but out of // paranoia we'll reset `count` to `-1`. count = -1; } return count; } /** * \brief Cache for an arbitrary value produced by a nullary function. */ template struct ValueCache { T const value; /** * \brief Call the nullary function to produce the value and construct the * cache. */ _CCCL_HOST inline ValueCache() : value(Function()) {} }; // Host code, only safely usable in C++11 or newer, where thread-safe // initialization of static locals is guaranteed. This is a separate function // to avoid defining a local static in a host/device function. _CCCL_HOST inline int DeviceCountCachedValue() { static ValueCache cache; return cache.value; } /** * \brief Returns the number of CUDA devices available. * * \note This function may cache the result internally. * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline int DeviceCount() { int result = -1; NV_IF_TARGET(NV_IS_HOST, (result = DeviceCountCachedValue();), (result = DeviceCountUncached();)); return result; } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * \brief Per-device cache for a CUDA attribute value; the attribute is queried * and stored for each device upon construction. */ struct PerDeviceAttributeCache { struct DevicePayload { int attribute; cudaError_t error; }; // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the // `DeviceEntryInitializing` state, and then proceeds to the // `DeviceEntryReady` state. These are the only state transitions allowed; // e.g. a linear sequence of transitions. enum DeviceEntryStatus { DeviceEntryEmpty = 0, DeviceEntryInitializing, DeviceEntryReady }; struct DeviceEntry { std::atomic flag; DevicePayload payload; }; private: std::array entries_; public: /** * \brief Construct the cache. */ _CCCL_HOST inline PerDeviceAttributeCache() : entries_() { assert(DeviceCount() <= CUB_MAX_DEVICES); } /** * \brief Retrieves the payload of the cached function \p f for \p device. * * \note You must pass a morally equivalent function in to every call or * this function has undefined behavior. */ template _CCCL_HOST DevicePayload operator()(Invocable&& f, int device) { if (device >= DeviceCount() || device < 0) { return DevicePayload{0, cudaErrorInvalidDevice}; } auto& entry = entries_[device]; auto& flag = entry.flag; auto& payload = entry.payload; DeviceEntryStatus old_status = DeviceEntryEmpty; // First, check for the common case of the entry being ready. 
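      // If it is not ready, exactly one caller wins the Empty -> Initializing
      // transition below, invokes `f`, and publishes the result with a release
      // store; all other callers spin until an acquire load observes the
      // Ready state.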
if (flag.load(std::memory_order_acquire) != DeviceEntryReady) { // Assume the entry is empty and attempt to lock it so we can fill // it by trying to set the state from `DeviceEntryReady` to // `DeviceEntryInitializing`. if (flag.compare_exchange_strong( old_status, DeviceEntryInitializing, std::memory_order_acq_rel, std::memory_order_acquire)) { // We successfully set the state to `DeviceEntryInitializing`; // we have the lock and it's our job to initialize this entry // and then release it. // We don't use `CubDebug` here because we let the user code // decide whether or not errors are hard errors. payload.error = ::cuda::std::forward(f)(payload.attribute); if (payload.error) { // Clear the global CUDA error state which may have been // set by the last call. Otherwise, errors may "leak" to // unrelated kernel launches. cudaGetLastError(); } // Release the lock by setting the state to `DeviceEntryReady`. flag.store(DeviceEntryReady, std::memory_order_release); } // If the `compare_exchange_weak` failed, then `old_status` has // been updated with the value of `flag` that it observed. else if (old_status == DeviceEntryInitializing) { // Another execution agent is initializing this entry; we need // to wait for them to finish; we'll know they're done when we // observe the entry status as `DeviceEntryReady`. do { old_status = flag.load(std::memory_order_acquire); } while (old_status != DeviceEntryReady); // FIXME: Use `atomic::wait` instead when we have access to // host-side C++20 atomics. We could use libcu++, but it only // supports atomics for SM60 and up, even if you're only using // them in host code. } } // We now know that the state of our entry is `DeviceEntryReady`, so // just return the entry's payload. return entry.payload; } }; #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). */ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) { // Instantiate `EmptyKernel` in both host and device code to ensure // it can be called. typedef void (*EmptyKernelPtr)(); EmptyKernelPtr empty_kernel = EmptyKernel; // This is necessary for unused variable warnings in host compilers. The // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015. (void) reinterpret_cast(empty_kernel); // Define a temporary macro that expands to the current target ptx version // in device code. // may provide an abstraction for this eventually. For now, // we have to keep this usage of __CUDA_ARCH__. #if defined(_NVHPC_CUDA) # define CUB_TEMP_GET_PTX __builtin_current_device_sm() #else # define CUB_TEMP_GET_PTX __CUDA_ARCH__ #endif cudaError_t result = cudaSuccess; NV_IF_TARGET( NV_IS_HOST, (cudaFuncAttributes empty_kernel_attrs; result = CubDebug(cudaFuncGetAttributes(&empty_kernel_attrs, reinterpret_cast(empty_kernel))); ptx_version = empty_kernel_attrs.ptxVersion * 10;), // NV_IS_DEVICE ( // This is necessary to ensure instantiation of EmptyKernel in device // code. The `reinterpret_cast` is necessary to suppress a // set-but-unused warnings. This is a meme now: // https://twitter.com/blelbach/status/1222391615576100864 (void) reinterpret_cast(empty_kernel); ptx_version = CUB_TEMP_GET_PTX;)); #undef CUB_TEMP_GET_PTX return result; } /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). 
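 *
 * \par Snippet
 * A minimal host-side sketch (illustrative only; querying device 0 is an assumption):
 * \code
 * int ptx_version = 0;
 * cudaError_t error = cub::PtxVersionUncached(ptx_version, 0);
 * // e.g. ptx_version == 800 when the PTX bundle targets compute capability 8.0
 * \endcode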
*/ _CCCL_HOST inline cudaError_t PtxVersionUncached(int& ptx_version, int device) { SwitchDevice sd(device); (void) sd; return PtxVersionUncached(ptx_version); } template _CCCL_HOST inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { // C++11 guarantees that initialization of static locals is thread safe. static PerDeviceAttributeCache cache; return cache; } struct PtxVersionCacheTag {}; struct SmVersionCacheTag {}; /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). * * \note This function may cache the result internally. * * \note This function is thread safe. */ _CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device) { auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in the payload, // which we check with `CubDebug` below. [=](int& pv) { return PtxVersionUncached(pv, device); }, device); if (!CubDebug(payload.error)) { ptx_version = payload.attribute; } return payload.error; } /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). * * \note This function may cache the result internally. * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET( NV_IS_HOST, (auto const device = CurrentDevice(); auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in the payload, // which we check with `CubDebug` below. [=](int& pv) { return PtxVersionUncached(pv, device); }, device); if (!CubDebug(payload.error)) { ptx_version = payload.attribute; } result = payload.error;), ( // NV_IS_DEVICE: result = PtxVersionUncached(ptx_version);)); return result; } /** * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) { cudaError_t error = cudaSuccess; do { int major = 0, minor = 0; error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); if (cudaSuccess != error) { break; } error = CubDebug(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); if (cudaSuccess != error) { break; } sm_version = major * 100 + minor * 10; } while (0); return error; } /** * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) * * \note This function may cache the result internally. * * \note This function is thread safe. */ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) { cudaError_t result = cudaErrorUnknown; NV_IF_TARGET( NV_IS_HOST, (auto const payload = GetPerDeviceAttributeCache()( // If this call fails, then we get the error code back in // the payload, which we check with `CubDebug` below. [=](int& pv) { return SmVersionUncached(pv, device); }, device); if (!CubDebug(payload.error)) { sm_version = payload.attribute; }; result = payload.error;), ( // NV_IS_DEVICE result = SmVersionUncached(sm_version, device);)); return result; } /** * Synchronize the specified \p stream. 
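 *
 * \par Snippet
 * A minimal host-side sketch (illustrative only; stream creation and destruction are assumptions):
 * \code
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 * // ... enqueue work on the stream ...
 * cudaError_t error = cub::SyncStream(stream); // cudaStreamSynchronize on the host path
 * cudaStreamDestroy(stream);
 * \endcode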
*/ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { cudaError_t result = cudaErrorNotSupported; NV_IF_TARGET(NV_IS_HOST, (result = CubDebug(cudaStreamSynchronize(stream));), ((void) stream; result = CubDebug(cub::detail::device_synchronize());)); return result; } namespace detail { /** * Same as SyncStream, but intended for use with the debug_synchronous flags * in device algorithms. This should not be used if synchronization is required * for correctness. * * If `debug_synchronous` is false, this function will immediately return * cudaSuccess. If true, one of the following will occur: * * If synchronization is supported by the current compilation target and * settings, the sync is performed and the sync result is returned. * * If syncs are not supported then no sync is performed, but a message is logged * via _CubLog and cudaSuccess is returned. */ CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream) { #ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC (void) stream; return cudaSuccess; #else // CUB_DETAIL_DEBUG_ENABLE_SYNC: # define CUB_TMP_SYNC_AVAILABLE \ _CubLog("%s\n", "Synchronizing..."); \ return SyncStream(stream) # define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \ (void) stream; \ _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \ "device-side sync requires // or equivalently * * template * __global__ void ExampleKernel() * { * // Allocate shared memory for BlockScan * __shared__ volatile T buffer[4096]; * * ... * } * * ... * * // Determine SM occupancy for ExampleKernel specialized for unsigned char * int max_sm_occupancy; * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); * * // max_sm_occupancy <-- 4 on SM10 * // max_sm_occupancy <-- 8 on SM20 * // max_sm_occupancy <-- 12 on SM35 * * @endcode * * @param[out] max_sm_occupancy * maximum number of thread blocks that can reside on a single SM * * @param[in] kernel_ptr * Kernel pointer for which to compute SM occupancy * * @param[in] block_threads * Number of threads per thread block * * @param[in] dynamic_smem_bytes * Dynamically allocated shared memory in bytes. Default is 0. 
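 *
 * @par
 * (A thin wrapper around cudaOccupancyMaxActiveBlocksPerMultiprocessor: the returned
 * occupancy reflects the kernel's static resource usage plus the requested dynamic
 * shared memory.)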
*/ template _CCCL_VISIBILITY_HIDDEN CUB_RUNTIME_FUNCTION inline cudaError_t MaxSmOccupancy(int& max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes = 0) { return CubDebug( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_sm_occupancy, kernel_ptr, block_threads, dynamic_smem_bytes)); } /****************************************************************************** * Policy management ******************************************************************************/ /** * Kernel dispatch configuration */ struct KernelConfig { int block_threads; int items_per_thread; int tile_size; int sm_occupancy; CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE KernelConfig() : block_threads(0) , items_per_thread(0) , tile_size(0) , sm_occupancy(0) {} template CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t Init(KernelPtrT kernel_ptr) { block_threads = AgentPolicyT::BLOCK_THREADS; items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; tile_size = block_threads * items_per_thread; cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); return retval; } }; /// Helper for dispatching into a policy chain template struct ChainedPolicy { /// The policy for the active compiler pass using ActivePolicy = cub::detail::conditional_t<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>; /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int ptx_version, FunctorT& op) { if (ptx_version < PTX_VERSION) { return PrevPolicyT::Invoke(ptx_version, op); } return op.template Invoke(); } }; /// Helper for dispatching into a policy chain (end-of-chain specialization) template struct ChainedPolicy { /// The policy for the active compiler pass typedef PolicyT ActivePolicy; /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op) { return op.template Invoke(); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_macro.cuh000066400000000000000000000126201463375617100166210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Common C/C++ macro utilities ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN #ifndef CUB_ALIGN # if defined(_WIN32) || defined(_WIN64) /// Align struct # define CUB_ALIGN(bytes) __declspec(align(32)) # else /// Align struct # define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) # endif #endif #define CUB_PREVENT_MACRO_SUBSTITUTION #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template constexpr _CCCL_HOST_DEVICE auto min CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u)) { return t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u); } template constexpr _CCCL_HOST_DEVICE auto max CUB_PREVENT_MACRO_SUBSTITUTION(T&& t, U&& u) -> decltype(t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t)) { return t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t); } #endif #ifndef CUB_MAX /// Select maximum(a, b) # define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) #endif #ifndef CUB_MIN /// Select minimum(a, b) # define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) #endif #ifndef CUB_QUOTIENT_FLOOR /// Quotient of x/y rounded down to nearest integer # define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) #endif #ifndef CUB_QUOTIENT_CEILING /// Quotient of x/y rounded up to nearest integer # define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) -1) / (y)) #endif #ifndef CUB_ROUND_UP_NEAREST /// x rounded up to the nearest multiple of y # define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) -1) / (y)) * y) #endif #ifndef CUB_ROUND_DOWN_NEAREST /// x rounded down to the nearest multiple of y # define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) #endif #ifndef CUB_STATIC_ASSERT # ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document # define CUB_CAT_(a, b) a##b # define CUB_CAT(a, b) CUB_CAT_(a, b) # endif // DOXYGEN_SHOULD_SKIP_THIS /// Static assert # define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] #endif #ifndef CUB_DETAIL_KERNEL_ATTRIBUTES # define CUB_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES #endif /** * @def CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION * If defined, the default suppression of kernel visibility attribute warning is disabled. 
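 *
 * @par
 * For example, a build that wants to see these attribute warnings again can define the
 * macro before any CUB header is included, e.g. by passing
 * -DCUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION on the compiler command line
 * (illustrative; any mechanism that defines the macro works).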
*/ #if !defined(CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION) _CCCL_DIAG_SUPPRESS_GCC("-Wattributes") _CCCL_DIAG_SUPPRESS_CLANG("-Wattributes") # if !defined(_CCCL_CUDA_COMPILER_NVHPC) _CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage) # endif // !_CCCL_CUDA_COMPILER_NVHPC # if defined(_CCCL_COMPILER_ICC) || defined(_CCCL_COMPILER_ICC_LLVM) # pragma nv_diag_suppress 1407 // the "__visibility__" attribute can only appear on functions and // variables with external linkage' # pragma warning(disable : 1890) // the "__visibility__" attribute can only appear on functions and // variables with external linkage' # endif // _CCCL_COMPILER_ICC || _CCCL_COMPILER_ICC_LLVM #endif // !CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_math.cuh000066400000000000000000000116551463375617100164600ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Define helper math functions. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include CUB_NAMESPACE_BEGIN namespace detail { template using is_integral_or_enum = ::cuda::std::integral_constant::value || ::cuda::std::is_enum::value>; /** * Computes lhs + rhs, but bounds the result to the maximum number representable by the given type, if the addition * would overflow. Note, lhs must be non-negative. * * Effectively performs `min((lhs + rhs), ::cuda::std::numeric_limits::max())`, but is robust against the case * where `(lhs + rhs)` would overflow. 
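 *
 * A minimal sketch of the intent (illustrative only):
 * `safe_add_bound_to_max(::cuda::std::numeric_limits<int>::max() - 1, 10)` evaluates to
 * `::cuda::std::numeric_limits<int>::max()` rather than overflowing.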
*/ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE OffsetT safe_add_bound_to_max(OffsetT lhs, OffsetT rhs) { static_assert(::cuda::std::is_integral::value, "OffsetT must be an integral type"); static_assert(sizeof(OffsetT) >= 4, "OffsetT must be at least 32 bits in size"); auto const capped_operand_rhs = (cub::min)(rhs, ::cuda::std::numeric_limits::max() - lhs); return lhs + capped_operand_rhs; } } // namespace detail /** * Divide n by d, round up if any remainder, and return the result. * * Effectively performs `(n + d - 1) / d`, but is robust against the case where * `(n + d - 1)` would overflow. */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr NumeratorT DivideAndRoundUp(NumeratorT n, DenominatorT d) { static_assert( cub::detail::is_integral_or_enum::value && cub::detail::is_integral_or_enum::value, "DivideAndRoundUp is only intended for integral types."); // Static cast to undo integral promotion. return static_cast(n / d + (n % d != 0 ? 1 : 0)); } constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 8 / combined_bytes)); } template constexpr _CCCL_HOST_DEVICE int Nominal4BItemsToItems(int nominal_4b_items_per_thread) { return (cub::min)(nominal_4b_items_per_thread, (cub::max)(1, nominal_4b_items_per_thread * 4 / static_cast(sizeof(T)))); } template constexpr _CCCL_HOST_DEVICE int Nominal8BItemsToItems(int nominal_8b_items_per_thread) { return sizeof(ItemT) <= 8u ? nominal_8b_items_per_thread : (cub::min)(nominal_8b_items_per_thread, (cub::max)(1, ((nominal_8b_items_per_thread * 8) + static_cast(sizeof(ItemT)) - 1) / static_cast(sizeof(ItemT)))); } /** * \brief Computes the midpoint of the integers * * Extra operation is performed in order to prevent overflow. * * \return Half the sum of \p begin and \p end */ template constexpr _CCCL_HOST_DEVICE T MidPoint(T begin, T end) { return begin + (end - begin) / 2; } CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_namespace.cuh000066400000000000000000000257131463375617100174630ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file util_namespace.cuh * \brief Utilities that allow `cub::` to be placed inside an * application-specific namespace. */ #pragma once // This is not used by this file; this is a hack so that we can detect the // CUB version from Thrust on older versions of CUB that did not have // version.cuh. #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include // Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users // that they must now define the qualifier macro, too. #if (defined(CUB_NS_PREFIX) || defined(CUB_NS_POSTFIX)) && !defined(CUB_NS_QUALIFIER) # error CUB requires a definition of CUB_NS_QUALIFIER when CUB_NS_PREFIX/POSTFIX are defined. #endif /** * \def THRUST_CUB_WRAPPED_NAMESPACE * If defined, this value will be used as the name of a namespace that wraps the * `thrust::` and `cub::` namespaces. * This macro should not be used with any other CUB namespace macros. */ #ifdef THRUST_CUB_WRAPPED_NAMESPACE # define CUB_WRAPPED_NAMESPACE THRUST_CUB_WRAPPED_NAMESPACE #endif /** * \def CUB_WRAPPED_NAMESPACE * If defined, this value will be used as the name of a namespace that wraps the * `cub::` namespace. * If THRUST_CUB_WRAPPED_NAMESPACE is set, this will inherit that macro's value. * This macro should not be used with any other CUB namespace macros. */ #ifdef CUB_WRAPPED_NAMESPACE # define CUB_NS_PREFIX \ namespace CUB_WRAPPED_NAMESPACE \ { # define CUB_NS_POSTFIX } # define CUB_NS_QUALIFIER ::CUB_WRAPPED_NAMESPACE::cub #endif /** * \def CUB_NS_PREFIX * This macro is inserted prior to all `namespace cub { ... }` blocks. It is * derived from CUB_WRAPPED_NAMESPACE, if set, and will be empty otherwise. * It may be defined by users, in which case CUB_NS_PREFIX, * CUB_NS_POSTFIX, and CUB_NS_QUALIFIER must all be set consistently. */ #ifndef CUB_NS_PREFIX # define CUB_NS_PREFIX #endif /** * \def CUB_NS_POSTFIX * This macro is inserted following the closing braces of all * `namespace cub { ... }` block. It is defined appropriately when * CUB_WRAPPED_NAMESPACE is set, and will be empty otherwise. It may be * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and * CUB_NS_QUALIFIER must all be set consistently. */ #ifndef CUB_NS_POSTFIX # define CUB_NS_POSTFIX #endif /** * \def CUB_NS_QUALIFIER * This macro is used to qualify members of cub:: when accessing them from * outside of their namespace. By default, this is just `::cub`, and will be * set appropriately when CUB_WRAPPED_NAMESPACE is defined. This macro may be * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and * CUB_NS_QUALIFIER must all be set consistently. 
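 *
 * A consistent manual configuration might look like the following sketch
 * (illustrative only; `mylib` is a placeholder namespace name):
 * \code
 * #define CUB_NS_PREFIX    namespace mylib {
 * #define CUB_NS_POSTFIX   }
 * #define CUB_NS_QUALIFIER ::mylib::cub
 * \endcode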
*/ #ifndef CUB_NS_QUALIFIER # define CUB_NS_QUALIFIER ::cub #endif #if !defined(CUB_DETAIL_MAGIC_NS_NAME) # define CUB_DETAIL_COUNT_N( \ _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, N, ...) \ N # define CUB_DETAIL_COUNT(...) \ CUB_DETAIL_IDENTITY( \ CUB_DETAIL_COUNT_N(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)) # define CUB_DETAIL_IDENTITY(N) N # define CUB_DETAIL_APPLY(MACRO, ...) CUB_DETAIL_IDENTITY(MACRO(__VA_ARGS__)) # define CUB_DETAIL_MAGIC_NS_NAME1(P1) CUB_##P1##_NS # define CUB_DETAIL_MAGIC_NS_NAME2(P1, P2) CUB_##P1##_##P2##_NS # define CUB_DETAIL_MAGIC_NS_NAME3(P1, P2, P3) CUB_##P1##_##P2##_##P3##_NS # define CUB_DETAIL_MAGIC_NS_NAME4(P1, P2, P3, P4) CUB_##P1##_##P2##_##P3##_##P4##_NS # define CUB_DETAIL_MAGIC_NS_NAME5(P1, P2, P3, P4, P5) CUB_##P1##_##P2##_##P3##_##P4##_##P5##_NS # define CUB_DETAIL_MAGIC_NS_NAME6(P1, P2, P3, P4, P5, P6) CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_NS # define CUB_DETAIL_MAGIC_NS_NAME7(P1, P2, P3, P4, P5, P6, P7) CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_NS # define CUB_DETAIL_MAGIC_NS_NAME8(P1, P2, P3, P4, P5, P6, P7, P8) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_NS # define CUB_DETAIL_MAGIC_NS_NAME9(P1, P2, P3, P4, P5, P6, P7, P8, P9) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_NS # define CUB_DETAIL_MAGIC_NS_NAME10(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_NS # define CUB_DETAIL_MAGIC_NS_NAME11(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_NS # define CUB_DETAIL_MAGIC_NS_NAME12(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_NS # define CUB_DETAIL_MAGIC_NS_NAME13(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_NS # define CUB_DETAIL_MAGIC_NS_NAME14(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_NS # define CUB_DETAIL_MAGIC_NS_NAME15(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_NS # define CUB_DETAIL_MAGIC_NS_NAME16(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_NS # define CUB_DETAIL_MAGIC_NS_NAME17(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_NS # define CUB_DETAIL_MAGIC_NS_NAME18(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_NS # define CUB_DETAIL_MAGIC_NS_NAME19( \ P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_NS # 
define CUB_DETAIL_MAGIC_NS_NAME20( \ P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20) \ CUB_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_##P20##_NS # define CUB_DETAIL_DISPATCH(N) CUB_DETAIL_MAGIC_NS_NAME##N # define CUB_DETAIL_MAGIC_NS_NAME(...) \ CUB_DETAIL_IDENTITY(CUB_DETAIL_APPLY(CUB_DETAIL_DISPATCH, CUB_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) #endif // !defined(CUB_DETAIL_MAGIC_NS_NAME) // clang-format off #if defined(CUB_DISABLE_NAMESPACE_MAGIC) || defined(CUB_WRAPPED_NAMESPACE) # if !defined(CUB_WRAPPED_NAMESPACE) # if !defined(CUB_IGNORE_NAMESPACE_MAGIC_ERROR) # error "Disabling namespace magic is unsafe without wrapping namespace" # endif // !defined(CUB_IGNORE_NAMESPACE_MAGIC_ERROR) # endif // !defined(CUB_WRAPPED_NAMESPACE) # define CUB_DETAIL_MAGIC_NS_BEGIN # define CUB_DETAIL_MAGIC_NS_END #else // not defined(CUB_DISABLE_NAMESPACE_MAGIC) # if defined(_NVHPC_CUDA) # define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, NV_TARGET_SM_INTEGER_LIST) { # define CUB_DETAIL_MAGIC_NS_END } # else // not defined(_NVHPC_CUDA) # define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, __CUDA_ARCH_LIST__) { # define CUB_DETAIL_MAGIC_NS_END } # endif // not defined(_NVHPC_CUDA) #endif // not defined(CUB_DISABLE_NAMESPACE_MAGIC) // clang-format on /** * \def CUB_NAMESPACE_BEGIN * This macro is used to open a `cub::` namespace block, along with any * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_BEGIN \ CUB_NS_PREFIX \ namespace cub \ { \ CUB_DETAIL_MAGIC_NS_BEGIN /** * \def CUB_NAMESPACE_END * This macro is used to close a `cub::` namespace block, along with any * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_END \ CUB_DETAIL_MAGIC_NS_END \ } /* end namespace cub */ \ CUB_NS_POSTFIX // Declare these namespaces here for the purpose of Doxygenating them CUB_NS_PREFIX /*! \namespace cub * \brief \p cub is the top-level namespace which contains all CUB * functions and types. */ namespace cub { } CUB_NS_POSTFIX cccl-2.5.0/cub/cub/util_ptx.cuh000066400000000000000000000564541463375617100163500ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * PTX intrinsics */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * PTX helper macros ******************************************************************************/ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Register modifier for pointer-types (for inlining PTX assembly) */ # if defined(_WIN64) || defined(__LP64__) # define __CUB_LP64__ 1 // 64-bit register modifier for inlined asm # define _CUB_ASM_PTR_ "l" # define _CUB_ASM_PTR_SIZE_ "u64" # else # define __CUB_LP64__ 0 // 32-bit register modifier for inlined asm # define _CUB_ASM_PTR_ "r" # define _CUB_ASM_PTR_SIZE_ "u32" # endif #endif // DOXYGEN_SHOULD_SKIP_THIS /****************************************************************************** * Inlined PTX intrinsics ******************************************************************************/ namespace detail { /** * @brief Shifts @p val left by the amount specified by unsigned 32-bit value in @p num_bits. If @p * num_bits is larger than 32 bits, @p num_bits is clamped to 32. */ _CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftLeft(uint32_t val, uint32_t num_bits) { uint32_t ret{}; asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); return ret; } /** * @brief Shifts @p val right by the amount specified by unsigned 32-bit value in @p num_bits. If @p * num_bits is larger than 32 bits, @p num_bits is clamped to 32. */ _CCCL_DEVICE _CCCL_FORCEINLINE uint32_t LogicShiftRight(uint32_t val, uint32_t num_bits) { uint32_t ret{}; asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits)); return ret; } } // namespace detail /** * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHR_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); return ret; } /** * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHL_ADD(unsigned int x, unsigned int shift, unsigned int addend) { unsigned int ret; asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); return ret; } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Bitfield-extract. 
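 * (Extracts \p num_bits bits of \p source starting at bit-offset \p bit_start via the
 * PTX bfe.u32 instruction; the public BFE() overload further below dispatches here for
 * source types of at most four bytes.)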
*/ template _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type /*byte_len*/) { unsigned int bits; asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); return bits; } /** * Bitfield-extract for 64-bit types. */ template _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type<8> /*byte_len*/) { const unsigned long long MASK = (1ull << num_bits) - 1; return (source >> bit_start) & MASK; } # if CUB_IS_INT128_ENABLED /** * Bitfield-extract for 128-bit types. */ template _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits, Int2Type<16> /*byte_len*/) { const __uint128_t MASK = (__uint128_t{1} << num_bits) - 1; return (source >> bit_start) & MASK; } # endif #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p * source may be an 8b, 16b, 32b, or 64b unsigned integer type. */ template _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int BFE(UnsignedBits source, unsigned int bit_start, unsigned int num_bits) { return BFE(source, bit_start, num_bits, Int2Type()); } /** * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. */ _CCCL_DEVICE _CCCL_FORCEINLINE void BFI(unsigned int& ret, unsigned int x, unsigned int y, unsigned int bit_start, unsigned int num_bits) { asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); } /** * \brief Three-operand add. Returns \p x + \p y + \p z. */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); return x; } /** * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit * destination register. For SM2.0 or later. * * \par * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} * * \par Snippet * The code snippet below illustrates byte-permute. * \par * \code * #include * * __global__ void ExampleKernel(...) * { * int a = 0x03020100; * int b = 0x07060504; * int index = 0x00007531; * * int selected = PRMT(a, b, index); // 0x07050301 * * \endcode * */ _CCCL_DEVICE _CCCL_FORCEINLINE int PRMT(unsigned int a, unsigned int b, unsigned int index) { int ret; asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); return ret; } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Sync-threads barrier. 
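 * (Issues a PTX bar.sync on barrier resource 1 with an explicit thread \p count;
 * distinct from CTA_SYNC() below, which maps to __syncthreads().)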
*/ _CCCL_DEVICE _CCCL_FORCEINLINE void BAR(int count) { asm volatile("bar.sync 1, %0;" : : "r"(count)); } /** * CTA barrier */ _CCCL_DEVICE _CCCL_FORCEINLINE void CTA_SYNC() { __syncthreads(); } /** * CTA barrier with predicate */ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_AND(int p) { return __syncthreads_and(p); } /** * CTA barrier with predicate */ _CCCL_DEVICE _CCCL_FORCEINLINE int CTA_SYNC_OR(int p) { return __syncthreads_or(p); } /** * Warp barrier */ _CCCL_DEVICE _CCCL_FORCEINLINE void WARP_SYNC(unsigned int member_mask) { __syncwarp(member_mask); } /** * Warp any */ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ANY(int predicate, unsigned int member_mask) { return __any_sync(member_mask, predicate); } /** * Warp any */ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_ALL(int predicate, unsigned int member_mask) { return __all_sync(member_mask, predicate); } /** * Warp ballot */ _CCCL_DEVICE _CCCL_FORCEINLINE int WARP_BALLOT(int predicate, unsigned int member_mask) { return __ballot_sync(member_mask, predicate); } /** * Warp synchronous shfl_up */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); return word; } /** * Warp synchronous shfl_down */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); return word; } /** * Warp synchronous shfl_idx */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) { asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); return word; } /** * Warp synchronous shfl_idx */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { return __shfl_sync(member_mask, word, src_lane); } /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ _CCCL_DEVICE _CCCL_FORCEINLINE float FMUL_RZ(float a, float b) { float d; asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); return d; } /** * Floating point multiply-add. (Mantissa LSB rounds towards zero.) */ _CCCL_DEVICE _CCCL_FORCEINLINE float FFMA_RZ(float a, float b, float c) { float d; asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); return d; } #endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Terminates the calling thread */ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadExit() { asm volatile("exit;"); } /** * \brief Abort execution and generate an interrupt to the host CPU */ _CCCL_DEVICE _CCCL_FORCEINLINE void ThreadTrap() { asm volatile("trap;"); } /** * \brief Returns the row-major linear thread identifier for a multidimensional thread block */ _CCCL_DEVICE _CCCL_FORCEINLINE int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) { return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + threadIdx.x; } /** * \brief Returns the warp lane ID of the calling thread */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneId() { unsigned int ret; asm("mov.u32 %0, %%laneid;" : "=r"(ret)); return ret; } /** * \brief Returns the warp ID of the calling thread. 
Warp ID is guaranteed to be unique among warps, but may not * correspond to a zero-based ranking within the thread block. */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int WarpId() { unsigned int ret; asm("mov.u32 %0, %%warpid;" : "=r"(ret)); return ret; } /** * @brief Returns the warp mask for a warp of @p LOGICAL_WARP_THREADS threads * * @par * If the number of threads assigned to the virtual warp is not a power of two, * it's assumed that only one virtual warp exists. * * @tparam LOGICAL_WARP_THREADS [optional] The number of threads per * "logical" warp (may be less than the number of * hardware warp threads). * @param warp_id Id of virtual warp within architectural warp */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE unsigned int WarpMask(unsigned int warp_id) { constexpr bool is_pow_of_two = PowerOfTwo::VALUE; constexpr bool is_arch_warp = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); unsigned int member_mask = 0xFFFFFFFFu >> (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS); _CCCL_IF_CONSTEXPR (is_pow_of_two && !is_arch_warp) { member_mask <<= warp_id * LOGICAL_WARP_THREADS; } (void) warp_id; return member_mask; } /** * \brief Returns the warp lane mask of all lanes less than the calling thread */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLt() { unsigned int ret; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret)); return ret; } /** * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskLe() { unsigned int ret; asm("mov.u32 %0, %%lanemask_le;" : "=r"(ret)); return ret; } /** * \brief Returns the warp lane mask of all lanes greater than the calling thread */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGt() { unsigned int ret; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(ret)); return ret; } /** * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int LaneMaskGe() { unsigned int ret; asm("mov.u32 %0, %%lanemask_ge;" : "=r"(ret)); return ret; } /** * @brief Shuffle-up for any data type. * Each warp-lanei obtains the value @p input contributed by * warp-lanei-src_offset. * For thread lanes @e i < src_offset, the thread's own @p input is returned to the thread. * ![](shfl_up_logo.png) * * @tparam LOGICAL_WARP_THREADS * The number of threads per "logical" warp. Must be a power-of-two <= 32. * * @tparam T * [inferred] The input/output element type * * @par * - Available only for SM3.0 or newer * * @par Snippet * The code snippet below illustrates each thread obtaining a \p double value from the * predecessor of its predecessor. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Obtain one input item per thread * double thread_data = ... * * // Obtain item from two ranks below * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); * * @endcode * @par * Suppose the set of input @p thread_data across the first warp of threads is * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. The corresponding output @p peer_data will be * {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
* * @param[in] input * The value to broadcast * * @param[in] src_offset * The relative down-offset of the peer to read from * * @param[in] first_thread * Index of first lane in logical warp (typically 0) * * @param[in] member_mask * 32-bit mask of participating warp lanes */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 }; typedef typename UnitWord::ShuffleWord ShuffleWord; constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; ShuffleWord* output_alias = reinterpret_cast(&output); ShuffleWord* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_UP_SYNC((unsigned int) input_alias[0], src_offset, first_thread | SHFL_C, member_mask); output_alias[0] = shuffle_word; #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { shuffle_word = SHFL_UP_SYNC((unsigned int) input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); output_alias[WORD] = shuffle_word; } return output; } /** * @brief Shuffle-down for any data type. * Each warp-lanei obtains the value @p input contributed by * warp-lanei+src_offset. * For thread lanes @e i >= WARP_THREADS, the thread's own @p input is returned to the * thread. ![](shfl_down_logo.png) * * @tparam LOGICAL_WARP_THREADS * The number of threads per "logical" warp. Must be a power-of-two <= 32. * * @tparam T * [inferred] The input/output element type * * @par * - Available only for SM3.0 or newer * * @par Snippet * The code snippet below illustrates each thread obtaining a @p double value from the * successor of its successor. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Obtain one input item per thread * double thread_data = ... * * // Obtain item from two ranks below * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); * * @endcode * @par * Suppose the set of input @p thread_data across the first warp of threads is * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. * The corresponding output @p peer_data will be * {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. * * @param[in] input * The value to broadcast * * @param[in] src_offset * The relative up-offset of the peer to read from * * @param[in] last_thread * Index of last thread in logical warp (typically 31 for a 32-thread warp) * * @param[in] member_mask * 32-bit mask of participating warp lanes */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 }; typedef typename UnitWord::ShuffleWord ShuffleWord; constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; ShuffleWord* output_alias = reinterpret_cast(&output); ShuffleWord* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_DOWN_SYNC((unsigned int) input_alias[0], src_offset, last_thread | SHFL_C, member_mask); output_alias[0] = shuffle_word; #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { shuffle_word = SHFL_DOWN_SYNC((unsigned int) input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); output_alias[WORD] = shuffle_word; } return output; } /** * @brief Shuffle-broadcast for any data type. 
* Each warp-lanei obtains the value @p input * contributed by warp-lanesrc_lane. * For @p src_lane < 0 or @p src_lane >= WARP_THREADS, * then the thread's own @p input is returned to the thread. * ![](shfl_broadcast_logo.png) * * @tparam LOGICAL_WARP_THREADS * The number of threads per "logical" warp. Must be a power-of-two <= 32. * * @tparam T * [inferred] The input/output element type * * @par * - Available only for SM3.0 or newer * * @par Snippet * The code snippet below illustrates each thread obtaining a @p double value from * warp-lane0. * * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Obtain one input item per thread * double thread_data = ... * * // Obtain item from thread 0 * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); * * @endcode * @par * Suppose the set of input @p thread_data across the first warp of threads is * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. * The corresponding output @p peer_data will be * {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. * * @param[in] input * The value to broadcast * * @param[in] src_lane * Which warp lane is to do the broadcasting * * @param[in] member_mask * 32-bit mask of participating warp lanes */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ShuffleIndex(T input, int src_lane, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) }; typedef typename UnitWord::ShuffleWord ShuffleWord; constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); T output; ShuffleWord* output_alias = reinterpret_cast(&output); ShuffleWord* input_alias = reinterpret_cast(&input); unsigned int shuffle_word; shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[0], src_lane, SHFL_C, member_mask); output_alias[0] = shuffle_word; #pragma unroll for (int WORD = 1; WORD < WORDS; ++WORD) { shuffle_word = SHFL_IDX_SYNC((unsigned int) input_alias[WORD], src_lane, SHFL_C, member_mask); output_alias[WORD] = shuffle_word; } return output; } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { /** * Implementation detail for `MatchAny`. It provides specializations for full and partial warps. * For partial warps, inactive threads must be masked out. This is done in the partial warp * specialization below. 
* Usage: * ``` * // returns a mask of threads with the same 4 least-significant bits of `label` * // in a warp with 16 active threads * warp_matcher_t<4, 16>::match_any(label); * * // returns a mask of threads with the same 4 least-significant bits of `label` * // in a warp with 32 active threads (no extra work is done) * warp_matcher_t<4, 32>::match_any(label); * ``` */ template struct warp_matcher_t { static _CCCL_DEVICE unsigned int match_any(unsigned int label) { return warp_matcher_t::match_any(label) & ~(~0 << WARP_ACTIVE_THREADS); } }; template struct warp_matcher_t { // match.any.sync.b32 is slower when matching a few bits // using a ballot loop instead static _CCCL_DEVICE unsigned int match_any(unsigned int label) { unsigned int retval; // Extract masks of common threads for each bit # pragma unroll for (int BIT = 0; BIT < LABEL_BITS; ++BIT) { unsigned int mask; unsigned int current_bit = 1 << BIT; asm("{\n" " .reg .pred p;\n" " and.b32 %0, %1, %2;" " setp.ne.u32 p, %0, 0;\n" " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" " @!p not.b32 %0, %0;\n" "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); // Remove peers who differ retval = (BIT == 0) ? mask : retval & mask; } return retval; } }; } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS /** * Compute a 32b mask of threads having the same least-significant * LABEL_BITS of \p label as the calling thread. */ template inline _CCCL_DEVICE unsigned int MatchAny(unsigned int label) { return detail::warp_matcher_t::match_any(label); } CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_temporary_storage.cuh000066400000000000000000000101521463375617100212640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Utilities for device-accessible temporary storages. 
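 *
 * The main entry point is AliasTemporaries(), which carves a single user-provided
 * allocation into multiple 256-byte-aligned sub-allocations, or reports the total
 * number of bytes required when the allocation pointer is NULL.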
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * @brief Alias temporaries to externally-allocated device storage (or simply return the amount of * storage needed). * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to @p temp_storage_bytes and no work is * done. * * @param[in,out] temp_storage_bytes * Size in bytes of @p d_temp_storage allocation * * @param[in,out] allocations * Pointers to device allocations needed * * @param[in] allocation_sizes * Sizes in bytes of device allocations needed */ template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t AliasTemporaries( void* d_temp_storage, size_t& temp_storage_bytes, void* (&allocations)[ALLOCATIONS], size_t (&allocation_sizes)[ALLOCATIONS]) { constexpr int ALIGN_BYTES = 256; constexpr int ALIGN_MASK = ~(ALIGN_BYTES - 1); // Compute exclusive prefix sum over allocation requests size_t allocation_offsets[ALLOCATIONS]; size_t bytes_needed = 0; for (int i = 0; i < ALLOCATIONS; ++i) { size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; allocation_offsets[i] = bytes_needed; bytes_needed += allocation_bytes; } bytes_needed += ALIGN_BYTES - 1; // Check if the caller is simply requesting the size of the storage allocation if (!d_temp_storage) { temp_storage_bytes = bytes_needed; return cudaSuccess; } // Check if enough storage provided if (temp_storage_bytes < bytes_needed) { return CubDebug(cudaErrorInvalidValue); } // Alias d_temp_storage = (void*) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); for (int i = 0; i < ALLOCATIONS; ++i) { allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; } return cudaSuccess; } #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/util_type.cuh000066400000000000000000001363471463375617100165160ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Common type manipulation (metaprogramming) utilities */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #if defined(_CCCL_HAS_NVBF16) # if !defined(_CCCL_CUDACC_BELOW_11_8) // cuda_fp8.h resets default for C4127, so we have to guard the inclusion _CCCL_DIAG_PUSH # include _CCCL_DIAG_POP # endif // !_CCCL_CUDACC_BELOW_11_8 #endif // _CCCL_HAS_NV_BF16 #if !defined(_CCCL_COMPILER_NVRTC) # include #else # include #endif CUB_NAMESPACE_BEGIN #ifndef CUB_IS_INT128_ENABLED # if defined(__CUDACC_RTC__) # if defined(__CUDACC_RTC_INT128__) # define CUB_IS_INT128_ENABLED 1 # endif // !defined(__CUDACC_RTC_INT128__) # else // !defined(__CUDACC_RTC__) # if _CCCL_CUDACC_VER >= 1105000 # if defined(_CCCL_COMPILER_GCC) || defined(_CCCL_COMPILER_CLANG) || defined(_CCCL_COMPILER_ICC) \ || defined(_CCCL_COMPILER_NVHPC) # define CUB_IS_INT128_ENABLED 1 # endif // GCC || CLANG || ICC || NVHPC # endif // CTK >= 11.5 # endif // !defined(__CUDACC_RTC__) #endif // !defined(CUB_IS_INT128_ENABLED) /****************************************************************************** * Conditional types ******************************************************************************/ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { template using conditional_t = typename ::cuda::std::conditional::type; template using value_t = # if !defined(_CCCL_COMPILER_NVRTC) typename std::iterator_traits::value_type; # else // defined(_CCCL_COMPILER_NVRTC) typename ::cuda::std::iterator_traits::value_type; # endif // defined(_CCCL_COMPILER_NVRTC) template ::type>::type, void>::value> struct non_void_value_impl { using type = FallbackT; }; template struct non_void_value_impl { using type = typename ::cuda::std::conditional<::cuda::std::is_same, void>::value, FallbackT, value_t>::type; }; /** * The output value type * type = (if IteratorT's value type is void) ? * ... then the FallbackT, * ... else the IteratorT's value type */ template using non_void_value_t = typename non_void_value_impl::type; } // namespace detail /** * \brief Type selection (IF ? ThenType : ElseType) * * \deprecated [Since 1.16.0] The cub::If APIs are deprecated. * Use cub::detail::conditional_t instead. */ template struct CUB_DEPRECATED If { using Type = cub::detail::conditional_t; }; /****************************************************************************** * Type equality ******************************************************************************/ /** * \brief Type equality test * * \deprecated [Since 1.16.0] The cub::Equals APIs are deprecated. * Use std::is_same instead. */ template struct CUB_DEPRECATED Equals { static constexpr int VALUE = ::cuda::std::is_same::value ? 
1 : 0; static constexpr int NEGATE = VALUE ? 0 : 1; }; /****************************************************************************** * Static math ******************************************************************************/ /** * \brief Statically determine log2(N), rounded up. * * For example: * Log2<8>::VALUE // 3 * Log2<3>::VALUE // 2 */ template struct Log2 { /// Static logarithm value enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case }; # ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template struct Log2 { enum { VALUE = (1 << (COUNT - 1) < N) ? // Base case COUNT : COUNT - 1 }; }; # endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Statically determine if N is a power-of-two */ template struct PowerOfTwo { enum { VALUE = ((N & (N - 1)) == 0) }; }; /****************************************************************************** * Pointer vs. iterator detection ******************************************************************************/ /** * \brief Pointer vs. iterator * * \deprecated [Since 1.16.0] The cub::IsPointer APIs are deprecated. * Use std::is_pointer instead. */ template struct CUB_DEPRECATED IsPointer { static constexpr int VALUE = ::cuda::std::is_pointer::value; }; /****************************************************************************** * Qualifier detection ******************************************************************************/ /** * \brief Volatile modifier test * * \deprecated [Since 1.16.0] The cub::IsVolatile APIs are deprecated. * Use std::is_volatile instead. */ template struct CUB_DEPRECATED IsVolatile { static constexpr int VALUE = ::cuda::std::is_volatile::value; }; /****************************************************************************** * Qualifier removal ******************************************************************************/ /** * \brief Removes \p const and \p volatile qualifiers from type \p Tp. * * \deprecated [Since 1.16.0] The cub::RemoveQualifiers APIs are deprecated. * Use std::remove_cv instead. * * For example: * typename RemoveQualifiers::Type // int; */ template struct CUB_DEPRECATED RemoveQualifiers { using Type = typename ::cuda::std::remove_cv::type; }; #endif // DOXYGEN_SHOULD_SKIP_THIS /****************************************************************************** * Marker types ******************************************************************************/ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * \brief A simple "NULL" marker type */ struct NullType { using value_type = NullType; NullType() = default; template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE explicit NullType(const T&) {} template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE NullType& operator=(const T&) { return *this; } friend _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const NullType&, const NullType&) { return true; } friend _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const NullType&, const NullType&) { return false; } }; /** * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call * dispatch based on constant integral values) */ template struct Int2Type { enum { VALUE = A }; }; /** * \brief Allows algorithms that take a value as input to take a future value that is not computed yet at launch time. * * Note that it is user's responsibility to ensure that the result will be ready before use via external synchronization * or stream-ordering dependencies. 
* * \code * int *d_intermediate_result; * allocator.DeviceAllocate((void **)&d_intermediate_result, sizeof(int)); * compute_intermediate_result<<>>( * d_intermediate_result, // output * arg1, // input * arg2); // input * cub::FutureValue init_value(d_intermediate_result); * cub::DeviceScan::ExclusiveScan( * d_temp_storage, * temp_storage_bytes, * d_in, * d_out, * cub::Sum(), * init_value, * num_items); * allocator.DeviceFree(d_intermediate_result); * \endcode */ template struct FutureValue { using value_type = T; using iterator_type = IterT; explicit _CCCL_HOST_DEVICE _CCCL_FORCEINLINE FutureValue(IterT iter) : m_iter(iter) {} _CCCL_HOST_DEVICE _CCCL_FORCEINLINE operator T() { return *m_iter; } private: IterT m_iter; }; namespace detail { /** * \brief Allows algorithms to instantiate a single kernel to support both immediate value and future value. */ template struct InputValue { using value_type = T; using iterator_type = IterT; _CCCL_HOST_DEVICE _CCCL_FORCEINLINE operator T() { if (m_is_future) { return m_future_value; } return m_immediate_value; } explicit _CCCL_HOST_DEVICE _CCCL_FORCEINLINE InputValue(T immediate_value) : m_is_future(false) , m_immediate_value(immediate_value) {} explicit _CCCL_HOST_DEVICE _CCCL_FORCEINLINE InputValue(FutureValue future_value) : m_is_future(true) , m_future_value(future_value) {} _CCCL_HOST_DEVICE _CCCL_FORCEINLINE InputValue(const InputValue& other) : m_is_future(other.m_is_future) { if (m_is_future) { m_future_value = other.m_future_value; } else { detail::uninitialized_copy(&m_immediate_value, other.m_immediate_value); } } private: bool m_is_future; union { FutureValue m_future_value; T m_immediate_value; }; }; } // namespace detail /****************************************************************************** * Size and alignment ******************************************************************************/ /// Structure alignment template struct AlignBytes { struct Pad { T val; char byte; }; enum { /// The "true CUDA" alignment of T in bytes ALIGN_BYTES = sizeof(Pad) - sizeof(T) }; /// The "truly aligned" type typedef T Type; }; // Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree // with device C++ compilers (EDG) on types passed as template parameters through // kernel functions # define __CUB_ALIGN_BYTES(t, b) \ template <> \ struct AlignBytes \ { \ enum \ { \ ALIGN_BYTES = b \ }; \ typedef __align__(b) t Type; \ }; __CUB_ALIGN_BYTES(short4, 8) __CUB_ALIGN_BYTES(ushort4, 8) __CUB_ALIGN_BYTES(int2, 8) __CUB_ALIGN_BYTES(uint2, 8) __CUB_ALIGN_BYTES(long long, 8) __CUB_ALIGN_BYTES(unsigned long long, 8) __CUB_ALIGN_BYTES(float2, 8) __CUB_ALIGN_BYTES(double, 8) # ifdef _WIN32 __CUB_ALIGN_BYTES(long2, 8) __CUB_ALIGN_BYTES(ulong2, 8) # else __CUB_ALIGN_BYTES(long2, 16) __CUB_ALIGN_BYTES(ulong2, 16) # endif __CUB_ALIGN_BYTES(int4, 16) __CUB_ALIGN_BYTES(uint4, 16) __CUB_ALIGN_BYTES(float4, 16) __CUB_ALIGN_BYTES(long4, 16) __CUB_ALIGN_BYTES(ulong4, 16) __CUB_ALIGN_BYTES(longlong2, 16) __CUB_ALIGN_BYTES(ulonglong2, 16) __CUB_ALIGN_BYTES(double2, 16) __CUB_ALIGN_BYTES(longlong4, 16) __CUB_ALIGN_BYTES(ulonglong4, 16) __CUB_ALIGN_BYTES(double4, 16) // clang-format off template struct AlignBytes : AlignBytes {}; template struct AlignBytes : AlignBytes {}; template struct AlignBytes : AlignBytes {}; // clang-format on /// Unit-words of data movement template struct UnitWord { enum { ALIGN_BYTES = AlignBytes::ALIGN_BYTES }; template struct IsMultiple { enum { UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, IS_MULTIPLE = (sizeof(T) % 
sizeof(Unit) == 0) && (int(ALIGN_BYTES) % int(UNIT_ALIGN_BYTES) == 0) }; }; /// Biggest shuffle word that T is a whole multiple of and is not larger than /// the alignment of T using ShuffleWord = cub::detail::conditional_t< IsMultiple::IS_MULTIPLE, unsigned int, cub::detail::conditional_t::IS_MULTIPLE, unsigned short, unsigned char>>; /// Biggest volatile word that T is a whole multiple of and is not larger than /// the alignment of T using VolatileWord = cub::detail::conditional_t::IS_MULTIPLE, unsigned long long, ShuffleWord>; /// Biggest memory-access word that T is a whole multiple of and is not larger /// than the alignment of T using DeviceWord = cub::detail::conditional_t::IS_MULTIPLE, ulonglong2, VolatileWord>; /// Biggest texture reference word that T is a whole multiple of and is not /// larger than the alignment of T using TextureWord = cub::detail::conditional_t::IS_MULTIPLE, uint4, cub::detail::conditional_t::IS_MULTIPLE, uint2, ShuffleWord>>; }; // float2 specialization workaround (for SM10-SM13) template <> struct UnitWord { typedef int ShuffleWord; typedef unsigned long long VolatileWord; typedef unsigned long long DeviceWord; typedef float2 TextureWord; }; // float4 specialization workaround (for SM10-SM13) template <> struct UnitWord { typedef int ShuffleWord; typedef unsigned long long VolatileWord; typedef ulonglong2 DeviceWord; typedef float4 TextureWord; }; // char2 specialization workaround (for SM10-SM13) template <> struct UnitWord { typedef unsigned short ShuffleWord; typedef unsigned short VolatileWord; typedef unsigned short DeviceWord; typedef unsigned short TextureWord; }; // clang-format off template struct UnitWord : UnitWord {}; template struct UnitWord : UnitWord {}; template struct UnitWord : UnitWord {}; // clang-format on /****************************************************************************** * Vector type inference utilities. ******************************************************************************/ /** * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p * Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. 
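 *
 * Illustrative examples only (derived from the CUB_DEFINE_VECTOR_TYPE expansions further below;
 * \p MyStruct is a placeholder for an arbitrary user-defined type):
 *   CubVector<int, 2>::Type    is expected to alias the built-in \p int2
 *   CubVector<double, 4>::Type is expected to alias the built-in \p double4
 *   CubVector<MyStruct, 2>     has no built-in counterpart, so \p Type is the generic
 *                              wrapper itself, exposing \p x and \p y members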
*/ template struct CubVector { static_assert(!sizeof(T), "CubVector can only have 1-4 elements"); }; enum { /// The maximum number of elements in CUDA vector types MAX_VEC_ELEMENTS = 4, }; /** * Generic vector-1 type */ template struct CubVector { T x; typedef T BaseType; typedef CubVector Type; }; /** * Generic vector-2 type */ template struct CubVector { T x; T y; typedef T BaseType; typedef CubVector Type; }; /** * Generic vector-3 type */ template struct CubVector { T x; T y; T z; typedef T BaseType; typedef CubVector Type; }; /** * Generic vector-4 type */ template struct CubVector { T x; T y; T z; T w; typedef T BaseType; typedef CubVector Type; }; /** * Macro for expanding partially-specialized built-in vector types */ # define CUB_DEFINE_VECTOR_TYPE(base_type, short_type) \ \ template <> \ struct CubVector : short_type##1 \ { \ typedef base_type BaseType; \ typedef short_type##1 Type; \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator+(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x + other.x; \ return retval; \ } \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator-(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x - other.x; \ return retval; \ } \ }; \ \ template <> \ struct CubVector : short_type##2 \ { \ typedef base_type BaseType; \ typedef short_type##2 Type; \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator+(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x + other.x; \ retval.y = y + other.y; \ return retval; \ } \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator-(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x - other.x; \ retval.y = y - other.y; \ return retval; \ } \ }; \ \ template <> \ struct CubVector : short_type##3 \ { \ typedef base_type BaseType; \ typedef short_type##3 Type; \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator+(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x + other.x; \ retval.y = y + other.y; \ retval.z = z + other.z; \ return retval; \ } \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator-(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x - other.x; \ retval.y = y - other.y; \ retval.z = z - other.z; \ return retval; \ } \ }; \ \ template <> \ struct CubVector : short_type##4 \ { \ typedef base_type BaseType; \ typedef short_type##4 Type; \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator+(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x + other.x; \ retval.y = y + other.y; \ retval.z = z + other.z; \ retval.w = w + other.w; \ return retval; \ } \ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE CubVector operator-(const CubVector& other) const \ { \ CubVector retval; \ retval.x = x - other.x; \ retval.y = y - other.y; \ retval.z = z - other.z; \ retval.w = w - other.w; \ return retval; \ } \ }; // Expand CUDA vector types for built-in primitives // clang-format off CUB_DEFINE_VECTOR_TYPE(char, char) CUB_DEFINE_VECTOR_TYPE(signed char, char) CUB_DEFINE_VECTOR_TYPE(short, short) CUB_DEFINE_VECTOR_TYPE(int, int) CUB_DEFINE_VECTOR_TYPE(long, long) CUB_DEFINE_VECTOR_TYPE(long long, longlong) CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) CUB_DEFINE_VECTOR_TYPE(float, float) CUB_DEFINE_VECTOR_TYPE(double, double) CUB_DEFINE_VECTOR_TYPE(bool, uchar) // clang-format on // Undefine macros # undef 
CUB_DEFINE_VECTOR_TYPE /****************************************************************************** * Wrapper types ******************************************************************************/ /** * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions */ template struct Uninitialized { /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T typedef typename UnitWord::DeviceWord DeviceWord; static constexpr ::cuda::std::size_t DATA_SIZE = sizeof(T); static constexpr ::cuda::std::size_t WORD_SIZE = sizeof(DeviceWord); static constexpr ::cuda::std::size_t WORDS = DATA_SIZE / WORD_SIZE; /// Backing storage DeviceWord storage[WORDS]; /// Alias _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T& Alias() { return reinterpret_cast(*this); } }; /** * \brief A key identifier paired with a corresponding value */ template ::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES), bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) # endif // #if defined(_WIN32) && !defined(_WIN64) > struct KeyValuePair { typedef _Key Key; ///< Key data type typedef _Value Value; ///< Value data type Key key; ///< Item key Value value; ///< Item value /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair() {} /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair(Key const& key, Value const& value) : key(key) , value(value) {} /// Inequality operator _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const KeyValuePair& b) { return (value != b.value) || (key != b.key); } }; # if defined(_WIN32) && !defined(_WIN64) /** * Win32 won't do 16B alignment. This can present two problems for * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: * 1) If a smaller-aligned item were to be listed first, the host compiler places the * should-be-16B item at too early an offset (and disagrees with device compiler) * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size * of the struct wrong (and disagrees with device compiler) * * So we put the larger-should-be-aligned item first, and explicitly pad the * end of the struct */ /// Smaller key specialization template struct KeyValuePair { typedef K Key; typedef V Value; typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; Value value; // Value has larger would-be alignment and goes first Key key; Pad pad; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair() {} /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair(Key const& key, Value const& value) : key(key) , value(value) {} /// Inequality operator _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const KeyValuePair& b) { return (value != b.value) || (key != b.key); } }; /// Smaller value specialization template struct KeyValuePair { typedef K Key; typedef V Value; typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; Key key; // Key has larger would-be alignment and goes first Value value; Pad pad; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair() {} /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE KeyValuePair(Key const& key, Value const& value) : key(key) , value(value) {} /// Inequality operator _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const KeyValuePair& b) { return (value != b.value) || (key != b.key); } }; # endif // #if defined(_WIN32) && !defined(_WIN64) /** * \brief A wrapper for passing simple static arrays as kernel parameters * deprecated [Since 
2.5.0] The `cub::ArrayWrapper` is deprecated. Use `cuda::std::array` instead. */ template struct CUB_DEPRECATED_BECAUSE("Use cuda::std::array instead.") ArrayWrapper { /// Statically-sized array of type \p T T array[COUNT]; /// Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE ArrayWrapper() {} }; /** * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array * for streaming intermediate results back and forth. * * Many multi-pass computations require a pair of "ping-pong" storage * buffers (e.g., one for reading from and the other for writing to, and then * vice-versa for the subsequent pass). This structure wraps a set of device * buffers and a "selector" member to track which is "current". */ template struct DoubleBuffer { /// Pair of device buffer pointers T* d_buffers[2]; /// Selector into \p d_buffers (i.e., the active/valid buffer) int selector; /// \brief Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE DoubleBuffer() { selector = 0; d_buffers[0] = NULL; d_buffers[1] = NULL; } /// \brief Constructor _CCCL_HOST_DEVICE _CCCL_FORCEINLINE DoubleBuffer(T* d_current, ///< The currently valid buffer T* d_alternate) ///< Alternate storage buffer of the same size as \p ///< d_current { selector = 0; d_buffers[0] = d_current; d_buffers[1] = d_alternate; } /// \brief Return pointer to the currently valid buffer _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T* Current() { return d_buffers[selector]; } /// \brief Return pointer to the currently invalid buffer _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T* Alternate() { return d_buffers[selector ^ 1]; } }; /****************************************************************************** * Typedef-detection ******************************************************************************/ /** * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a * constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name */ # define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ template \ struct detector_name \ { \ template \ static char& test(typename C::nested_type_name*); \ template \ static int& test(...); \ enum \ { \ VALUE = sizeof(test(0)) < sizeof(int) \ }; \ }; /****************************************************************************** * Simple enable-if (similar to Boost) ******************************************************************************/ /** * \brief Simple enable-if (similar to Boost) * * \deprecated [Since 1.16.0] The cub::If APIs are deprecated. * Use std::enable_if instead. 
*/ template struct CUB_DEPRECATED EnableIf { using Type = typename ::cuda::std::enable_if::type; }; /****************************************************************************** * Typedef-detection ******************************************************************************/ /** * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or * bool operator()(const T& a, const T&b, unsigned int idx) */ template struct BinaryOpHasIdxParam { private: /* template struct SFINAE1 {}; template struct SFINAE2 {}; template struct SFINAE3 {}; template struct SFINAE4 {}; */ template struct SFINAE5 {}; template struct SFINAE6 {}; template struct SFINAE7 {}; template struct SFINAE8 {}; /* template static char Test(SFINAE1 *); template static char Test(SFINAE2 *); template static char Test(SFINAE3 *); template static char Test(SFINAE4 *); */ template _CCCL_HOST_DEVICE static char Test(SFINAE5*); template _CCCL_HOST_DEVICE static char Test(SFINAE6*); template _CCCL_HOST_DEVICE static char Test(SFINAE7*); template _CCCL_HOST_DEVICE static char Test(SFINAE8*); template _CCCL_HOST_DEVICE static int Test(...); public: /// Whether the functor BinaryOp has a third unsigned int index param static constexpr bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); }; /****************************************************************************** * Simple type traits utilities. * * For example: * Traits::CATEGORY // SIGNED_INTEGER * Traits::NULL_TYPE // true * Traits::CATEGORY // NOT_A_NUMBER * Traits::PRIMITIVE; // false * ******************************************************************************/ /** * \brief Basic type traits categories */ enum Category { NOT_A_NUMBER, SIGNED_INTEGER, UNSIGNED_INTEGER, FLOATING_POINT }; /** * \brief Basic type traits */ template struct BaseTraits { /// Category static constexpr Category CATEGORY = _CATEGORY; enum { PRIMITIVE = _PRIMITIVE, NULL_TYPE = _NULL_TYPE, }; }; /** * Basic type traits (unsigned primitive specialization) */ template struct BaseTraits { typedef _UnsignedBits UnsignedBits; static constexpr Category CATEGORY = UNSIGNED_INTEGER; static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0); static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1); enum { PRIMITIVE = true, NULL_TYPE = false, }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key) { return key; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleOut(UnsignedBits key) { return key; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { UnsignedBits retval_bits = MAX_KEY; T retval; memcpy(&retval, &retval_bits, sizeof(T)); return retval; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { UnsignedBits retval_bits = LOWEST_KEY; T retval; memcpy(&retval, &retval_bits, sizeof(T)); return retval; } }; /** * Basic type traits (signed primitive specialization) */ template struct BaseTraits { typedef _UnsignedBits UnsignedBits; static constexpr Category CATEGORY = SIGNED_INTEGER; static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT; static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; enum { PRIMITIVE = true, NULL_TYPE = false, }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key) { return key ^ HIGH_BIT; }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleOut(UnsignedBits key) { return key ^ HIGH_BIT; }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T 
Max() { UnsignedBits retval = MAX_KEY; return reinterpret_cast(retval); } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { UnsignedBits retval = LOWEST_KEY; return reinterpret_cast(retval); } }; template struct FpLimits; template <> struct FpLimits { static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE float Max() { return ::cuda::std::numeric_limits::max(); } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE float Lowest() { return ::cuda::std::numeric_limits::lowest(); } }; template <> struct FpLimits { static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE double Max() { return ::cuda::std::numeric_limits::max(); } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE double Lowest() { return ::cuda::std::numeric_limits::lowest(); } }; # if defined(_CCCL_HAS_NVFP16) template <> struct FpLimits<__half> { static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __half Max() { unsigned short max_word = 0x7BFF; return reinterpret_cast<__half&>(max_word); } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __half Lowest() { unsigned short lowest_word = 0xFBFF; return reinterpret_cast<__half&>(lowest_word); } }; # endif // _CCCL_HAS_NVFP16 # if defined(_CCCL_HAS_NVBF16) template <> struct FpLimits<__nv_bfloat16> { static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_bfloat16 Max() { unsigned short max_word = 0x7F7F; return reinterpret_cast<__nv_bfloat16&>(max_word); } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_bfloat16 Lowest() { unsigned short lowest_word = 0xFF7F; return reinterpret_cast<__nv_bfloat16&>(lowest_word); } }; # endif // _CCCL_HAS_NVBF16 # if defined(__CUDA_FP8_TYPES_EXIST__) template <> struct FpLimits<__nv_fp8_e4m3> { static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e4m3 Max() { unsigned char max_word = 0x7EU; __nv_fp8_e4m3 ret_val; memcpy(&ret_val, &max_word, sizeof(__nv_fp8_e4m3)); return ret_val; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e4m3 Lowest() { unsigned char lowest_word = 0xFEU; __nv_fp8_e4m3 ret_val; memcpy(&ret_val, &lowest_word, sizeof(__nv_fp8_e4m3)); return ret_val; } }; template <> struct FpLimits<__nv_fp8_e5m2> { static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e5m2 Max() { unsigned char max_word = 0x7BU; __nv_fp8_e5m2 ret_val; memcpy(&ret_val, &max_word, sizeof(__nv_fp8_e5m2)); return ret_val; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE __nv_fp8_e5m2 Lowest() { unsigned char lowest_word = 0xFBU; __nv_fp8_e5m2 ret_val; memcpy(&ret_val, &lowest_word, sizeof(__nv_fp8_e5m2)); return ret_val; } }; # endif // __CUDA_FP8_TYPES_EXIST__ /** * Basic type traits (fp primitive specialization) */ template struct BaseTraits { typedef _UnsignedBits UnsignedBits; static constexpr Category CATEGORY = FLOATING_POINT; static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(-1); static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; enum { PRIMITIVE = true, NULL_TYPE = false, }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key) { UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; return key ^ mask; }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleOut(UnsignedBits key) { UnsignedBits mask = (key & HIGH_BIT) ? 
HIGH_BIT : UnsignedBits(-1); return key ^ mask; }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { return FpLimits::Max(); } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { return FpLimits::Lowest(); } }; /** * \brief Numeric type traits */ // clang-format off template struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits<(::cuda::std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; #if CUB_IS_INT128_ENABLED template <> struct NumericTraits<__uint128_t> { using T = __uint128_t; using UnsignedBits = __uint128_t; static constexpr Category CATEGORY = UNSIGNED_INTEGER; static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0); static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1); static constexpr bool PRIMITIVE = false; static constexpr bool NULL_TYPE = false; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key) { return key; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleOut(UnsignedBits key) { return key; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { return MAX_KEY; } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { return LOWEST_KEY; } }; template <> struct NumericTraits<__int128_t> { using T = __int128_t; using UnsignedBits = __uint128_t; static constexpr Category CATEGORY = SIGNED_INTEGER; static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT; static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; static constexpr bool PRIMITIVE = false; static constexpr bool NULL_TYPE = false; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleIn(UnsignedBits key) { return key ^ HIGH_BIT; }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE UnsignedBits TwiddleOut(UnsignedBits key) { return key ^ HIGH_BIT; }; static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Max() { UnsignedBits retval = MAX_KEY; return reinterpret_cast(retval); } static _CCCL_HOST_DEVICE _CCCL_FORCEINLINE T Lowest() { UnsignedBits retval = LOWEST_KEY; return reinterpret_cast(retval); } }; #endif template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; # if defined(_CCCL_HAS_NVFP16) template <> struct NumericTraits<__half> : BaseTraits {}; # endif // _CCCL_HAS_NVFP16 # if defined(_CCCL_HAS_NVBF16) template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; # endif // _CCCL_HAS_NVBF16 #if defined(__CUDA_FP8_TYPES_EXIST__) template <> struct NumericTraits<__nv_fp8_e4m3> : BaseTraits {}; template <> struct NumericTraits<__nv_fp8_e5m2> : BaseTraits {}; #endif // __CUDA_FP8_TYPES_EXIST__ template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; // clang-format on /** * \brief Type traits */ template struct Traits : NumericTraits::type> {}; #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END 
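// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original archive): most utilities in the
// header above are consumed internally, but cub::DoubleBuffer is part of the
// public device-level API. The snippet below is a minimal, hedged usage sketch
// assuming the standard cub::DeviceRadixSort::SortKeys overload that accepts a
// DoubleBuffer; error checking and buffer allocation details are omitted.
// ---------------------------------------------------------------------------
#include <cub/cub.cuh>

#include <cuda_runtime.h>

void sort_keys_sketch(int* d_key_buf, int* d_key_alt_buf, int num_items)
{
  // Wrap the two ping-pong buffers; selector 0 (d_key_buf) is the valid input
  cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);

  // First call with a null workspace only queries the temporary storage size
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);

  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Second call performs the sort; afterwards d_keys.Current() is the sorted buffer
  cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);

  cudaFree(d_temp_storage);
}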
cccl-2.5.0/cub/cub/util_vsmem.cuh000066400000000000000000000263471463375617100166620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023-24, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * This file contains facilities that help to prevent exceeding the available shared memory per thread block */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document namespace detail { /** * @brief Helper struct to wrap all the information needed to implement virtual shared memory that's passed to a kernel. * */ struct vsmem_t { void* gmem_ptr; }; // The maximum amount of static shared memory available per thread block // Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB static constexpr std::size_t max_smem_per_block = 48 * 1024; /** * @brief Class template that helps to prevent exceeding the available shared memory per thread block. * * @tparam AgentT The agent for which we check whether per-thread block shared memory is sufficient or whether virtual * shared memory is needed. */ template class vsmem_helper_impl { private: // Per-block virtual shared memory may be padded to make sure vsmem is an integer multiple of `line_size` static constexpr std::size_t line_size = 128; // The amount of shared memory or virtual shared memory required by the algorithm's agent static constexpr std::size_t required_smem = sizeof(typename AgentT::TempStorage); // Whether we need to allocate global memory-backed virtual shared memory static constexpr bool needs_vsmem = required_smem > max_smem_per_block; // Padding bytes to an integer multiple of `line_size`. 
Only applies to virtual shared memory static constexpr std::size_t padding_bytes = (required_smem % line_size == 0) ? 0 : (line_size - (required_smem % line_size)); public: // Type alias to be used for static temporary storage declaration within the algorithm's kernel using static_temp_storage_t = cub::detail::conditional_t; // The amount of global memory-backed virtual shared memory needed, padded to an integer multiple of 128 bytes static constexpr std::size_t vsmem_per_block = needs_vsmem ? (required_smem + padding_bytes) : 0; /** * @brief Used from within the device algorithm's kernel to get the temporary storage that can be * passed to the agent, specialized for the case when we can use native shared memory as temporary * storage. */ static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&) { return static_temp_storage; } /** * @brief Used from within the device algorithm's kernel to get the temporary storage that can be * passed to the agent, specialized for the case when we can use native shared memory as temporary * storage and taking a linear block id. */ static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& get_temp_storage(typename AgentT::TempStorage& static_temp_storage, vsmem_t&, std::size_t) { return static_temp_storage; } /** * @brief Used from within the device algorithm's kernel to get the temporary storage that can be * passed to the agent, specialized for the case when we have to use global memory-backed * virtual shared memory as temporary storage. */ static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem) { return *reinterpret_cast( static_cast(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x)); } /** * @brief Used from within the device algorithm's kernel to get the temporary storage that can be * passed to the agent, specialized for the case when we have to use global memory-backed * virtual shared memory as temporary storage and taking a linear block id. */ static _CCCL_DEVICE _CCCL_FORCEINLINE typename AgentT::TempStorage& get_temp_storage(cub::NullType& static_temp_storage, vsmem_t& vsmem, std::size_t linear_block_id) { return *reinterpret_cast( static_cast(vsmem.gmem_ptr) + (vsmem_per_block * linear_block_id)); } /** * @brief Hints to discard modified cache lines of the used virtual shared memory. * modified cache lines. * * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is * supposed to be reused after this function call. */ template ::type = 0> static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) { return false; } /** * @brief Hints to discard modified cache lines of the used virtual shared memory. * modified cache lines. * * @note Needs to be followed by `__syncthreads()` if the function returns true and the virtual shared memory is * supposed to be reused after this function call. 
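 *
 * Illustrative call-site sketch only (`vsmem_helper_t` stands for whichever
 * instantiation of this helper the enclosing kernel uses):
 * @code
 * const bool needs_barrier = vsmem_helper_t::discard_temp_storage(temp_storage);
 * if (needs_barrier)
 * {
 *   __syncthreads(); // required only if temp_storage is reused afterwards
 * }
 * @endcode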
*/ template ::type = 0> static _CCCL_DEVICE _CCCL_FORCEINLINE bool discard_temp_storage(typename AgentT::TempStorage& temp_storage) { // Ensure all threads finished using temporary storage CTA_SYNC(); const std::size_t linear_tid = threadIdx.x; const std::size_t block_stride = line_size * blockDim.x; char* ptr = reinterpret_cast(&temp_storage); auto ptr_end = ptr + vsmem_per_block; // 128 byte-aligned virtual shared memory discard for (auto thread_ptr = ptr + (linear_tid * line_size); thread_ptr < ptr_end; thread_ptr += block_stride) { cuda::discard_memory(thread_ptr, line_size); } return true; } }; template constexpr bool use_fallback_agent() { return (sizeof(typename DefaultAgentT::TempStorage) > max_smem_per_block) && (sizeof(typename FallbackAgentT::TempStorage) <= max_smem_per_block); } /** * @brief Class template that helps to prevent exceeding the available shared memory per thread block with two measures: * (1) If an agent's `TempStorage` declaration exceeds the maximum amount of shared memory per thread block, we check * whether using a fallback policy, e.g., with a smaller tile size, would fit into shared memory. * (2) If the fallback still doesn't fit into shared memory, we make use of virtual shared memory that is backed by * global memory. * * @tparam DefaultAgentPolicyT The default tuning policy that is used if the default agent's shared memory requirements * fall within the bounds of `max_smem_per_block` or when virtual shared memory is needed * @tparam DefaultAgentT The default agent, instantiated with the given default tuning policy * @tparam FallbackAgentPolicyT A fallback tuning policy that may exhibit lower shared memory requirements, e.g., by * using a smaller tile size, than the default. This fallback policy is used if and only if the shared memory * requirements of the default agent exceed `max_smem_per_block`, yet the shared memory requirements of the fallback * agent falls within the bounds of `max_smem_per_block`. * @tparam FallbackAgentT The fallback agent, instantiated with the given fallback tuning policy */ template ()> struct vsmem_helper_with_fallback_impl : public vsmem_helper_impl { using agent_t = DefaultAgentT; using agent_policy_t = DefaultAgentPolicyT; }; template struct vsmem_helper_with_fallback_impl : public vsmem_helper_impl { using agent_t = FallbackAgentT; using agent_policy_t = FallbackAgentPolicyT; }; /** * @brief Alias template for the `vsmem_helper_with_fallback_impl` that instantiates the given AgentT template with the * respective policy as first template parameter, followed by the parameters captured by the `AgentParamsT` template * parameter pack. */ template class AgentT, typename... AgentParamsT> using vsmem_helper_fallback_policy_t = vsmem_helper_with_fallback_impl, FallbackPolicyT, AgentT>; /** * @brief Alias template for the `vsmem_helper_t` by using a simple fallback policy that uses `DefaultPolicyT` as basis, * overwriting `64` threads per block and `1` item per thread. */ template class AgentT, typename... AgentParamsT> using vsmem_helper_default_fallback_policy_t = vsmem_helper_fallback_policy_t, AgentT, AgentParamsT...>; } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/version.cuh000066400000000000000000000073561463375617100161620ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /*! \file version.cuh * \brief Compile-time macros encoding CUB release version * * is the only CUB header that is guaranteed to * change with every CUB release. * */ #pragma once // For _CCCL_IMPLICIT_SYSTEM_HEADER #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include /*! \def CUB_VERSION * \brief The preprocessor macro \p CUB_VERSION encodes the version * number of the CUB library as MMMmmmpp. * * \note CUB_VERSION is formatted as `MMMmmmpp`, which differs from `CCCL_VERSION` that uses `MMMmmmppp`. * * CUB_VERSION % 100 is the sub-minor version. * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ #define CUB_VERSION 200500 // macro expansion with ## requires this to be a single value /*! \def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the * major version number of the CUB library. */ #define CUB_MAJOR_VERSION (CUB_VERSION / 100000) /*! \def CUB_MINOR_VERSION * \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the * minor version number of the CUB library. */ #define CUB_MINOR_VERSION (CUB_VERSION / 100 % 1000) /*! \def CUB_SUBMINOR_VERSION * \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the * sub-minor version number of the CUB library. */ #define CUB_SUBMINOR_VERSION (CUB_VERSION % 100) /*! \def CUB_PATCH_NUMBER * \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the * patch number of the CUB library. 
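 *
 * As a worked example for this release, where \p CUB_VERSION is 200500:
 * 200500 / 100000 == 2 (major), 200500 / 100 % 1000 == 5 (minor), and
 * 200500 % 100 == 0 (sub-minor); \p CUB_PATCH_NUMBER is likewise 0.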
*/ #define CUB_PATCH_NUMBER 0 static_assert(CUB_MAJOR_VERSION == CCCL_MAJOR_VERSION, ""); static_assert(CUB_MINOR_VERSION == CCCL_MINOR_VERSION, ""); static_assert(CUB_SUBMINOR_VERSION == CCCL_PATCH_VERSION, ""); cccl-2.5.0/cub/cub/warp/000077500000000000000000000000001463375617100147325ustar00rootroot00000000000000cccl-2.5.0/cub/cub/warp/specializations/000077500000000000000000000000001463375617100201335ustar00rootroot00000000000000cccl-2.5.0/cub/cub/warp/specializations/warp_exchange_shfl.cuh000066400000000000000000000346211463375617100244710ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN namespace detail { template class WarpExchangeShfl { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); static_assert(ITEMS_PER_THREAD == LOGICAL_WARP_THREADS, "WARP_EXCHANGE_SHUFFLE currently only works when ITEMS_PER_THREAD == " "LOGICAL_WARP_THREADS"); static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); // concrete recursion class template class CompileTimeArray : protected CompileTimeArray { protected: InputT val; template _CCCL_DEVICE void Foreach(const bool xor_bit_set, const unsigned mask) { // The implementation here is a recursive divide-and-conquer approach // that takes inspiration from: // https://forums.developer.nvidia.com/t/transposing-register-held-matrices-with-warp-shuffles-need-help/38652/2 // // At its core, the problem can be boiled down to transposing the matrix // // A B // C D // // by swapping the off-diagonal elements/sub-matrices B and C recursively. 
// // This implementation requires power-of-two matrices. In order to avoid // the use of local or shared memory, all index computation has to occur // at compile-time, since registers cannot be indexed dynamically. // Furthermore, using recursive templates reduces the mental load on the // optimizer, since lowering for-loops into registers oftentimes requires // finagling them with #pragma unroll, which leads to brittle code. // // To illustrate this algorithm, let's pretend we have warpSize = 8, // where t0, ..., t7 denote the 8 threads, and thread i has an array of // size 8 with data = [Ai, Bi, ..., Hi] (the columns in the schematics). // // In the first round, we exchange the largest 4x4 off-diagonal // submatrix. Boxes illustrate the submatrices to be exchanged. // // ROUND 1 // ======= // t0 t1 t2 t3 t4 t5 t6 t7 // ┌──────────────┠// A0 A1 A2 A3 │A4 A5 A6 A7│ NUM_ENTRIES == 4 tells us how many // │ │ entries we have in a submatrix, // │ │ in this case 4 and the size of // B0 B1 B2 B3 │B4 B5 B6 B7│ the jumps between submatrices. // │ │ // │ │ 1. t[0,1,2,3] data[4] swap with t[4,5,6,7]'s data[0] // C0 C1 C2 C3 │C4 C5 C6 C7│ 2. t[0,1,2,3] data[5] swap with t[4,5,6,7]'s data[1] // │ │ 3. t[0,1,2,3] data[6] swap with t[4,5,6,7]'s data[2] // │ │ 4. t[0,1,2,3] data[7] swap with t[4,5,6,7]'s data[3] // D0 D1 D2 D3 │D4 D5 D6 D7│ // └──────────────┘ // ┌──────────────┠// │E0 E1 E2 E3│ E4 E5 E6 E7 // │ │ // │ │ // │F0 F1 F2 F3│ F4 F5 F6 F7 // │ │ // │ │ // │G0 G1 G2 G3│ G4 G5 G6 G7 // │ │ // │ │ // │H0 H1 H2 H3│ H4 H5 H6 H7 // └──────────────┘ // // ROUND 2 // ======= // t0 t1 t2 t3 t4 t5 t6 t7 // ┌──────┠┌──────┠// A0 A1 │A2 A3│ E0 E1 │E2 E3│ NUM_ENTRIES == 2 so we have 2 // │ │ │ │ submatrices per thread and there // │ │ │ │ are 2 elements between these // B0 B1 │B2 B3│ F0 F1 │F2 F3│ submatrices. // └──────┘ └──────┘ // ┌──────┠┌──────┠1. t[0,1,4,5] data[2] swap with t[2,3,6,7]'s data[0] // │C0 C1│ C2 C3 │G0 G1│ G2 G3 2. t[0,1,4,5] data[3] swap with t[2,3,6,7]'s data[1] // │ │ │ │ 3. t[0,1,4,5] data[6] swap with t[2,3,6,7]'s data[4] // │ │ │ │ 4. t[0,1,4,5] data[7] swap with t[2,3,6,7]'s data[5] // │D0 D1│ D2 D3 │H0 H1│ H2 H3 // └──────┘ └──────┘ // ┌──────┠┌──────┠// A4 A5 │A6 A7│ E4 E5 │E6 E7│ // │ │ │ │ // │ │ │ │ // B4 B5 │B6 B7│ F4 F5 │F6 F7│ // └──────┘ └──────┘ // ┌──────┠┌──────┠// │C4 C5│ C6 C7 │G4 G5│ G6 G7 // │ │ │ │ // │ │ │ │ // │D4 D5│ D6 D7 │H4 H5│ H6 H7 // └──────┘ └──────┘ // // ROUND 3 // ======= // t0 t1 t2 t3 t4 t5 t6 t7 // ┌──┠┌──┠┌──┠┌──┠// A0 │A1│ C0 │C1│ E0 │E1│ G0 │G1│ NUM_ENTRIES == 1 so we have 4 // └──┘ └──┘ └──┘ └──┘ submatrices per thread and there // ┌──┠┌──┠┌──┠┌──┠is 1 element between these // │B0│ B1 │D0│ D1 │F0│ F1 │H0│ H1 submatrices. // └──┘ └──┘ └──┘ └──┘ // ┌──┠┌──┠┌──┠┌──┠1. t[0,2,4,6] data[1] swap with t[1,3,5,7]'s data[0] // A2 │A3│ C2 │C3│ E2 │E3│ G2 │G3│ 2. t[0,2,4,6] data[3] swap with t[1,3,5,7]'s data[2] // └──┘ └──┘ └──┘ └──┘ 3. t[0,2,4,6] data[5] swap with t[1,3,5,7]'s data[4] // ┌──┠┌──┠┌──┠┌──┠4. 
t[0,2,4,6] data[7] swap with t[1,3,5,7]'s data[6] // │B2│ B3 │D2│ D3 │F2│ F3 │H2│ H3 // └──┘ └──┘ └──┘ └──┘ // ┌──┠┌──┠┌──┠┌──┠// A4 │A5│ C4 │C5│ E4 │E5│ G4 │G5│ // └──┘ └──┘ └──┘ └──┘ // ┌──┠┌──┠┌──┠┌──┠// │B4│ B5 │D4│ D5 │F4│ F5 │H4│ H5 // └──┘ └──┘ └──┘ └──┘ // ┌──┠┌──┠┌──┠┌──┠// A6 │A7│ C6 │C7│ E6 │E7│ G6 │G7│ // └──┘ └──┘ └──┘ └──┘ // ┌──┠┌──┠┌──┠┌──┠// │B6│ B7 │D6│ D7 │F6│ F7 │H6│ H7 // └──┘ └──┘ └──┘ └──┘ // // RESULT // ====== // t0 t1 t2 t3 t4 t5 t6 t7 // // A0 B0 C0 D0 E0 F0 G0 H0 // // // A1 B1 C1 D1 E1 F1 G1 H1 // // // A2 B2 C2 D2 E2 F2 G2 H2 // // // A3 B3 C3 D3 E3 F3 G3 H3 // // // A4 B4 C4 D4 E4 F4 G4 H4 // // // A5 B5 C5 D5 E5 F5 G5 H5 // // // A6 B6 C6 D6 E6 F6 G6 H6 // // // A7 B7 C7 D7 E7 F7 G7 H7 // // NOTE: Do *NOT* try to refactor this code to use a reference, since nvcc // tends to choke on it and then drop everything into local memory. const InputT send_val = (xor_bit_set ? CompileTimeArray::val : CompileTimeArray::val); const InputT recv_val = __shfl_xor_sync(mask, send_val, NUM_ENTRIES, LOGICAL_WARP_THREADS); (xor_bit_set ? CompileTimeArray::val : CompileTimeArray::val) = recv_val; constexpr int next_idx = IDX + 1 + ((IDX + 1) % NUM_ENTRIES == 0) * NUM_ENTRIES; CompileTimeArray::template Foreach(xor_bit_set, mask); } // terminate recursion _CCCL_DEVICE void TransposeImpl(unsigned int, unsigned int, Int2Type<0>) {} template _CCCL_DEVICE void TransposeImpl(const unsigned int lane_id, const unsigned int mask, Int2Type) { const bool xor_bit_set = lane_id & NUM_ENTRIES; Foreach(xor_bit_set, mask); TransposeImpl(lane_id, mask, Int2Type()); } public: _CCCL_DEVICE CompileTimeArray(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) : CompileTimeArray{input_items, output_items} , val{input_items[IDX]} {} _CCCL_DEVICE ~CompileTimeArray() { this->output_items[IDX] = val; } _CCCL_DEVICE void Transpose(const unsigned int lane_id, const unsigned int mask) { TransposeImpl(lane_id, mask, Int2Type()); } }; // terminating partial specialization template class CompileTimeArray { protected: // used for dumping back the individual values after transposing InputT (&output_items)[ITEMS_PER_THREAD]; template _CCCL_DEVICE void Foreach(bool, unsigned) {} public: _CCCL_DEVICE CompileTimeArray(const InputT (&)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) : output_items{output_items} {} }; const unsigned int lane_id; const unsigned int warp_id; const unsigned int member_mask; public: using TempStorage = NullType; WarpExchangeShfl() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeShfl(TempStorage&) : lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { CompileTimeArray arr{input_items, output_items}; arr.Transpose(lane_id, member_mask); } template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items); } // Trick to keep the compiler from inferring that the // condition in the static_assert is always false. 
template struct dependent_false { static constexpr bool value = false; }; template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(InputT (&)[ITEMS_PER_THREAD], OffsetT (&)[ITEMS_PER_THREAD]) { static_assert(dependent_false::value, "Shuffle specialization of warp exchange does not support\n" "ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD],\n" " OffsetT (&ranks)[ITEMS_PER_THREAD])"); } template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(const InputT (&)[ITEMS_PER_THREAD], OutputT (&)[ITEMS_PER_THREAD], OffsetT (&)[ITEMS_PER_THREAD]) { static_assert(dependent_false::value, "Shuffle specialization of warp exchange does not support\n" "ScatterToStriped(const InputT (&input_items)[ITEMS_PER_THREAD],\n" " OutputT (&output_items)[ITEMS_PER_THREAD],\n" " OffsetT (&ranks)[ITEMS_PER_THREAD])"); } }; } // namespace detail CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/specializations/warp_exchange_smem.cuh000066400000000000000000000140201463375617100244650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::WarpExchangeSmem class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. 
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include CUB_NAMESPACE_BEGIN namespace detail { template class WarpExchangeSmem { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); static constexpr int ITEMS_PER_TILE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS + 1; static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); static constexpr int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0); // Insert padding if the number of items per thread is a power of two // and > 4 (otherwise we can typically use 128b loads) static constexpr bool INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE); static constexpr int PADDING_ITEMS = INSERT_PADDING ? (ITEMS_PER_TILE >> LOG_SMEM_BANKS) : 0; union _TempStorage { InputT items_shared[ITEMS_PER_TILE + PADDING_ITEMS]; }; // union TempStorage /// Shared storage reference _TempStorage& temp_storage; const unsigned int lane_id; const unsigned int warp_id; const unsigned int member_mask; public: struct TempStorage : Uninitialized<_TempStorage> {}; WarpExchangeSmem() = delete; explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchangeSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = ITEMS_PER_THREAD * lane_id + item; temp_storage.items_shared[idx] = input_items[item]; } WARP_SYNC(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = LOGICAL_WARP_THREADS * item + lane_id; output_items[item] = temp_storage.items_shared[idx]; } } template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = LOGICAL_WARP_THREADS * item + lane_id; temp_storage.items_shared[idx] = input_items[item]; } WARP_SYNC(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = ITEMS_PER_THREAD * lane_id + item; output_items[item] = temp_storage.items_shared[idx]; } } template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (INSERT_PADDING) { ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); } temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; } WARP_SYNC(member_mask); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } output_items[ITEM] = temp_storage.items_shared[item_offset]; } } }; } // namespace detail CUB_NAMESPACE_END 
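// ----------------------------------------------------------------------------
// Illustrative usage sketch (not part of the headers above): the smem and shfl
// specializations in this directory are normally reached through the public
// cub::WarpExchange facade rather than instantiated directly. The kernel below
// shows a blocked-to-striped rearrangement for a single 32-thread warp; the
// kernel name (ExampleBlockedToStripedKernel) and the flat layout of d_data are
// placeholders chosen for this sketch.
// ----------------------------------------------------------------------------
#include <cub/warp/warp_exchange.cuh>

__global__ void ExampleBlockedToStripedKernel(int* d_data)
{
  constexpr int warp_threads     = 32;
  constexpr int items_per_thread = 4;
  using WarpExchangeT = cub::WarpExchange<int, items_per_thread, warp_threads>;

  // One TempStorage instance per logical warp; a single warp is assumed here.
  __shared__ typename WarpExchangeT::TempStorage temp_storage[1];

  // Load a "blocked" arrangement: each thread owns a contiguous run of items.
  int thread_data[items_per_thread];
  for (int i = 0; i < items_per_thread; ++i)
  {
    thread_data[i] = d_data[threadIdx.x * items_per_thread + i];
  }

  // Rearrange to a "striped" arrangement across the warp (in place).
  WarpExchangeT(temp_storage[0]).BlockedToStriped(thread_data, thread_data);

  // Store the striped arrangement back out.
  for (int i = 0; i < items_per_thread; ++i)
  {
    d_data[i * warp_threads + threadIdx.x] = thread_data[i];
  }
}

// Selecting cub::WARP_EXCHANGE_SHUFFLE as the algorithm template argument would
// route to the register-based WarpExchangeShfl specialization above instead,
// subject to its own constraints on ITEMS_PER_THREAD.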
cccl-2.5.0/cub/cub/warp/specializations/warp_reduce_shfl.cuh000066400000000000000000000531421463375617100241550ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread * warp. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include #include CUB_NAMESPACE_BEGIN namespace detail { template struct reduce_add_exists : ::cuda::std::false_type {}; template struct reduce_add_exists : ::cuda::std::true_type {}; template struct reduce_min_exists : ::cuda::std::false_type {}; template struct reduce_min_exists : ::cuda::std::true_type {}; template struct reduce_max_exists : ::cuda::std::false_type {}; template struct reduce_max_exists : ::cuda::std::true_type {}; } // namespace detail /** * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned * across a CUDA thread warp. 
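 *
 * @par
 * This specialization is an internal detail: cub::WarpReduce dispatches to it
 * when LOGICAL_WARP_THREADS is a power of two, and it performs
 * Log2(LOGICAL_WARP_THREADS) shuffle steps with no shared memory (its
 * TempStorage is NullType). A minimal sketch of the public facade it backs,
 * assuming one full 32-thread warp and a hypothetical kernel name:
 * @code
 * __global__ void ExampleWarpSumKernel(int* d_out)
 * {
 *   using WarpReduce = cub::WarpReduce<int>;
 *   __shared__ typename WarpReduce::TempStorage temp_storage;
 *   int thread_data = threadIdx.x;
 *   int warp_sum = WarpReduce(temp_storage).Sum(thread_data); // valid in lane 0 only
 *   if (threadIdx.x == 0)
 *   {
 *     *d_out = warp_sum;
 *   }
 * }
 * @endcode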
* * @tparam T * Data type being reduced * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp (must be a power-of-two) * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct WarpReduceShfl { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); //--------------------------------------------------------------------- // Constants and type definitions //--------------------------------------------------------------------- enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp reduction steps STEPS = Log2::VALUE, /// Number of logical warps in a PTX warp LOGICAL_WARPS = CUB_WARP_THREADS(0) / LOGICAL_WARP_THREADS, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; template struct IsInteger { enum { /// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per /// exchange IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) }; }; /// Shared memory storage layout type typedef NullType TempStorage; //--------------------------------------------------------------------- // Thread fields //--------------------------------------------------------------------- /// Lane index in logical warp int lane_id; /// Logical warp index in 32-thread physical warp int warp_id; /// 32-thread physical warp member mask of logical warp ::cuda::std::uint32_t member_mask; //--------------------------------------------------------------------- // Construction //--------------------------------------------------------------------- /// Constructor _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceShfl(TempStorage& /*temp_storage*/) : lane_id(static_cast(LaneId())) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { lane_id = lane_id % LOGICAL_WARP_THREADS; } } //--------------------------------------------------------------------- // Reduction steps //--------------------------------------------------------------------- /** * @brief Reduction (specialized for summation across uint32 types) * * @param[in] input * Calling thread's input item. * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int ReduceStep(unsigned int input, cub::Sum /*reduction_op*/, int last_lane, int offset) { unsigned int output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .u32 r0;" " .reg .pred p;" " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" " @p add.u32 r0, r0, %4;" " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); return output; } /** * @brief Reduction (specialized for summation across fp32 types) * * @param[in] input * Calling thread's input item. 
* * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE float ReduceStep(float input, cub::Sum /*reduction_op*/, int last_lane, int offset) { float output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .f32 r0;" " .reg .pred p;" " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" " @p add.f32 r0, r0, %4;" " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); return output; } /** * @brief Reduction (specialized for summation across unsigned long long types) * * @param[in] input * Calling thread's input item * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long ReduceStep(unsigned long long input, cub::Sum /*reduction_op*/, int last_lane, int offset) { unsigned long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" " mov.b64 %0, {lo, hi};" " @p add.u64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); return output; } /** * @brief Reduction (specialized for summation across long long types) * * @param[in] input * Calling thread's input item * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE long long ReduceStep(long long input, cub::Sum /*reduction_op*/, int last_lane, int offset) { long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" " mov.b64 %0, {lo, hi};" " @p add.s64 %0, %0, %1;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); return output; } /** * @brief Reduction (specialized for summation across double types) * * @param[in] input * Calling thread's input item. 
* * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE double ReduceStep(double input, cub::Sum /*reduction_op*/, int last_lane, int offset) { double output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " .reg .f64 r0;" " mov.b64 %0, %1;" " mov.b64 {lo, hi}, %1;" " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" " mov.b64 r0, {lo, hi};" " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); return output; } /** * @brief Reduction (specialized for swizzled ReduceByKeyOp across * KeyValuePair types) * * @param[in] input * Calling thread's input item * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ template _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePair ReduceStep( KeyValuePair input, SwizzleScanOp> /*reduction_op*/, int last_lane, int offset) { KeyValuePair output; KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); output.key = input.key; output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); if (input.key != other_key) { output.value = input.value; } return output; } /** * @brief Reduction (specialized for swizzled ReduceBySegmentOp across * KeyValuePair types) * * @param[in] input * Calling thread's input item. * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ template _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePair ReduceStep( KeyValuePair input, SwizzleScanOp> /*reduction_op*/, int last_lane, int offset) { KeyValuePair output; output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); if (input.key > 0) { output.value = input.value; } return output; } /** * @brief Reduction step (generic) * * @param[in] input * Calling thread's input item * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from */ template _CCCL_DEVICE _CCCL_FORCEINLINE _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset) { _T output = input; _T temp = ShuffleDown(output, offset, last_lane, member_mask); // Perform reduction op if valid if (offset + lane_id <= last_lane) { output = reduction_op(input, temp); } return output; } /** * @brief Reduction step (specialized for small unsigned integers size 32b or less) * * @param[in] input * Calling thread's input item. 
* * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from * * @param[in] is_small_unsigned * Marker type indicating whether T is a small unsigned integer */ template _CCCL_DEVICE _CCCL_FORCEINLINE _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type /*is_small_unsigned*/) { return ReduceStep(input, reduction_op, last_lane, offset); } /** * @brief Reduction step (specialized for types other than small unsigned integers size * 32b or less) * * @param[in] input * Calling thread's input item. * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment * * @param[in] offset * Up-offset to pull from * * @param[in] is_small_unsigned * Marker type indicating whether T is a small unsigned integer */ template _CCCL_DEVICE _CCCL_FORCEINLINE _T ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset, Int2Type /*is_small_unsigned*/) { return ReduceStep(input, reduction_op, last_lane, offset); } //--------------------------------------------------------------------- // Templated reduction iteration //--------------------------------------------------------------------- /** * @param[in] input * Calling thread's input item. * * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ReduceStep(T& input, ReductionOp reduction_op, int last_lane, Int2Type /*step*/) { input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); ReduceStep(input, reduction_op, last_lane, Int2Type()); } /** * @param[in] input * Calling thread's input item. 
* * @param[in] reduction_op * Binary reduction operator * * @param[in] last_lane * Index of last lane in segment */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ReduceStep(T& /*input*/, ReductionOp /*reduction_op*/, int /*last_lane*/, Int2Type /*step*/) {} //--------------------------------------------------------------------- // Reduction operations //--------------------------------------------------------------------- /** * @param[in] input * Calling thread's input * * @param[in] valid_items * Total number of valid items across the logical warp * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ReduceImpl(Int2Type<0> /* all_lanes_valid */, T input, int valid_items, ReductionOp reduction_op) { int last_lane = valid_items - 1; T output = input; // Template-iterate reduction steps ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); return output; } /** * @param[in] input * Calling thread's input * * @param[in] valid_items * Total number of valid items across the logical warp * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ReduceImpl(Int2Type<1> /* all_lanes_valid */, T input, int /* valid_items */, ReductionOp reduction_op) { int last_lane = LOGICAL_WARP_THREADS - 1; T output = input; // Template-iterate reduction steps ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); return output; } template _CCCL_DEVICE _CCCL_FORCEINLINE typename ::cuda::std::enable_if<(::cuda::std::is_same::value || ::cuda::std::is_same::value) && detail::reduce_add_exists<>::value, T>::type ReduceImpl(Int2Type<1> /* all_lanes_valid */, T input, int /* valid_items */, cub::Sum /* reduction_op */) { T output = input; NV_IF_TARGET(NV_PROVIDES_SM_80, (output = __reduce_add_sync(member_mask, input);), (output = ReduceImpl(Int2Type<1>{}, input, LOGICAL_WARP_THREADS, cub::Sum{});)); return output; } template _CCCL_DEVICE _CCCL_FORCEINLINE typename ::cuda::std::enable_if<(::cuda::std::is_same::value || ::cuda::std::is_same::value) && detail::reduce_min_exists<>::value, T>::type ReduceImpl(Int2Type<1> /* all_lanes_valid */, T input, int /* valid_items */, cub::Min /* reduction_op */) { T output = input; NV_IF_TARGET(NV_PROVIDES_SM_80, (output = __reduce_min_sync(member_mask, input);), (output = ReduceImpl(Int2Type<1>{}, input, LOGICAL_WARP_THREADS, cub::Min{});)); return output; } template _CCCL_DEVICE _CCCL_FORCEINLINE typename ::cuda::std::enable_if<(::cuda::std::is_same::value || ::cuda::std::is_same::value) && detail::reduce_max_exists<>::value, T>::type ReduceImpl(Int2Type<1> /* all_lanes_valid */, T input, int /* valid_items */, cub::Max /* reduction_op */) { T output = input; NV_IF_TARGET(NV_PROVIDES_SM_80, (output = __reduce_max_sync(member_mask, input);), (output = ReduceImpl(Int2Type<1>{}, input, LOGICAL_WARP_THREADS, cub::Max{});)); return output; } /** * @brief Reduction * * @tparam ALL_LANES_VALID * Whether all lanes in each warp are contributing a valid fold of items * * @param[in] input * Calling thread's input * * @param[in] valid_items * Total number of valid items across the logical warp * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, int valid_items, ReductionOp reduction_op) { return ReduceImpl(Int2Type{}, input, valid_items, reduction_op); } /** * @brief Segmented reduction * * @tparam HEAD_SEGMENTED * Whether flags indicate a segment-head or a segment-tail * * @param[in] input * Calling thread's input * * @param[in] 
flag * Whether or not the current lane is a segment head/tail * * @param[in] reduction_op * Binary reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); // Convert to tail-segmented if (HEAD_SEGMENTED) { warp_flags >>= 1; } // Mask out the bits below the current thread warp_flags &= LaneMaskGe(); // Mask of physical lanes outside the logical warp and convert to logical lanemask if (!IS_ARCH_WARP) { warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS); } // Mask in the last lane of logical warp warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); // Find the next set flag int last_lane = __clz(__brev(warp_flags)); T output = input; // // Iterate reduction steps // #pragma unroll // for (int STEP = 0; STEP < STEPS; STEP++) // { // output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, // Int2Type::IS_SMALL_UNSIGNED>()); // } // Template-iterate reduction steps ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); return output; } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/specializations/warp_reduce_smem.cuh000066400000000000000000000314541463375617100241640ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned * across a CUDA thread warp. 
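 *
 * This is the fallback path taken by cub::WarpReduce when the logical warp
 * width is not a power of two: each reduction step shares partials through a
 * shared-memory buffer sized at 1.5 warps-worth of elements and bracketed by
 * WARP_SYNC, rather than using the shuffle instructions of WarpReduceShfl.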
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned * across a CUDA thread warp. * * @tparam T * Data type being reduced * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct WarpReduceSmem { /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, /// The number of warp reduction steps STEPS = Log2::VALUE, /// The number of threads in half a warp HALF_WARP_THREADS = 1 << (STEPS - 1), /// The number of shared memory elements per warp WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, /// FlagT status (when not using ballot) UNSET = 0x0, // Is initially unset SET = 0x1, // Is initially set SEEN = 0x2, // Has seen another head flag from a successor peer }; /// Shared memory flag type typedef unsigned char SmemFlag; /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) struct _TempStorage { T reduce[WARP_SMEM_ELEMENTS]; SmemFlag flags[WARP_SMEM_ELEMENTS]; }; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /****************************************************************************** * Thread fields ******************************************************************************/ _TempStorage& temp_storage; unsigned int lane_id; unsigned int member_mask; /****************************************************************************** * Construction ******************************************************************************/ /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduceSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? 
LaneId() : LaneId() % LOGICAL_WARP_THREADS) , member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** * Utility methods ******************************************************************************/ //--------------------------------------------------------------------- // Regular reduction //--------------------------------------------------------------------- /** * @brief Reduction step * * @tparam ALL_LANES_VALID * Whether all lanes in each warp are contributing a valid fold of items * * @param[in] input * Calling thread's input * * @param[in] valid_items * Total number of valid items across the logical warp * * @param[in] reduction_op * Reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ReduceStep(T input, int valid_items, ReductionOp reduction_op, Int2Type /*step*/) { constexpr int OFFSET = 1 << STEP; // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); WARP_SYNC(member_mask); // Update input if peer_addend is in range if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) { T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); input = reduction_op(input, peer_addend); } WARP_SYNC(member_mask); return ReduceStep(input, valid_items, reduction_op, Int2Type()); } /** * @brief Reduction step (terminate) * * @tparam ALL_LANES_VALID * Whether all lanes in each warp are contributing a valid fold of items * * @param[in] input * Calling thread's input * * @param[in] valid_items * Total number of valid items across the logical warp * * @param[in] reduction_op * Reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T ReduceStep(T input, int valid_items, ReductionOp /*reduction_op*/, Int2Type /*step*/) { return input; } //--------------------------------------------------------------------- // Segmented reduction //--------------------------------------------------------------------- /** * @brief Ballot-based segmented reduce * * @tparam HEAD_SEGMENTED * Whether flags indicate a segment-head or a segment-tail * * @param[in] input * Calling thread's input * * @param[in] flag * Whether or not the current lane is a segment head/tail * * @param[in] reduction_op * Reduction operator * * @param[in] has_ballot * Marker type for whether the target arch has ballot functionality */ template _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); if (!HEAD_SEGMENTED) { warp_flags <<= 1; } // Keep bits above the current thread. 
warp_flags &= LaneMaskGt(); // Accommodate packing of multiple logical warps in a single physical warp if (!IS_ARCH_WARP) { warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; } // Find next flag int next_flag = __clz(__brev(warp_flags)); // Clip the next segment at the warp boundary if necessary if (LOGICAL_WARP_THREADS != 32) { next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); } #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { const int OFFSET = 1 << STEP; // Share input into buffer ThreadStore(&temp_storage.reduce[lane_id], input); WARP_SYNC(member_mask); // Update input if peer_addend is in range if (OFFSET + lane_id < next_flag) { T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); input = reduction_op(input, peer_addend); } WARP_SYNC(member_mask); } return input; } /** * @brief Smem-based segmented reduce * * @tparam HEAD_SEGMENTED * Whether flags indicate a segment-head or a segment-tail * * @param[in] input * Calling thread's input * * @param[in] flag * Whether or not the current lane is a segment head/tail * * @param[in] reduction_op * Reduction operator * * @param[in] has_ballot * Marker type for whether the target arch has ballot functionality */ template _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { enum { UNSET = 0x0, // Is initially unset SET = 0x1, // Is initially set SEEN = 0x2, // Has seen another head flag from a successor peer }; // Alias flags onto shared data storage volatile SmemFlag* flag_storage = temp_storage.flags; SmemFlag flag_status = (flag) ? SET : UNSET; for (int STEP = 0; STEP < STEPS; STEP++) { const int OFFSET = 1 << STEP; // Share input through buffer ThreadStore(&temp_storage.reduce[lane_id], input); WARP_SYNC(member_mask); // Get peer from buffer T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); WARP_SYNC(member_mask); // Share flag through buffer flag_storage[lane_id] = flag_status; // Get peer flag from buffer SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; // Update input if peer was in range if (lane_id < LOGICAL_WARP_THREADS - OFFSET) { if (HEAD_SEGMENTED) { // Head-segmented if ((flag_status & SEEN) == 0) { // Has not seen a more distant head flag if (peer_flag_status & SET) { // Has now seen a head flag flag_status |= SEEN; } else { // Peer is not a head flag: grab its count input = reduction_op(input, peer_addend); } // Update seen status to include that of peer flag_status |= (peer_flag_status & SEEN); } } else { // Tail-segmented. 
Simply propagate flag status if (!flag_status) { input = reduction_op(input, peer_addend); flag_status |= peer_flag_status; } } } } return input; } /****************************************************************************** * Interface ******************************************************************************/ /** * @brief Reduction * * @tparam ALL_LANES_VALID * Whether all lanes in each warp are contributing a valid fold of items * * @param[in] input * Calling thread's input * * @param[in] valid_items * Total number of valid items across the logical warp * * @param[in] reduction_op * Reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, int valid_items, ReductionOp reduction_op) { return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); } /** * @brief Segmented reduction * * @tparam HEAD_SEGMENTED * Whether flags indicate a segment-head or a segment-tail * * @param[in] input * Calling thread's input * * @param[in] flag * Whether or not the current lane is a segment head/tail * * @param[in] reduction_op * Reduction operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { return SegmentedReduce(input, flag, reduction_op, Int2Type()); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/specializations/warp_scan_shfl.cuh000066400000000000000000000511611463375617100236310ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. 
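 *
 * This is the specialization cub::WarpScan selects for power-of-two logical
 * warp widths: the prefix scan is carried entirely in registers via
 * shfl.sync.up steps (with inline-PTX fast paths for common arithmetic
 * types), so its TempStorage is empty.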
*/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include CUB_NAMESPACE_BEGIN /** * @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. * * @tparam T * Data type being scanned * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp (must be a power-of-two) * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct WarpScanShfl { //--------------------------------------------------------------------- // Constants and type definitions //--------------------------------------------------------------------- enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up SHFL_C = (CUB_WARP_THREADS(0) - LOGICAL_WARP_THREADS) << 8 }; template struct IntegerTraits { enum { /// Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per /// exchange IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) }; }; /// Shared memory storage layout type struct TempStorage {}; //--------------------------------------------------------------------- // Thread fields //--------------------------------------------------------------------- /// Lane index in logical warp unsigned int lane_id; /// Logical warp index in 32-thread physical warp unsigned int warp_id; /// 32-thread physical warp member mask of logical warp unsigned int member_mask; //--------------------------------------------------------------------- // Construction //--------------------------------------------------------------------- /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpScanShfl(TempStorage& /*temp_storage*/) : lane_id(LaneId()) , warp_id(IS_ARCH_WARP ? 0 : (lane_id / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { if (!IS_ARCH_WARP) { lane_id = lane_id % LOGICAL_WARP_THREADS; } } //--------------------------------------------------------------------- // Inclusive scan steps //--------------------------------------------------------------------- /** * @brief Inclusive prefix scan step (specialized for summation across int32 types) * * @param[in] input * Calling thread's input item. 
* * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE int InclusiveScanStep(int input, cub::Sum /*scan_op*/, int first_lane, int offset) { int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .s32 r0;" " .reg .pred p;" " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" " @p add.s32 r0, r0, %4;" " mov.s32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); return output; } /** * @brief Inclusive prefix scan step (specialized for summation across uint32 types) * * @param[in] input * Calling thread's input item * * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int InclusiveScanStep(unsigned int input, cub::Sum /*scan_op*/, int first_lane, int offset) { unsigned int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .u32 r0;" " .reg .pred p;" " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" " @p add.u32 r0, r0, %4;" " mov.u32 %0, r0;" "}" : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); return output; } /** * @brief Inclusive prefix scan step (specialized for summation across fp32 types) * * @param[in] input * Calling thread's input item * * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE float InclusiveScanStep(float input, cub::Sum /*scan_op*/, int first_lane, int offset) { float output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .f32 r0;" " .reg .pred p;" " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" " @p add.f32 r0, r0, %4;" " mov.f32 %0, r0;" "}" : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); return output; } /** * @brief Inclusive prefix scan step (specialized for summation across unsigned long long types) * * @param[in] input * Calling thread's input item * * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE unsigned long long InclusiveScanStep(unsigned long long input, cub::Sum /*scan_op*/, int first_lane, int offset) { unsigned long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .u64 r0;" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" " mov.b64 r0, {lo, hi};" " @p add.u64 r0, r0, %4;" " mov.u64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); return output; } /** * @brief Inclusive prefix scan step (specialized for summation across long long types) * * @param[in] input * Calling thread's input item. 
* * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE long long InclusiveScanStep(long long input, cub::Sum /*scan_op*/, int first_lane, int offset) { long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .s64 r0;" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " mov.b64 {lo, hi}, %1;" " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" " mov.b64 r0, {lo, hi};" " @p add.s64 r0, r0, %4;" " mov.s64 %0, r0;" "}" : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); return output; } /** * @brief Inclusive prefix scan step (specialized for summation across fp64 types) * * @param[in] input * Calling thread's input item. * * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from */ _CCCL_DEVICE _CCCL_FORCEINLINE double InclusiveScanStep(double input, cub::Sum /*scan_op*/, int first_lane, int offset) { double output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) // Use predicate set from SHFL to guard against invalid peers asm volatile( "{" " .reg .u32 lo;" " .reg .u32 hi;" " .reg .pred p;" " .reg .f64 r0;" " mov.b64 %0, %1;" " mov.b64 {lo, hi}, %1;" " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" " mov.b64 r0, {lo, hi};" " @p add.f64 %0, %0, r0;" "}" : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); return output; } /* /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) template _CCCL_DEVICE _CCCL_FORCEINLINE KeyValuePairInclusiveScanStep( KeyValuePair input, ///< [in] Calling thread's input item. ReduceBySegmentOp scan_op, ///< [in] Binary scan operator int first_lane, ///< [in] Index of first lane in segment int offset) ///< [in] Up-offset to pull from { KeyValuePair output; output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); if (input.key > 0) output.value = input.value; return output; } */ /** * @brief Inclusive prefix scan step (generic) * * @param[in] input * Calling thread's input item. 
* * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from */ template _CCCL_DEVICE _CCCL_FORCEINLINE _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset) { _T temp = ShuffleUp(input, offset, first_lane, member_mask); // Perform scan op if from a valid peer _T output = scan_op(temp, input); if (static_cast(lane_id) < first_lane + offset) { output = input; } return output; } /** * @brief Inclusive prefix scan step (specialized for small integers size 32b or less) * * @param[in] input * Calling thread's input item * * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from * * @param[in] is_small_unsigned * Marker type indicating whether T is a small integer */ template _CCCL_DEVICE _CCCL_FORCEINLINE _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset, Int2Type /*is_small_unsigned*/) { return InclusiveScanStep(input, scan_op, first_lane, offset); } /** * @brief Inclusive prefix scan step (specialized for types other than small integers size * 32b or less) * * @param[in] input * Calling thread's input item. * * @param[in] scan_op * Binary scan operator * * @param[in] first_lane * Index of first lane in segment * * @param[in] offset * Up-offset to pull from * * @param[in] is_small_unsigned * Marker type indicating whether T is a small integer */ template _CCCL_DEVICE _CCCL_FORCEINLINE _T InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset, Int2Type /*is_small_unsigned*/) { return InclusiveScanStep(input, scan_op, first_lane, offset); } /****************************************************************************** * Interface ******************************************************************************/ //--------------------------------------------------------------------- // Broadcast //--------------------------------------------------------------------- /** * @brief Broadcast * * @param[in] input * The value to broadcast * * @param[in] src_lane * Which warp lane is to do the broadcasting */ _CCCL_DEVICE _CCCL_FORCEINLINE T Broadcast(T input, int src_lane) { return ShuffleIndex(input, src_lane, member_mask); } //--------------------------------------------------------------------- // Inclusive operations //--------------------------------------------------------------------- /** * @brief Inclusive scan * * @param[in] input * Calling thread's input item * * @param[out] inclusive_output * Calling thread's output item. May be aliased with @p input * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(_T input, _T& inclusive_output, ScanOpT scan_op) { inclusive_output = input; // Iterate scan steps int segment_first_lane = 0; // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { inclusive_output = InclusiveScanStep( inclusive_output, scan_op, segment_first_lane, (1 << STEP), Int2Type::IS_SMALL_UNSIGNED>()); } } /** * @brief Inclusive scan, specialized for reduce-value-by-key * * @param[in] input * Calling thread's input item * * @param[out] inclusive_output * Calling thread's output item. 
May be aliased with @p input * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan( KeyValuePair input, KeyValuePair& inclusive_output, ReduceByKeyOp scan_op) { inclusive_output = input; KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); // Mask away all lanes greater than ours ballot = ballot & LaneMaskLe(); // Find index of first set bit int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); // Iterate scan steps #pragma unroll for (int STEP = 0; STEP < STEPS; STEP++) { inclusive_output.value = InclusiveScanStep( inclusive_output.value, scan_op.op, segment_first_lane, (1 << STEP), Int2Type::IS_SMALL_UNSIGNED>()); } } /** * @brief Inclusive scan with aggregate * * @param[in] input * Calling thread's input item * * @param[out] inclusive_output * Calling thread's output item. May be aliased with @p input * * @param[in] scan_op * Binary scan operator * * @param[out] warp_aggregate * Warp-wide aggregate reduction of input items */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOpT scan_op, T& warp_aggregate) { InclusiveScan(input, inclusive_output, scan_op); // Grab aggregate from last warp lane warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask); } //--------------------------------------------------------------------- // Get exclusive from inclusive //--------------------------------------------------------------------- /** * @brief Update inclusive and exclusive using input and inclusive * * @param[in] input * * @param[out] inclusive * * @param[out] exclusive * * @param[in] scan_op * * @param[in] is_integer */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T /*input*/, T& inclusive, T& exclusive, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // initial value unknown exclusive = ShuffleUp(inclusive, 1, 0, member_mask); } /** * @brief Update inclusive and exclusive using input and inclusive (specialized for summation of * integer types) */ _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T input, T& inclusive, T& exclusive, cub::Sum /*scan_op*/, Int2Type /*is_integer*/) { // initial value presumed 0 exclusive = inclusive - input; } /** * @brief Update inclusive and exclusive using initial value using input, inclusive, and initial * value */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T /*input*/, T& inclusive, T& exclusive, ScanOpT scan_op, T initial_value, IsIntegerT /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); exclusive = ShuffleUp(inclusive, 1, 0, member_mask); if (lane_id == 0) { exclusive = initial_value; } } /** * @brief Update inclusive and exclusive using initial value using input and inclusive * (specialized for summation of integer types) */ _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T input, T& inclusive, T& exclusive, cub::Sum scan_op, T initial_value, Int2Type /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); exclusive = inclusive - input; } /** * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T input, T& inclusive, T& exclusive, T& warp_aggregate, ScanOpT scan_op, IsIntegerT is_integer) { warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); Update(input, inclusive, exclusive, scan_op, is_integer); } /** * @brief Update inclusive, exclusive, and warp aggregate using input, inclusive, 
and initial * value */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update( T input, T& inclusive, T& exclusive, T& warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT is_integer) { warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/specializations/warp_scan_smem.cuh000066400000000000000000000340501463375617100236340ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN /** * @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned * across a CUDA thread warp. 
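 *
 * @par
 * Like the shuffle variant, this type is an internal detail behind the public
 * cub::WarpScan facade; it is chosen when the logical warp width is not a
 * power of two. A minimal sketch of that facade, assuming one full warp and a
 * hypothetical kernel name:
 * @code
 * __global__ void ExampleWarpPrefixSumKernel(int* d_out)
 * {
 *   using WarpScan = cub::WarpScan<int>;
 *   __shared__ typename WarpScan::TempStorage temp_storage;
 *   int thread_data = 1;
 *   WarpScan(temp_storage).InclusiveSum(thread_data, thread_data);
 *   d_out[threadIdx.x] = thread_data; // lane i now holds i + 1
 * }
 * @endcode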
* * @tparam T * Data type being scanned * * @tparam LOGICAL_WARP_THREADS * Number of threads per logical warp * * @tparam LEGACY_PTX_ARCH * The PTX compute capability for which to to specialize this collective */ template struct WarpScanSmem { /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// The number of warp scan steps STEPS = Log2::VALUE, /// The number of threads in half a warp HALF_WARP_THREADS = 1 << (STEPS - 1), /// The number of shared memory elements per warp WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, }; /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) using CellT = T; /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; // Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; /****************************************************************************** * Thread fields ******************************************************************************/ _TempStorage& temp_storage; unsigned int lane_id; unsigned int member_mask; /****************************************************************************** * Construction ******************************************************************************/ /// Constructor explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpScanSmem(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) , member_mask(WarpMask(LaneId() / LOGICAL_WARP_THREADS)) {} /****************************************************************************** * Utility methods ******************************************************************************/ /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) template _CCCL_DEVICE _CCCL_FORCEINLINE void ScanStep(T& partial, ScanOp scan_op, Int2Type /*step*/) { constexpr int OFFSET = 1 << STEP; // Share partial into buffer ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); WARP_SYNC(member_mask); // Update partial if addend is in range if (HAS_IDENTITY || (lane_id >= OFFSET)) { T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } WARP_SYNC(member_mask); ScanStep(partial, scan_op, Int2Type()); } /// Basic inclusive scan iteration(template unrolled, base-case specialization) template _CCCL_DEVICE _CCCL_FORCEINLINE void ScanStep(T& /*partial*/, ScanOp /*scan_op*/, Int2Type /*step*/) {} /** * @brief Inclusive prefix scan (specialized for summation across primitive types) * * @param[in] input * Calling thread's input item * * @param[out] output * Calling thread's output item. 
May be aliased with @p input * * @param[in] scan_op * Binary scan operator * * @param[in] * Marker type indicating whether T is primitive type */ _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, Sum scan_op, Int2Type /*is_primitive*/) { T identity = 0; ThreadStore(&temp_storage[lane_id], (CellT) identity); WARP_SYNC(member_mask); // Iterate scan steps output = input; ScanStep(output, scan_op, Int2Type<0>()); } /** * @brief Inclusive prefix scan * * @param[in] input * Calling thread's input item * * @param[out] output * Calling thread's output item. May be aliased with @p input * * @param[in] scan_op * Binary scan operator * * @param[in] is_primitive * Marker type indicating whether T is primitive type */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, Int2Type /*is_primitive*/) { // Iterate scan steps output = input; ScanStep(output, scan_op, Int2Type<0>()); } /****************************************************************************** * Interface ******************************************************************************/ //--------------------------------------------------------------------- // Broadcast //--------------------------------------------------------------------- /** * @brief Broadcast * * @param[in] input * The value to broadcast * * @param[in] src_lane * Which warp lane is to do the broadcasting */ _CCCL_DEVICE _CCCL_FORCEINLINE T Broadcast(T input, unsigned int src_lane) { if (lane_id == src_lane) { ThreadStore(temp_storage, (CellT) input); } WARP_SYNC(member_mask); return (T) ThreadLoad(temp_storage); } //--------------------------------------------------------------------- // Inclusive operations //--------------------------------------------------------------------- /** * @brief Inclusive scan * * @param[in] input * Calling thread's input item. * * @param[out] inclusive_output * Calling thread's output item. May be aliased with @p input * * @param[in] scan_op * Binary scan operator */ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op) { InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); } /** * @brief Inclusive scan with aggregate * * @param[in] input * Calling thread's input item * * @param[out] inclusive_output * Calling thread's output item. May be aliased with @p input * * @param[in] scan_op * Binary scan operator * * @param[out] warp_aggregate * Warp-wide aggregate reduction of input items. 
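 *
 * The aggregate is obtained by re-reading the last logical lane's inclusive
 * result from the final element of the padded shared-memory buffer once the
 * scan has completed; no additional reduction pass is required.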
*/ template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op, T& warp_aggregate) { InclusiveScan(input, inclusive_output, scan_op); // Retrieve aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); WARP_SYNC(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); WARP_SYNC(member_mask); } //--------------------------------------------------------------------- // Get exclusive from inclusive //--------------------------------------------------------------------- /** * @brief Update inclusive and exclusive using input and inclusive * * @param[in] input * * @param[in, out] inclusive * * @param[out] exclusive * * @param[in] scan_op * * @param[in] is_integer */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T /*input*/, T& inclusive, T& exclusive, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } /** * @brief Update inclusive and exclusive using input and inclusive (specialized for summation of * integer types) */ _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T input, T& inclusive, T& exclusive, cub::Sum /*scan_op*/, Int2Type /*is_integer*/) { // initial value presumed 0 exclusive = inclusive - input; } /** * @brief Update inclusive and exclusive using initial value using input, inclusive, and initial * value */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T /*input*/, T& inclusive, T& exclusive, ScanOpT scan_op, T initial_value, IsIntegerT /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); if (lane_id == 0) { exclusive = initial_value; } } /** * @brief Update inclusive and exclusive using initial value using input and inclusive * (specialized for summation of integer types) */ _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T input, T& inclusive, T& exclusive, cub::Sum scan_op, T initial_value, Int2Type /*is_integer*/) { inclusive = scan_op(initial_value, inclusive); exclusive = inclusive - input; } /** * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T /*input*/, T& inclusive, T& exclusive, T& warp_aggregate, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); } /** * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized * for summation of integer types) */ _CCCL_DEVICE _CCCL_FORCEINLINE void Update(T input, T& inclusive, T& exclusive, T& warp_aggregate, cub::Sum /*scan_o*/, Int2Type /*is_integer*/) { // Initial value presumed to be unknown or identity (either way our padding is correct) ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); exclusive = inclusive - input; } /** * @brief Update inclusive, exclusive, and 
warp aggregate using input, inclusive, and initial * value */ template _CCCL_DEVICE _CCCL_FORCEINLINE void Update( T /*input*/, T& inclusive, T& exclusive, T& warp_aggregate, ScanOpT scan_op, T initial_value, IsIntegerT /*is_integer*/) { // Broadcast warp aggregate ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); WARP_SYNC(member_mask); warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); WARP_SYNC(member_mask); // Update inclusive with initial value inclusive = scan_op(initial_value, inclusive); // Get exclusive from exclusive ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); WARP_SYNC(member_mask); exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); if (lane_id == 0) { exclusive = initial_value; } } }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/warp_exchange.cuh000066400000000000000000000400001463375617100202400ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN enum WarpExchangeAlgorithm { WARP_EXCHANGE_SMEM, WARP_EXCHANGE_SHUFFLE, }; namespace detail { template using InternalWarpExchangeImpl = cub::detail::conditional_t, WarpExchangeShfl>; } // namespace detail /** * @brief The WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. * * @tparam T * The data type to be exchanged. 
* * @tparam ITEMS_PER_THREAD * The number of items partitioned onto each thread. * * @tparam LOGICAL_WARP_THREADS * [optional] The number of threads per "logical" warp (may be less * than the number of hardware warp threads). Default is the warp size of the * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * * @tparam LEGACY_PTX_ARCH * Unused. * * @par Overview * - It is commonplace for a warp of threads to rearrange data items between * threads. For example, the global memory accesses prefer patterns where * data items are "striped" across threads (where consecutive threads access * consecutive items), yet most warp-wide operations prefer a "blocked" * partitioning of items across threads (where consecutive items belong to a * single thread). * - WarpExchange supports the following types of data exchanges: * - Transposing between [blocked](index.html#sec5sec3) and * [striped](index.html#sec5sec3) arrangements * - Scattering ranked items to a * [striped arrangement](index.html#sec5sec3) * * @par A Simple Example * @par * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads where * each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = * cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. */ template class WarpExchange : private detail::InternalWarpExchangeImpl { using InternalWarpExchange = detail::InternalWarpExchangeImpl; public: /// \smemstorage{WarpExchange} using TempStorage = typename InternalWarpExchange::TempStorage; //! @name Collective constructors //! @{ WarpExchange() = delete; /** * @brief Collective constructor using the specified memory allocation as * temporary storage. */ explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchange(TempStorage& temp_storage) : InternalWarpExchange(temp_storage) {} //! @} end member group //! @name Data movement //! @{ /** * @brief Transposes data items from blocked arrangement to * striped arrangement. * * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) 
* { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).BlockedToStriped(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * The corresponding output @p thread_data in those threads will be * { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * * @param[in] input_items * Items to exchange, converting between blocked and * striped arrangements. * * @param[out] output_items * Items from exchange, converting between striped and * blocked arrangements. May be aliased to @p input_items. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { InternalWarpExchange::BlockedToStriped(input_items, output_items); } /** * @brief Transposes data items from striped arrangement to * blocked arrangement. * * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "striped" to a * "blocked" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { InternalWarpExchange::StripedToBlocked(input_items, output_items); } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. 
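 *
 * @par
 * Each rank names the destination of the corresponding item within the warp-wide
 * striped arrangement: an item with rank `r` is delivered to lane
 * `r % LOGICAL_WARP_THREADS` as its `r / LOGICAL_WARP_THREADS`-th output item.
 * The ranks are therefore normally a permutation of
 * `[0, LOGICAL_WARP_THREADS * ITEMS_PER_THREAD)`.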
* * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_data, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_data across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding output @p thread_data in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. * * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in,out] items Items to exchange * @param[in] ranks Corresponding scatter ranks */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { InternalWarpExchange::ScatterToStriped(items, ranks); } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. * * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_input[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * int thread_output[items_per_thread]; * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_input, thread_output, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_input across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding @p thread_output in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. 
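 *
 * @par
 * The ranks in the example above simply reverse the 64-item tile. A hypothetical
 * way to build them from the blocked arrangement shown (reusing `warp_threads`,
 * `items_per_thread`, and `thread_ranks` from that snippet) is:
 * @code
 * const int lane_id = static_cast<int>(threadIdx.x) % warp_threads;
 * for (int i = 0; i < items_per_thread; ++i)
 * {
 *     // Reverse: the j-th item of the tile is scattered to position (tile_size - 1 - j)
 *     const int global_idx = lane_id * items_per_thread + i;
 *     thread_ranks[i] = (warp_threads * items_per_thread - 1) - global_idx;
 * }
 * @endcode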
* * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. * * @param[in] ranks * Corresponding scatter ranks */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { InternalWarpExchange::ScatterToStriped(input_items, output_items, ranks); } //@} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/warp_load.cuh000066400000000000000000000575471463375617100174250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file Operations for reading linear tiles of data into the CUDA warp. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! ``cub::WarpLoadAlgorithm`` enumerates alternative algorithms for :cpp:struct:`cub::WarpLoad` to //! read a linear segment of data from memory into a CUDA warp. //! @endrst enum WarpLoadAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is read directly from memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) decreases as the //! access stride between threads increases (i.e., the number items per thread). //! @endrst WARP_LOAD_DIRECT, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is read directly from memory. //! //! Performance Considerations //! 
++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) doesn't depend on //! the number of items per thread. //! @endrst WARP_LOAD_STRIPED, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is read from memory using //! CUDA's built-in vectorized loads as a coalescing optimization. //! For example, ``ld.global.v4.s32`` instructions will be generated when ``T = int`` and //! ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high until the the //! access stride between threads (i.e., the number items per thread) exceeds the //! maximum vector load width (typically 4 items or 64B, whichever is lower). //! - The following conditions will prevent vectorization and loading will fall //! back to cub::WARP_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd //! - The ``InputIteratorT`` is not a simple pointer type //! - The block input offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) //! @endrst WARP_LOAD_VECTORIZE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is read efficiently from //! memory and then locally transposed into a //! :ref:`blocked arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high //! regardless of items loaded per thread. //! - The local reordering incurs slightly longer latencies and throughput than the direct //! ``cub::WARP_LOAD_DIRECT`` and ``cub::WARP_LOAD_VECTORIZE`` alternatives. //! @endrst WARP_LOAD_TRANSPOSE }; //! @rst //! The WarpLoad class provides :ref:`collective ` data movement methods for //! loading a linear segment of items from memory into a //! :ref:`blocked arrangement ` across a CUDA thread warp. //! //! Overview //! ++++++++++++++++ //! //! - The WarpLoad class provides a single data movement abstraction that can be //! specialized to implement different cub::WarpLoadAlgorithm strategies. This //! facilitates different performance policies for different architectures, data //! types, granularity sizes, etc. //! - WarpLoad can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::WARP_LOAD_DIRECT`: //! a :ref:`blocked arrangement ` of data is read directly from //! memory. //! #. :cpp:enumerator:`cub::WARP_LOAD_STRIPED`: //! a :ref:`striped arrangement ` of data is read directly from //! memory. //! #. :cpp:enumerator:`cub::WARP_LOAD_VECTORIZE`: //! a :ref:`blocked arrangement ` of data is read directly from //! memory using CUDA's built-in vectorized loads as a coalescing optimization. //! #. :cpp:enumerator:`cub::WARP_LOAD_TRANSPOSE`: //! a :ref:`striped arrangement ` of data is read directly from //! memory and is then locally transposed into a //! :ref:`blocked arrangement `. //! //! A Simple Example //! ++++++++++++++++ //! //! The code snippet below illustrates the loading of a linear segment of 64 //! integers into a "blocked" arrangement across 16 threads where each thread //! owns 4 consecutive items. The load is specialized for ``WARP_LOAD_TRANSPOSE``, //! meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! 
#include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. //! The set of ``thread_data`` across the first logical warp of threads in those //! threads will be: ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``. //! @endrst //! //! @tparam InputT //! The data type to read into (which must be convertible from the input //! iterator's value type). //! //! @tparam ITEMS_PER_THREAD //! The number of consecutive items partitioned onto each thread. //! //! @tparam ALGORITHM //! [optional] cub::WarpLoadAlgorithm tuning policy. //! default: cub::WARP_LOAD_DIRECT. //! //! @tparam LOGICAL_WARP_THREADS //! [optional] The number of threads per "logical" warp (may be less //! than the number of hardware warp threads). Default is the warp size of the //! targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a //! power of two. //! //! @tparam LEGACY_PTX_ARCH //! Unused. template class WarpLoad { static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); private: /***************************************************************************** * Algorithmic variants ****************************************************************************/ /// Load helper template struct LoadInternal; template struct LoadInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void 
Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(_InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using WarpExchangeT = WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; struct TempStorage : Uninitialized<_TempStorage> {}; _TempStorage& temp_storage; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } }; /***************************************************************************** * Type definitions ****************************************************************************/ /// Internal load implementation to use using InternalLoad = LoadInternal; /// Shared memory storage layout type using _TempStorage = typename InternalLoad::TempStorage; /***************************************************************************** * Utility methods ****************************************************************************/ /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /***************************************************************************** * Thread fields ****************************************************************************/ /// Thread reference to shared 
storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; public: /// @smemstorage{WarpLoad} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of //! shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad() : temp_storage(PrivateStorage()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //! @} end member group //! @name Data movement //! @{ //! @rst //! Load a linear segment of items from memory. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``, //! The set of ``thread_data`` across the first logical warp of threads in those //! threads will be: ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``. //! @endrst //! //! @param[in] block_itr The thread block's base input iterator for loading from //! @param[out] items Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items); } //! @rst //! Load a linear segment of items from memory, guarded by range. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data, //! valid_items); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...`` and ``valid_items`` is ``5``. //! 
The set of ``thread_data`` across the first logical warp of threads in those threads will be: //! ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }`` with only the first two threads being unmasked to //! load portions of valid data (and other items remaining unassigned). //! @endrst //! //! @param[in] block_itr The thread block's base input iterator for loading from //! @param[out] items Data to load //! @param[in] valid_items Number of valid items to load template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); } //! @rst //! Load a linear segment of items from memory, guarded by range. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data, //! valid_items, //! -1); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``, ``valid_items`` is ``5``, and the //! out-of-bounds default is ``-1``. The set of ``thread_data`` across the first logical warp of //! threads in those threads will be: ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }`` with //! only the first two threads being unmasked to load portions of valid data (and other items //! are assigned ``-1``). //! @endrst //! //! @param[in] block_itr The thread block's base input iterator for loading from //! @param[out] items Data to load //! @param[in] valid_items Number of valid items to load //! @param[in] oob_default Default value to assign out-of-bound items template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/warp_merge_sort.cuh000066400000000000000000000146661463375617100206470ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! The WarpMergeSort class provides methods for sorting items partitioned across a CUDA warp //! using a merge sorting method. //! //! Overview //! ++++++++++++++++ //! //! WarpMergeSort arranges items into ascending order using a comparison //! functor with less-than semantics. Merge sort can handle arbitrary types //! and comparison functors. //! //! A Simple Example //! ++++++++++++++++ //! //! The code snippet below illustrates a sort of 64 integer keys that are //! partitioned across 16 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! struct CustomLess //! { //! template //! __device__ bool operator()(const DataType &lhs, const DataType &rhs) //! { //! return lhs < rhs; //! } //! }; //! //! __global__ void ExampleKernel(...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! constexpr int warps_per_block = block_threads / warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Specialize WarpMergeSort for a virtual warp of 16 threads //! // owning 4 integer items each //! using WarpMergeSortT = //! cub::WarpMergeSort; //! //! // Allocate shared memory for WarpMergeSort //! __shared__ typename WarpMergeSortT::TempStorage temp_storage[warps_per_block]; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_keys[items_per_thread]; //! // ... //! //! WarpMergeSortT(temp_storage[warp_id]).Sort(thread_keys, CustomLess()); //! // ... //! } //! //! Suppose the set of input ``thread_keys`` across a warp of threads is //! ``{ [0,64,1,63], [2,62,3,61], [4,60,5,59], ..., [31,34,32,33] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [31,32,33,34] }``. //! @endrst //! //! @tparam KeyT //! Key type //! //! @tparam ITEMS_PER_THREAD //! The number of items per thread //! //! @tparam LOGICAL_WARP_THREADS //! [optional] The number of threads per "logical" warp (may be less //! than the number of hardware warp threads). Default is the warp size of the //! 
targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a //! power of two. //! //! @tparam ValueT //! [optional] Value type (default: cub::NullType, which indicates a //! keys-only sort) //! //! @tparam LEGACY_PTX_ARCH //! Unused. //! template class WarpMergeSort : public BlockMergeSortStrategy> { private: static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); static constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; static constexpr int TILE_SIZE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS; using BlockMergeSortStrategyT = BlockMergeSortStrategy; const unsigned int warp_id; const unsigned int member_mask; public: WarpMergeSort() = delete; _CCCL_DEVICE _CCCL_FORCEINLINE WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage& temp_storage) : BlockMergeSortStrategyT(temp_storage, IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) {} _CCCL_DEVICE _CCCL_FORCEINLINE unsigned int get_member_mask() const { return member_mask; } private: _CCCL_DEVICE _CCCL_FORCEINLINE void SyncImplementation() const { WARP_SYNC(member_mask); } friend BlockMergeSortStrategyT; }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/warp_reduce.cuh000066400000000000000000000641061463375617100177420ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file //! @rst //! The ``cub::WarpReduce`` class provides :ref:`collective ` methods for //! computing a parallel reduction of items partitioned across a CUDA thread warp. //! 
@endrst #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! The ``WarpReduce`` class provides :ref:`collective ` methods for //! computing a parallel reduction of items partitioned across a CUDA thread warp. //! //! .. image:: ../img/warp_reduce_logo.png //! :align: center //! //! Overview //! ++++++++++++++++++++++++++ //! //! - A `reduction `__ (or *fold*) //! uses a binary combining operator to compute a single aggregate from a list of input elements. //! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 //! threads) //! - The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS`` //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - Uses special instructions when applicable (e.g., warp ``SHFL`` instructions) //! - Uses synchronization-free communication between warp lanes when applicable //! - Incurs zero bank conflicts for most types //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! //! - Summation (**vs.** generic reduction) //! - The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS`` //! //! Simple Examples //! ++++++++++++++++++++++++++ //! //! @warpcollective{WarpReduce} //! //! The code snippet below illustrates four concurrent warp sum reductions within a block of //! 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 //! will be ``496``, ``1520``, ``2544``, and ``3568``, respectively //! (and is undefined in other threads). //! //! The code snippet below illustrates a single warp sum reduction within a block of //! 128 threads. //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! ... //! //! // Only the first warp performs a reduction //! if (threadIdx.x < 32) //! { //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide sum to lane0 //! int aggregate = WarpReduce(temp_storage).Sum(thread_data); //! //! Suppose the set of input ``thread_data`` across the warp of threads is //! ``{0, 1, 2, 3, ..., 31}``. The corresponding output ``aggregate`` in thread0 will be ``496`` //! (and is undefined in other threads). //! @endrst //! //! @tparam T //! The reduction input/output element type //! //! @tparam LOGICAL_WARP_THREADS //! 
[optional] The number of threads per "logical" warp (may be less than the number of //! hardware warp threads). Default is the warp size of the targeted CUDA compute-capability //! (e.g., 32 threads for SM20). //! //! @tparam LEGACY_PTX_ARCH //! [optional] Unused. template class WarpReduce { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, }; public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpReduce = cub::detail:: conditional_t, WarpReduceSmem>; #endif // DOXYGEN_SHOULD_SKIP_THIS private: /// Shared memory storage layout type for WarpReduce using _TempStorage = typename InternalWarpReduce::TempStorage; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage& temp_storage; /****************************************************************************** * Utility methods ******************************************************************************/ public: /// \smemstorage{WarpReduce} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @rst //! Collective constructor using the specified memory allocation as temporary storage. //! Logical warp and lane identifiers are constructed from ``threadIdx.x``. //! @endrst //! //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) {} //! @} end member group //! @name Summation reductions //! @{ //! @rst //! Computes a warp-wide sum in the calling warp. //! The output is valid in warp *lane*\ :sub:`0`. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp sum reductions within a block of //! 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide sums to each lane0 //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, 1, 2, 3, ..., 127}``. //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, //! ``2544``, and ``3568``, respectively (and is undefined in other threads). //! @endrst //! //! @param[in] input Calling thread's input _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); } //! @rst //! Computes a partially-full warp-wide sum in the calling warp. //! 
The output is valid in warp *lane*\ :sub:`0`. //! //! All threads across the calling warp must agree on the same value for ``valid_items``. //! Otherwise the result is undefined. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sum reduction within a single, partially-full //! block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(int *d_data, int valid_items) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item per thread if in range //! int thread_data; //! if (threadIdx.x < valid_items) //! thread_data = d_data[threadIdx.x]; //! //! // Return the warp-wide sums to each lane0 //! int aggregate = WarpReduce(temp_storage).Sum( //! thread_data, valid_items); //! //! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ...`` and ``valid_items`` is ``4``. //! The corresponding output ``aggregate`` in *lane*\ :sub:`0` is ``6`` //! (and is undefined in other threads). //! @endrst //! //! @param[in] input //! Calling thread's input //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp //! (may be less than ``LOGICAL_WARP_THREADS``) _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) { // Determine if we don't need bounds checking return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); } //! @rst //! Computes a segmented sum in the calling warp where segments are defined by head-flags. //! The sum of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a head-segmented warp sum //! reduction within a block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int head_flag = ... //! //! // Return the warp-wide sums to each lane0 //! int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( //! thread_data, head_flag); //! //! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``6``, ``22``, ``38``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] head_flag //! Head flag denoting whether or not `input` is the start of a new segment template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT head_flag) { return HeadSegmentedReduce(input, head_flag, cub::Sum()); } //! @rst //! Computes a segmented sum in the calling warp where segments are defined by tail-flags. //! The sum of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! 
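//! A head flag is ``1`` for the first item of a segment and ``0`` otherwise;
//! *lane*\ :sub:`0` is always treated as a segment head. One hypothetical way to
//! derive head flags is to compare each lane's key with that of its predecessor
//! (``thread_key`` below stands for whatever per-lane key defines the segments):
//!
//! .. code-block:: c++
//!
//!    // Sketch: lane0 always starts a segment; any other lane starts one
//!    // whenever its key differs from the previous lane's key.
//!    int prev_key  = __shfl_up_sync(0xffffffff, thread_key, 1);
//!    int head_flag = (threadIdx.x % 32 == 0) || (thread_key != prev_key);
//!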
@smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a tail-segmented warp sum reduction within a block of 32 //! threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int tail_flag = ... //! //! // Return the warp-wide sums to each lane0 //! int aggregate = WarpReduce(temp_storage).TailSegmentedSum( //! thread_data, tail_flag); //! //! Suppose the set of input ``thread_data`` and ``tail_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31}`` and is ``{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``6``, ``22``, ``38``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] tail_flag //! Head flag denoting whether or not `input` is the start of a new segment template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT tail_flag) { return TailSegmentedReduce(input, tail_flag, cub::Sum()); } //! @} end member group //! @name Generic reductions //! @{ //! @rst //! Computes a warp-wide reduction in the calling warp using the specified binary reduction //! functor. The output is valid in warp *lane*\ :sub:`0`. //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp max reductions within a block of //! 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide reductions to each lane0 //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( //! thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``aggregate`` in threads 0, 32, 64, and //! 96 will be ``31``, ``63``, ``95``, and ``127``, respectively //! (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] reduction_op //! Binary reduction operator template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); } //! @rst //! Computes a partially-full warp-wide reduction in the calling warp using the specified binary //! reduction functor. The output is valid in warp *lane*\ :sub:`0`. //! //! All threads across the calling warp must agree on the same value for ``valid_items``. //! 
Otherwise the result is undefined. //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a max reduction within a single, partially-full //! block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(int *d_data, int valid_items) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item per thread if in range //! int thread_data; //! if (threadIdx.x < valid_items) //! thread_data = d_data[threadIdx.x]; //! //! // Return the warp-wide reductions to each lane0 //! int aggregate = WarpReduce(temp_storage).Reduce( //! thread_data, cub::Max(), valid_items); //! //! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ... }`` and ``valid_items`` //! is ``4``. The corresponding output ``aggregate`` in thread0 is ``3`` (and is //! undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] reduction_op //! Binary reduction operator //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp //! (may be less than ``LOGICAL_WARP_THREADS``) template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) { return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); } //! @rst //! Computes a segmented reduction in the calling warp where segments are defined by head-flags. //! The reduction of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a head-segmented warp max //! reduction within a block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int head_flag = ... //! //! // Return the warp-wide reductions to each lane0 //! int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( //! thread_data, head_flag, cub::Max()); //! //! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31}`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``3``, ``7``, ``11``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] head_flag //! Head flag denoting whether or not `input` is the start of a new segment //! //! @param[in] reduction_op //! 
Reduction operator template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); } //! @rst //! Computes a segmented reduction in the calling warp where segments are defined by tail-flags. //! The reduction of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a tail-segmented warp max //! reduction within a block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! typedef cub::WarpReduce WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int tail_flag = ... //! //! // Return the warp-wide reductions to each lane0 //! int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( //! thread_data, tail_flag, cub::Max()); //! //! Suppose the set of input ``thread_data`` and ``tail_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31}`` and is ``{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``3``, ``7``, ``11``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] tail_flag //! Tail flag denoting whether or not \p input is the end of the current segment //! //! @param[in] reduction_op //! Reduction operator template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); } //! 
@} end member group }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template class WarpReduce { private: using _TempStorage = cub::NullType; public: struct InternalWarpReduce { struct TempStorage : Uninitialized<_TempStorage> {}; _CCCL_DEVICE _CCCL_FORCEINLINE InternalWarpReduce(TempStorage& /*temp_storage */) {} template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, int /* valid_items */, ReductionOp /* reduction_op */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT /* flag */, ReductionOp /* reduction_op */) { return input; } }; using TempStorage = typename InternalWarpReduce::TempStorage; _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& /*temp_storage */) {} _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { return input; } _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT /* head_flag */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT /* tail_flag */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */, int /* valid_items */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT /* head_flag */, ReductionOp /* reduction_op */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT /* tail_flag */, ReductionOp /* reduction_op */) { return input; } }; #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/warp_scan.cuh000066400000000000000000001131541463375617100174150ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file //! 
@rst //! The ``cub::WarpScan`` class provides :ref:`collective ` methods for //! computing a parallel prefix scan of items partitioned across a CUDA thread warp. //! @endrst #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! The WarpScan class provides :ref:`collective ` methods for computing a //! parallel prefix scan of items partitioned across a CUDA thread warp. //! //! .. image:: ../img/warp_scan_logo.png //! :align: center //! //! Overview //! ++++++++++++++++++++++++++ //! //! * Given a list of input elements and a binary reduction operator, a //! `prefix scan `__ produces an output list where each //! element is computed to be the reduction of the elements occurring earlier in the input list. //! *Prefix sum* connotes a prefix scan with the addition operator. The term *inclusive* //! indicates that the *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input. //! The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into //! the *i*\ :sup:`th` output reduction. //! * Supports non-commutative scan operators //! * Supports "logical" warps smaller than the physical warp size //! (e.g., a logical warp of 8 threads) //! * The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS`` //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! * Uses special instructions when applicable (e.g., warp ``SHFL``) //! * Uses synchronization-free communication between warp lanes when applicable //! * Incurs zero bank conflicts for most types //! * Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! //! * Summation (**vs.** generic scan) //! * The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS`` //! //! Simple Examples //! ++++++++++++++++++++++++++ //! //! @warpcollective{WarpScan} //! //! The code snippet below illustrates four concurrent warp prefix sums within a block of //! 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute warp-wide prefix sums //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of //! threads will be ``0, 1, 2, 3, ..., 31}``. //! //! The code snippet below illustrates a single warp prefix sum within a block of //! 128 threads. //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for one warp //! __shared__ typename WarpScan::TempStorage temp_storage; //! ... //! //! // Only the first warp performs a prefix sum //! if (threadIdx.x < 32) //! { //! // Obtain one input item per thread //! int thread_data = ... 
//! //! // Compute warp-wide prefix sums //! WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the warp of threads is //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be //! ``{0, 1, 2, 3, ..., 31}``. //! @endrst //! //! @tparam T //! The scan input/output element type //! //! @tparam LOGICAL_WARP_THREADS //! **[optional]** The number of threads per "logical" warp (may be less than the number of //! hardware warp threads). Default is the warp size associated with the CUDA Compute Capability //! targeted by the compiler (e.g., 32 threads for SM20). //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. template class WarpScan { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), /// Whether the data type is an integer (which has fully-associative addition) IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) }; /// Internal specialization. /// Use SHFL-based scan if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpScan = cub::detail:: conditional_t, WarpScanSmem>; /// Shared memory storage layout type for WarpScan using _TempStorage = typename InternalWarpScan::TempStorage; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage& temp_storage; unsigned int lane_id; /****************************************************************************** * Public types ******************************************************************************/ public: /// @smemstorage{WarpScan} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using the specified memory allocation as temporary storage. //! Logical warp and lane identifiers are constructed from `threadIdx.x`. //! //! @param[in] temp_storage //! Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS) {} //! @} end member group //! @name Inclusive prefix sums //! @{ //! @rst //! Computes an inclusive prefix sum across the calling warp. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a //! block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute inclusive warp-wide prefix sums //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); //! //! 
Suppose the set of input ``thread_data`` across the block of threads is //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps //! of threads will be ``1, 2, 3, ..., 32}``. //! @endrst //! //! @param[in] input //! Calling thread's input item. //! //! @param[out] inclusive_output //! Calling thread's output item. May be aliased with `input`. _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& inclusive_output) { InclusiveScan(input, inclusive_output, cub::Sum()); } //! @rst //! Computes an inclusive prefix sum across the calling warp. //! Also provides every thread with the warp-wide ``warp_aggregate`` of all inputs. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a //! block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute inclusive warp-wide prefix sums //! int warp_aggregate; //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, //! thread_data, //! warp_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps //! of threads will be ``1, 2, 3, ..., 32}``. Furthermore, ``warp_aggregate`` for all threads //! in all warps will be ``32``. //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] inclusive_output //! Calling thread's output item. May be aliased with `input` //! //! @param[out] warp_aggregate //! Warp-wide aggregate reduction of input items _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& inclusive_output, T& warp_aggregate) { InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); } //! @} end member group //! @name Exclusive prefix sums //! @{ //! @rst //! Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the //! initial value, and is assigned to ``exclusive_output`` in *lane*\ :sub:`0`. //! //! * @identityzero //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a //! block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute exclusive warp-wide prefix sums //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps //! of threads will be ``0, 1, 2, ..., 31}``. //! @endrst //! //! @param[in] input //! Calling thread's input item. //! //! @param[out] exclusive_output //! Calling thread's output item. 
May be aliased with `input`. _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& exclusive_output) { T initial_value{}; ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); } //! @rst //! Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the //! initial value, and is assigned to ``exclusive_output`` in *lane*\ :sub:`0`. //! Also provides every thread with the warp-wide ``warp_aggregate`` of all inputs. //! //! * @identityzero //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a //! block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute exclusive warp-wide prefix sums //! int warp_aggregate; //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, //! thread_data, //! warp_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps //! of threads will be ``0, 1, 2, ..., 31}``. Furthermore, ``warp_aggregate`` for all threads //! in all warps will be ``32``. //! @endrst //! //! //! @param[in] input //! Calling thread's input item //! //! @param[out] exclusive_output //! Calling thread's output item. May be aliased with `input` //! //! @param[out] warp_aggregate //! Warp-wide aggregate reduction of input items _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& exclusive_output, T& warp_aggregate) { T initial_value{}; ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); } //! @} end member group //! @name Inclusive prefix scans //! @{ //! @rst //! Computes an inclusive prefix scan using the specified binary scan functor across the //! calling warp. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute inclusive warp-wide prefix max scans //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be //! ``32, 32, 34, 34, ..., 62, 62``, etc. //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] inclusive_output //! Calling thread's output item. 
May be aliased with `input` //! //! @param[in] can_op //! Binary scan operator template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op) { InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); } //! @rst //! Computes an inclusive prefix scan using the specified binary scan functor across the //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of //! all inputs. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute inclusive warp-wide prefix max scans //! int warp_aggregate; //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).InclusiveScan( //! thread_data, thread_data, cub::Max(), warp_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first //! warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be //! ``32, 32, 34, 34, ..., 62, 62``, etc. Furthermore, ``warp_aggregate`` would be assigned //! ``30`` for threads in the first warp, ``62`` for threads in the second warp, etc. //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! @param[in] input //! Calling thread's input item //! //! @param[out] inclusive_output //! Calling thread's output item. May be aliased with ``input`` //! //! @param[in] scan_op //! Binary scan operator //! //! @param[out] warp_aggregate //! Warp-wide aggregate reduction of input items. template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& inclusive_output, ScanOp scan_op, T& warp_aggregate) { InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); } //! @} end member group //! @name Exclusive prefix scans //! @{ //! @rst //! Computes an exclusive prefix scan using the specified binary scan functor across the //! calling warp. Because no initial value is supplied, the ``output`` computed for //! *lane*\ :sub:`0` is undefined. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute exclusive warp-wide prefix max scans //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. 
The corresponding output ``thread_data`` in the first //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be //! ``?, 32, 32, 34, ..., 60, 62``, etc. //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.) //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] exclusive_output //! Calling thread's output item. May be aliased with `input` //! //! @param[in] scan_op //! Binary scan operator template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op) { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update(input, inclusive_output, exclusive_output, scan_op, Int2Type()); } //! @rst //! Computes an exclusive prefix scan using the specified binary scan functor across the //! calling warp. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute exclusive warp-wide prefix max scans //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, //! thread_data, //! INT_MIN, //! cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be //! ``30, 32, 32, 34, ..., 60, 62``, etc. //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] exclusive_output //! Calling thread's output item. May be aliased with `input` //! //! @param[in] initial_value //! Initial value to seed the exclusive scan //! //! @param[in] scan_op //! Binary scan operator template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, T initial_value, ScanOp scan_op) { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update(input, inclusive_output, exclusive_output, scan_op, initial_value, Int2Type()); } //! @rst //! Computes an exclusive prefix scan using the specified binary scan functor across the //! calling warp. Because no initial value is supplied, the ``output`` computed for //! *lane*\ :sub:`0` is undefined. Also provides every thread with the warp-wide //! ``warp_aggregate`` of all inputs. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! 
typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute exclusive warp-wide prefix max scans //! int warp_aggregate; //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, //! thread_data, //! cub::Max(), //! warp_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first //! warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be //! ``?, 32, 32, 34, ..., 60, 62``, etc. (The output ``thread_data`` in warp *lane*\ :sub:`0` //! is undefined). Furthermore, ``warp_aggregate`` would be assigned ``30`` for threads in the //! first warp, \p 62 for threads in the second warp, etc. //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] exclusive_output //! Calling thread's output item. May be aliased with `input` //! //! @param[in] scan_op //! Binary scan operator //! //! @param[out] warp_aggregate //! Warp-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, ScanOp scan_op, T& warp_aggregate) { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update(input, inclusive_output, exclusive_output, warp_aggregate, scan_op, Int2Type()); } //! @rst //! Computes an exclusive prefix scan using the specified binary scan functor across the //! calling warp. Also provides every thread with the warp-wide ``warp_aggregate`` of //! all inputs. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute exclusive warp-wide prefix max scans //! int warp_aggregate; //! int warp_id = threadIdx.x / 32; //! WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, //! thread_data, //! INT_MIN, //! cub::Max(), //! warp_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``thread_data`` in the first //! warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would be //! ``30, 32, 32, 34, ..., 60, 62``, etc. Furthermore, ``warp_aggregate`` would be assigned //! ``30`` for threads in the first warp, ``62`` for threads in the second warp, etc. //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] exclusive_output //! Calling thread's output item. May be aliased with `input` //! //! @param[in] initial_value //! 
Initial value to seed the exclusive scan //! //! @param[in] scan_op //! Binary scan operator //! //! @param[out] warp_aggregate //! Warp-wide aggregate reduction of input items //! template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& exclusive_output, T initial_value, ScanOp scan_op, T& warp_aggregate) { InternalWarpScan internal(temp_storage); T inclusive_output; internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update( input, inclusive_output, exclusive_output, warp_aggregate, scan_op, initial_value, Int2Type()); } //! @} end member group //! @name Combination (inclusive & exclusive) prefix scans //! @{ //! @rst //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor //! across the calling warp. Because no initial value is supplied, the ``exclusive_output`` //! computed for *lane*\ :sub:`0` is undefined. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans //! within a block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute exclusive warp-wide prefix max scans //! int inclusive_partial, exclusive_partial; //! WarpScan(temp_storage[warp_id]).Scan(thread_data, //! inclusive_partial, //! exclusive_partial, //! cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be //! ``32, 32, 34, 34, ..., 62, 62``, etc. The corresponding output ``exclusive_partial`` in the //! first warp would be ``?, 0, 0, 2, ..., 28, 30``, the output for the second warp would be //! ``?, 32, 32, 34, ..., 60, 62``, etc. //! (The output ``thread_data`` in warp *lane*\ :sub:`0` is undefined.) //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] inclusive_output //! Calling thread's inclusive-scan output item //! //! @param[out] exclusive_output //! Calling thread's exclusive-scan output item //! //! @param[in] scan_op //! Binary scan operator template _CCCL_DEVICE _CCCL_FORCEINLINE void Scan(T input, T& inclusive_output, T& exclusive_output, ScanOp scan_op) { InternalWarpScan internal(temp_storage); internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update(input, inclusive_output, exclusive_output, scan_op, Int2Type()); } //! @rst //! Computes both inclusive and exclusive prefix scans using the specified binary scan functor //! across the calling warp. //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp-wide prefix max scans within a //! block of 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! 
__shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Compute inclusive warp-wide prefix max scans //! int warp_id = threadIdx.x / 32; //! int inclusive_partial, exclusive_partial; //! WarpScan(temp_storage[warp_id]).Scan(thread_data, //! inclusive_partial, //! exclusive_partial, //! INT_MIN, //! cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, -1, 2, -3, ..., 126, -127}``. The corresponding output ``inclusive_partial`` in the //! first warp would be ``0, 0, 2, 2, ..., 30, 30``, the output for the second warp would be //! ``32, 32, 34, 34, ..., 62, 62``, etc. The corresponding output ``exclusive_partial`` in the //! first warp would be ``INT_MIN, 0, 0, 2, ..., 28, 30``, the output for the second warp would //! be ``30, 32, 32, 34, ..., 60, 62``, etc. //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] inclusive_output //! Calling thread's inclusive-scan output item //! //! @param[out] exclusive_output //! Calling thread's exclusive-scan output item //! //! @param[in] initial_value //! Initial value to seed the exclusive scan //! //! @param[in] scan_op //! Binary scan operator template _CCCL_DEVICE _CCCL_FORCEINLINE void Scan(T input, T& inclusive_output, T& exclusive_output, T initial_value, ScanOp scan_op) { InternalWarpScan internal(temp_storage); internal.InclusiveScan(input, inclusive_output, scan_op); internal.Update(input, inclusive_output, exclusive_output, scan_op, initial_value, Int2Type()); } //! @} end member group //! @name Data exchange //! @{ //! @rst //! Broadcast the value ``input`` from *lane*\ :sub:`src_lane` to all lanes in the warp //! //! * @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the warp-wide broadcasts of values from *lane*\ :sub:`0` //! in each of four warps to all other threads in those warps. //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpScan for type int //! typedef cub::WarpScan WarpScan; //! //! // Allocate WarpScan shared memory for 4 warps //! __shared__ typename WarpScan::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Broadcast from lane0 in each warp to all other threads in the warp //! int warp_id = threadIdx.x / 32; //! thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``thread_data`` will be //! ``{0, 0, ..., 0}`` in warp\ :sub:`0`, //! ``{32, 32, ..., 32}`` in warp\ :sub:`1`, //! ``{64, 64, ..., 64}`` in warp\ :sub:`2`, etc. //! @endrst //! //! @param[in] input //! The value to broadcast //! //! @param[in] src_lane //! Which warp lane is to do the broadcasting _CCCL_DEVICE _CCCL_FORCEINLINE T Broadcast(T input, unsigned int src_lane) { return InternalWarpScan(temp_storage).Broadcast(input, src_lane); } //@} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/cub/warp/warp_store.cuh000066400000000000000000000465011463375617100176260ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file Operations for writing linear segments of data from the CUDA warp #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! ``cub::WarpStoreAlgorithm`` enumerates alternative algorithms for :cpp:struct:`cub::WarpStore` //! to write a blocked arrangement of items across a CUDA warp to a linear segment of memory. //! @endrst enum WarpStoreAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is written directly //! to memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) decreases as the //! access stride between threads increases (i.e., the number items per thread). //! @endrst WARP_STORE_DIRECT, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is written //! directly to memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) remains high regardless //! of items written per thread. //! @endrst WARP_STORE_STRIPED, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is written //! directly to memory using CUDA's built-in vectorized stores as a coalescing //! optimization. For example, ``st.global.v4.s32`` instructions will be //! generated when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! * The utilization of memory transactions (coalescing) remains high until //! the the access stride between threads (i.e., the number items per thread) //! exceeds the maximum vector store width (typically 4 items or 64B, //! 
whichever is lower). //! * The following conditions will prevent vectorization and writing will fall //! back to ``cub::WARP_STORE_DIRECT``: //! //! * ``ITEMS_PER_THREAD`` is odd //! * The ``OutputIteratorT`` is not a simple pointer type //! * The block output offset is not quadword-aligned //! * The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) //! //! @endrst WARP_STORE_VECTORIZE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` is locally //! transposed and then efficiently written to memory as a //! :ref:`striped arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! * The utilization of memory transactions (coalescing) remains high //! regardless of items written per thread. //! * The local reordering incurs slightly longer latencies and throughput than the //! direct ``cub::WARP_STORE_DIRECT`` and ``cub::WARP_STORE_VECTORIZE`` alternatives. //! //! @endrst WARP_STORE_TRANSPOSE }; //! @rst //! The WarpStore class provides :ref:`collective ` //! data movement methods for writing a :ref:`blocked arrangement ` //! of items partitioned across a CUDA warp to a linear segment of memory. //! //! Overview //! ++++++++++++++++ //! //! * The WarpStore class provides a single data movement abstraction that can be //! specialized to implement different cub::WarpStoreAlgorithm strategies. This //! facilitates different performance policies for different architectures, //! data types, granularity sizes, etc. //! * WarpStore can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::WARP_STORE_DIRECT`: //! a :ref:`blocked arrangement ` of data is written directly to //! memory. //! #. :cpp:enumerator:`cub::WARP_STORE_STRIPED`: //! a :ref:`striped arrangement ` of data is written directly to //! memory. //! #. :cpp:enumerator:`cub::WARP_STORE_VECTORIZE`: //! a :ref:`blocked arrangement ` of data is written directly to //! memory using CUDA's built-in vectorized stores as a coalescing optimization. //! #. :cpp:enumerator:`cub::WARP_STORE_TRANSPOSE`: //! a :ref:`blocked arrangement ` is locally transposed into a //! :ref:`striped arrangement ` which is then written to memory. //! //! * @rowmajor //! //! A Simple Example //! ++++++++++++++++ //! //! The code snippet below illustrates the storing of a "blocked" arrangement //! of 64 integers across 16 threads (where each thread owns 4 consecutive items) //! into a linear segment of memory. The store is specialized for //! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so //! that memory references will be efficiently coalesced using a warp-striped //! access pattern. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each //! using WarpStoreT = WarpStore; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpStore //! __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; //! //! // Obtain a segment of consecutive items that are blocked across threads //! 
int thread_data[4]; //! ... //! //! // Store items to linear memory //! WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data); //! //! Suppose the set of ``thread_data`` across the warp threads is //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``. //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``. //! @endrst //! //! @tparam T //! The type of data to be written. //! //! @tparam ITEMS_PER_THREAD //! The number of consecutive items partitioned onto each thread. //! //! @tparam ALGORITHM //! [optional] cub::WarpStoreAlgorithm tuning policy enumeration. //! default: cub::WARP_STORE_DIRECT. //! //! @tparam LOGICAL_WARP_THREADS //! [optional] The number of threads per "logical" warp (may be less //! than the number of hardware warp threads). Default is the warp size of the //! targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a //! power of two. //! //! @tparam LEGACY_PTX_ARCH //! Unused. template class WarpStore { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); private: /// Store helper template struct StoreInternal; template struct StoreInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; template struct StoreInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectStriped(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectStriped(linear_tid, block_itr, items, valid_items); } }; template struct StoreInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} _CCCL_DEVICE _CCCL_FORCEINLINE void Store(T* block_ptr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlockedVectorized(linear_tid, block_ptr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } }; template struct StoreInternal { using WarpExchangeT = WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; struct TempStorage : Uninitialized<_TempStorage> {}; _TempStorage& temp_storage; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE StoreInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { WarpExchangeT(temp_storage).BlockedToStriped(items, 
items); StoreDirectStriped(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { WarpExchangeT(temp_storage).BlockedToStriped(items, items); StoreDirectStriped(linear_tid, block_itr, items, valid_items); } }; /// Internal load implementation to use using InternalStore = StoreInternal; /// Shared memory storage layout type using _TempStorage = typename InternalStore::TempStorage; _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } _TempStorage& temp_storage; int linear_tid; public: struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared //! memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore() : temp_storage(PrivateStorage()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpStore(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //! @} end member group //! @name Data movement //! @{ //! @rst //! Store items into a linear segment of memory. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the storing of a "blocked" arrangement //! of 64 integers across 16 threads (where each thread owns 4 consecutive items) //! into a linear segment of memory. The store is specialized for //! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so //! that memory references will be efficiently coalesced using a warp-striped //! access pattern. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each //! using WarpStoreT = WarpStore; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpStore //! __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Store items to linear memory //! WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data); //! //! Suppose the set of ``thread_data`` across the warp threads is //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``. //! The output ``d_data`` will be ``0, 1, 2, 3, 4, 5, ...``. //! @endrst //! //! @param[out] block_itr The thread block's base output iterator for storing to //! @param[in] items Data to store template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { InternalStore(temp_storage, linear_tid).Store(block_itr, items); } //! @rst //! Store items into a linear segment of memory, guarded by range. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates the storing of a "blocked" arrangement //! 
of 64 integers across 16 threads (where each thread owns 4 consecutive items) //! into a linear segment of memory. The store is specialized for //! ``WARP_STORE_TRANSPOSE``, meaning items are locally reordered among threads so //! that memory references will be efficiently coalesced using a warp-striped //! access pattern. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each //! using WarpStoreT = WarpStore; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpStore //! __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Store items to linear memory //! WarpStoreT(temp_storage[warp_id]).Store( //! d_data + warp_id * tile_size, thread_data, valid_items); //! //! Suppose the set of ``thread_data`` across the warp threads is //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`` and ``valid_items`` //! is ``5``. The output ``d_data`` will be ``0, 1, 2, 3, 4, ?, ?, ...``, //! with only the first two threads being unmasked to store portions of valid //! data. //! @endrst //! //! @param[out] block_itr The thread block's base output iterator for storing to //! @param[in] items Data to store //! @param[in] valid_items Number of valid items to write //! template _CCCL_DEVICE _CCCL_FORCEINLINE void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD], int valid_items) { InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); } //! @} end member group }; CUB_NAMESPACE_END cccl-2.5.0/cub/docs/000077500000000000000000000000001463375617100141405ustar00rootroot00000000000000cccl-2.5.0/cub/docs/.gitignore000066400000000000000000000000261463375617100161260ustar00rootroot00000000000000_build _repo api *png cccl-2.5.0/cub/docs/VERSION.md000066400000000000000000000000061463375617100156030ustar00rootroot00000000000000104.0 cccl-2.5.0/cub/docs/benchmarking.rst000066400000000000000000000041051463375617100173220ustar00rootroot00000000000000CUB Benchmarks ************************************* This file contains instrutions on how to run all CUB benchmarks using CUB tuning infrastructure. .. code-block:: bash pip3 install --user fpzip pandas scipy git clone https://github.com/NVIDIA/cccl.git cmake -B build -DCCCL_ENABLE_THRUST=OFF\ -DCCCL_ENABLE_LIBCUDACXX=OFF\ -DCCCL_ENABLE_CUB=ON\ -DCUB_ENABLE_DIALECT_CPP11=OFF\ -DCUB_ENABLE_DIALECT_CPP14=OFF\ -DCUB_ENABLE_DIALECT_CPP17=ON\ -DCUB_ENABLE_DIALECT_CPP20=OFF\ -DCUB_ENABLE_RDC_TESTS=OFF\ -DCUB_ENABLE_BENCHMARKS=YES\ -DCUB_ENABLE_TUNING=YES\ -DCMAKE_BUILD_TYPE=Release\ -DCMAKE_CUDA_ARCHITECTURES="89;90" cd build ../cub/benchmarks/scripts/run.py Expected output for the command above is: .. 
code-block:: bash ../cub/benchmarks/scripts/run.py &&&& RUNNING bench ctk: 12.2.140 cub: 812ba98d1 &&&& PERF cub_bench_adjacent_difference_subtract_left_base_T_ct__I32___OffsetT_ct__I32___Elements_io__pow2__16 4.095999884157209e-06 -sec &&&& PERF cub_bench_adjacent_difference_subtract_left_base_T_ct__I32___OffsetT_ct__I32___Elements_io__pow2__20 1.2288000107218977e-05 -sec &&&& PERF cub_bench_adjacent_difference_subtract_left_base_T_ct__I32___OffsetT_ct__I32___Elements_io__pow2__24 0.00016998399223666638 -sec &&&& PERF cub_bench_adjacent_difference_subtract_left_base_T_ct__I32___OffsetT_ct__I32___Elements_io__pow2__28 0.002673664130270481 -sec ... It's also possible to benchmark a subset of algorithms and workloads: .. code-block:: bash ../cub/benchmarks/scripts/run.py -R '.*scan.exclusive.sum.*' -a 'Elements{io}[pow2]=[24,28]' -a 'T{ct}=I32' &&&& RUNNING bench ctk: 12.2.140 cub: 812ba98d1 &&&& PERF cub_bench_scan_exclusive_sum_base_T_ct__I32___OffsetT_ct__I32___Elements_io__pow2__24 0.00016899200272746384 -sec &&&& PERF cub_bench_scan_exclusive_sum_base_T_ct__I32___OffsetT_ct__I32___Elements_io__pow2__28 0.002696000039577484 -sec &&&& PASSED bench cccl-2.5.0/cub/docs/block_wide.rst000066400000000000000000000034701463375617100170000ustar00rootroot00000000000000.. _block-module: Block-Wide "Collective" Primitives ================================================== .. toctree:: :glob: :hidden: :maxdepth: 2 ${repo_docs_api_path}/*Block* CUB block-level algorithms are specialized for execution by threads in the same CUDA thread block: * :cpp:class:`cub::BlockAdjacentDifference ` computes the difference between adjacent items partitioned across a CUDA thread block * :cpp:class:`cub::BlockDiscontinuity ` flags discontinuities within an ordered set of items partitioned across a CUDA thread block * :cpp:struct:`cub::BlockExchange ` rearranges data partitioned across a CUDA thread block * :cpp:class:`cub::BlockHistogram ` constructs block-wide histograms from data samples partitioned across a CUDA thread block * :cpp:class:`cub::BlockLoad ` loads a linear segment of items from memory into a CUDA thread block * :cpp:class:`cub::BlockMergeSort ` sorts items partitioned across a CUDA thread block * :cpp:class:`cub::BlockRadixSort ` sorts items partitioned across a CUDA thread block using radix sorting method * :cpp:struct:`cub::BlockReduce ` computes reduction of items partitioned across a CUDA thread block * :cpp:class:`cub::BlockRunLengthDecode ` decodes a run-length encoded sequence partitioned across a CUDA thread block * :cpp:struct:`cub::BlockScan ` computes a prefix scan of items partitioned across a CUDA thread block * :cpp:struct:`cub::BlockShuffle ` shifts items partitioned across a CUDA thread block * :cpp:class:`cub::BlockStore ` stores items partitioned across a CUDA thread block to a linear segment of memory cccl-2.5.0/cub/docs/deps/000077500000000000000000000000001463375617100150735ustar00rootroot00000000000000cccl-2.5.0/cub/docs/deps/repo-deps.packman.xml000066400000000000000000000004551463375617100211300ustar00rootroot00000000000000 cccl-2.5.0/cub/docs/developer_overview.rst000066400000000000000000000670561463375617100206230ustar00rootroot00000000000000CUB Developer Overview ########################## This living document serves as a guide to the design of the internal structure of CUB. CUB provides layered algorithms that correspond to the thread/warp/block/device hierarchy of threads in CUDA. 
There are distinct algorithms for each layer and higher-level layers build on top of those below. For example, CUB has four flavors of ``reduce``, one for each layer: ``ThreadReduce, WarpReduce, BlockReduce``, and ``DeviceReduce``. Each is unique in how it is invoked, how many threads participate, and on which thread(s) the result is valid. These layers naturally build on each other. For example, :cpp:struct:`WarpReduce ` uses :cpp:func:`ThreadReduce `, :cpp:struct:`BlockReduce ` uses :cpp:struct:`WarpReduce `, etc. :cpp:func:`ThreadReduce ` - A normal function invoked and executed sequentially by a single thread that returns a valid result on that thread - Single thread functions are usually an implementation detail and not exposed in CUB's public API :cpp:struct:`WarpReduce ` and :cpp:struct:`BlockReduce ` - A "cooperative" function where threads concurrently invoke the same function to execute parallel work - The function's return value is well-defined only on the "first" thread (lowest thread index) :cpp:struct:`DeviceReduce ` - A normal function invoked by a single thread that spawns additional threads to execute parallel work - Result is stored in the pointer provided to the function - Function returns a ``cudaError_t`` error code - Function does not synchronize the host with the device The table below provides a summary of these functions: .. list-table:: :class: table-no-stripes :header-rows: 1 * - layer - coop invocation - parallel execution - max threads - valid result in * - :cpp:func:`ThreadReduce ` - :math:`-` - :math:`-` - :math:`1` - invoking thread * - :cpp:struct:`WarpReduce ` - :math:`+` - :math:`+` - :math:`32` - main thread * - :cpp:struct:`BlockReduce ` - :math:`+` - :math:`+` - :math:`1024` - main thread * - :cpp:struct:`DeviceReduce ` - :math:`-` - :math:`+` - :math:`\infty` - global memory The details of how each of these layers are implemented is described below. Common Patterns ************************************ While CUB's algorithms are unique at each layer, there are commonalities among all of them: - Algorithm interfaces are provided as *types* (classes)\ [1]_ - Algorithms need temporary storage - Algorithms dispatch to specialized implementations depending on compile-time and runtime information - Cooperative algorithms require the number of threads at compile time (template parameter) Invoking any CUB algorithm follows the same general pattern: #. Select the class for the desired algorithm #. Query the temporary storage requirements #. Allocate the temporary storage #. Pass the temporary storage to the algorithm #. Invoke it via the appropriate member function An example of :cpp:struct:`cub::BlockReduce` demonstrates these patterns in practice: .. code-block:: c++ __global__ void kernel(int* per_block_results) { // (1) Select the desired class // `cub::BlockReduce` is a class template that must be instantiated for the // input data type and the number of threads. Internally the class is // specialized depending on the data type, number of threads, and hardware // architecture. 
Type aliases are often used for convenience: using BlockReduce = cub::BlockReduce; // (2) Query the temporary storage // The type and amount of temporary storage depends on the selected instantiation using TempStorage = typename BlockReduce::TempStorage; // (3) Allocate the temporary storage __shared__ TempStorage temp_storage; // (4) Pass the temporary storage // Temporary storage is passed to the constructor of the `BlockReduce` class BlockReduce block_reduce{temp_storage}; // (5) Invoke the algorithm // The `Sum()` member function performs the sum reduction of `thread_data` across all 128 threads int thread_data[4] = {1, 2, 3, 4}; int block_result = block_reduce.Sum(thread_data); per_block_results[blockIdx.x] = block_result; } .. [1] Algorithm interfaces are provided as classes because it provides encapsulation for things like temporary storage requirements and enables partial template specialization for customizing an algorithm for specific data types or number of threads. Thread-level ************************************ In contrast to algorithms at the warp/block/device layer, single threaded functionality like ``cub::ThreadReduce`` is typically implemented as a sequential function and rarely exposed to the user. .. code-block:: c++ template < int LENGTH, typename T, typename ReductionOp, typename PrefixT, typename AccumT = detail::accumulator_t> __device__ __forceinline__ AccumT ThreadReduce( T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix) { return ...; } Warp-level ************************************ CUB warp-level algorithms are specialized for execution by threads in the same CUDA warp. These algorithms may only be invoked by ``1 <= n <= 32`` *consecutive* threads in the same warp. Overview ==================================== Warp-level functionality is provided by types (classes) to provide encapsulation and enable partial template specialization. For example, :cpp:struct:`cub::WarpReduce` is a class template: .. code-block:: c++ template class WarpReduce { // ... // (1) define `_TempStorage` type // ... _TempStorage &temp_storage; public: // (2) wrap `_TempStorage` in uninitialized memory struct TempStorage : Uninitialized<_TempStorage> {}; __device__ __forceinline__ WarpReduce(TempStorage &temp_storage) // (3) reinterpret cast : temp_storage(temp_storage.Alias()) {} // (4) actual algorithms __device__ __forceinline__ T Sum(T input); }; In CUDA, the hardware warp size is 32 threads. However, CUB enables warp-level algorithms on "logical" warps of ``1 <= n <= 32`` threads. The size of the logical warp is required at compile time via the ``LOGICAL_WARP_THREADS`` non-type template parameter. This value is defaulted to the hardware warp size of ``32``. There is a vital difference in the behavior of warp-level algorithms that depends on the value of ``LOGICAL_WARP_THREADS``: - If ``LOGICAL_WARP_THREADS`` is a power of two - warp is partitioned into *sub*-warps, each reducing its data independently from other *sub*-warps. The terminology used in CUB: ``32`` threads are called hardware warp. Groups with less than ``32`` threads are called *logical* or *virtual* warp since it doesn't correspond directly to any hardware unit. - If ``LOGICAL_WARP_THREADS`` is **not** a power of two - there's no partitioning. That is, only the first logical warp executes algorithm. .. TODO: Add diagram showing non-power of two logical warps. It's important to note that ``LEGACY_PTX_ARCH`` has been recently deprecated. This parameter used to affect specialization selection (see below). 
It was conflicting with the PTX dispatch refactoring and limited NVHPC support. Temporary storage usage ==================================== Warp-level algorithms require temporary storage for scratch space and inter-thread communication. The temporary storage needed for a given instantiation of an algorithm is known at compile time and is exposed through the ``TempStorage`` member type definition. It is the caller's responsibility to create this temporary storage and provide it to the constructor of the algorithm type. It is possible to reuse the same temporary storage for different algorithm invocations, but it is unsafe to do so without first synchronizing to ensure the first invocation is complete. .. TODO: Add more explanation of the `TempStorage` type and the `Uninitialized` wrapper. .. TODO: Explain if `TempStorage` is required to be shared memory or not. .. code-block:: c++ using WarpReduce = cub::WarpReduce; // Allocate WarpReduce shared memory for four warps __shared__ WarpReduce::TempStorage temp_storage[4]; // Get this thread's warp id int warp_id = threadIdx.x / 32; int aggregate_1 = WarpReduce(temp_storage[warp_id]).Sum(thread_data_1); // illegal, has to add `__syncwarp()` between the two int aggregate_2 = WarpReduce(temp_storage[warp_id]).Sum(thread_data_2); // illegal, has to add `__syncwarp()` between the two foo(temp_storage[warp_id]); Specialization ==================================== The goal of CUB is to provide users with algorithms that abstract the complexities of achieving speed-of-light performance across a variety of use cases and hardware. It is a CUB developer's job to abstract this complexity from the user by providing a uniform interface that statically dispatches to the optimal code path. This is usually accomplished via customizing the implementation based on compile time information like the logical warp size, the data type, and the target architecture. For example, :cpp:struct:`cub::WarpReduce` dispatches to two different implementations based on if the logical warp size is a power of two (described above): .. code-block:: c++ using InternalWarpReduce = cub::detail::conditional_t< IS_POW_OF_TWO, WarpReduceShfl, // shuffle-based implementation WarpReduceSmem>; // smem-based implementation Specializations provide different shared memory requirements, so the actual ``_TempStorage`` type is defined as: .. code-block:: c++ typedef typename InternalWarpReduce::TempStorage _TempStorage; and algorithm implementation look like: .. code-block:: c++ __device__ __forceinline__ T Sum(T input, int valid_items) { return InternalWarpReduce(temp_storage) .Reduce(input, valid_items, cub::Sum()); } Due to ``LEGACY_PTX_ARCH`` issues described above, we can't specialize on the PTX version. ``NV_IF_TARGET`` shall be used by specializations instead: .. code-block:: c++ template struct WarpReduceShfl { template __device__ __forceinline__ T ReduceImpl(T input, int valid_items, ReductionOp reduction_op) { // ... base case (SM < 80) ... } template __device__ __forceinline__ typename std::enable_if::value || std::is_same::value, T>::type ReduceImpl(T input, int, // valid_items cub::Sum) // reduction_op { T output = input; NV_IF_TARGET(NV_PROVIDES_SM_80, (output = __reduce_add_sync(member_mask, input);), (output = ReduceImpl( input, LOGICAL_WARP_THREADS, cub::Sum{});)); return output; } }; Specializations are stored in the ``cub/warp/specializations`` directory. 
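To see this static dispatch from the caller's side, consider the illustrative sketch below. It is not part of CUB's sources; the kernel name, output buffer, and single-warp launch configuration are assumptions made for the example. Both instantiations use the identical public interface; only the ``LOGICAL_WARP_THREADS`` template argument differs, which selects the shuffle-based specialization for the power-of-two case and the shared-memory-based specialization otherwise.

.. code-block:: c++

    #include <cub/warp/warp_reduce.cuh>

    // Illustrative kernel, assumed to be launched with a single 32-thread block.
    __global__ void ExampleWarpReduceKernel(int *d_out)
    {
      // Power-of-two logical warp: resolves to the shuffle-based specialization.
      using FullWarpReduce = cub::WarpReduce<int, 32>;
      __shared__ typename FullWarpReduce::TempStorage full_storage;

      // Non-power-of-two logical warp: resolves to the smem-based specialization;
      // only the first 24 threads of the hardware warp participate.
      using PartialWarpReduce = cub::WarpReduce<int, 24>;
      __shared__ typename PartialWarpReduce::TempStorage partial_storage;

      int thread_data = threadIdx.x;

      // The result is valid on the first thread of each logical warp.
      int full_sum = FullWarpReduce(full_storage).Sum(thread_data);

      int partial_sum = 0;
      if (threadIdx.x < 24)
      {
        partial_sum = PartialWarpReduce(partial_storage).Sum(thread_data);
      }

      if (threadIdx.x == 0)
      {
        d_out[0] = full_sum;
        d_out[1] = partial_sum;
      }
    }

Either way, the specialization selection is entirely static: the caller never names the shuffle- or smem-based implementation directly.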
Block-scope ************************************ Overview ==================================== Block-scope algorithms are provided by structures as well: .. code-block:: c++ template class BlockReduce { public: struct TempStorage : Uninitialized<_TempStorage> {}; // (1) new constructor __device__ __forceinline__ BlockReduce() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} __device__ __forceinline__ BlockReduce(TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} }; While warp-scope algorithms only provide a single constructor that requires the user to provide temporary storage, block-scope algorithms provide two constructors: #. The default constructor that allocates the required shared memory internally. #. The constructor that requires the user to provide temporary storage as argument. In the case of the default constructor, the block-level algorithm uses the ``PrivateStorage()`` member function to allocate the required shared memory. This ensures that shared memory required by the algorithm is only allocated when the default constructor is actually called in user code. If the default constructor is never called, then the algorithm will not allocate superfluous shared memory. .. code-block:: c++ __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } The ``__shared__`` memory has static semantic, so it's safe to return a reference here. Specialization ==================================== Block-scope facilities usually expose algorithm selection to the user. The algorithm is represented by the enumeration part of the API. For the reduction case, ``BlockReduceAlgorithm`` is provided. Specializations are stored in the ``cub/block/specializations`` directory. Temporary storage usage ==================================== For block-scope algorithms, it's unsafe to use temporary storage without synchronization: .. code-block:: c++ using BlockReduce = cub::BlockReduce ; __shared__ BlockReduce::TempStorage temp_storage; int aggregate_1 = BlockReduce(temp_storage).Sum(thread_data_1); // illegal, has to add `__syncthreads` between the two int aggregate_2 = BlockReduce(temp_storage).Sum(thread_data_2); // illegal, has to add `__syncthreads` between the two foo(temp_storage); Device-scope ************************************ Overview ==================================== Device-scope functionality is provided by static member functions: .. code-block:: c++ struct DeviceReduce { template CUB_RUNTIME_FUNCTION static cudaError_t Sum( void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, cudaStream_t stream = 0) { using OffsetT = int; using OutputT = cub::detail::non_void_value_t>; return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), OutputT(), stream); } }; Device-scope facilities always return ``cudaError_t`` and accept ``stream`` as the last parameter (main stream by default). The first two parameters are always ``void *d_temp_storage, size_t &temp_storage_bytes``. The algorithm invocation consists of two phases. During the first phase, temporary storage size is calculated and returned in ``size_t &temp_storage_bytes``. During the second phase, ``temp_storage_bytes`` of memory is expected to be allocated and ``d_temp_storage`` is expected to be the pointer to this memory. .. 
code-block:: c++ // Determine temporary device storage requirements void *d_temp_storage {}; std::size_t temp_storage_bytes {}; cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); // Allocate temporary storage cudaMalloc(&d_temp_storage, temp_storage_bytes); // Run sum-reduction cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); .. warning:: Even if the algorithm doesn't need temporary storage as a scratch space, we still require one byte of memory to be allocated. Dispatch layer ==================================== The dispatch layer is specific to the device scope (`DispatchReduce`) and located in `cub/device/dispatch`. High-level control flow can be represented by the code below. A more precise description is given later. .. code-block:: c++ // Device-scope API cudaError_t cub::Device::Algorithm(args) { return DispatchAlgorithm::Dispatch(args); // (1) } // Dispatch entry point static cudaError_t DispatchAlgorithm::Dispatch(args) { // (1) DispatchAlgorithm invokable(args); // MaxPolicy - tail of linked list contaning architecture-specific tunings return MaxPolicy::Invoke(get_runtime_ptx_version(), invokable); // (2) } // Chained policy - linked list of tunings cudaError_t ChainedPolicy::Invoke(ptx_version, invokable) { // (2) if (ptx_version < ChainedPolicy::PTX_VERSION) { ChainedPolicy::PrevPolicy::Invoke(ptx_version, invokable); // (2) } invokable.Invoke(); // (3) } // Dispatch object - parameters closure template cudaError_t DispatchAlgorithm::Invoke() { // (3) kernel<<>>(args); // (4) } template void __global__ __launch_bounds__(ChainedPolicy::ActivePolicy::BLOCK_THREADS) kernel(args) { // (4) using policy = ChainedPolicy::ActivePolicy; // (5) using agent = AgentAlgorithm; // (6) agent a(args); a.Process(); } template struct ChainedPolicy { using ActivePolicy = conditional_t<(CUB_PTX_ARCH < PTX_VERSION), // (5) PrevPolicyT::ActivePolicy, PolicyT>; }; template struct AlgorithmAgent { // (6) void Process(); }; The code above represents control flow. Let's look at each of the building blocks closer. The dispatch entry point is typically represented by a static member function that constructs an object of ``DispatchReduce`` and passes it to ``ChainedPolicy`` ``Invoke`` method: .. code-block:: c++ template struct DispatchReduce : SelectedPolicy { // CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, /* ... */) { typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; if (num_segments <= 0) return cudaSuccess; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; } // Create dispatch functor DispatchSegmentedReduce dispatch( d_temp_storage, temp_storage_bytes, d_in, /* ... */); // Dispatch to chained policy MaxPolicyT::Invoke(ptx_version, dispatch); } while (0); return error; } }; For many algorithms, the dispatch layer is part of the API. The main reason for this integration is to support ``size_t``. Our API uses ``int`` as a type for ``num_items``. Users rely on the dispatch layer directly to workaround this. Exposing the dispatch layer also allows users to tune algorithms for their use cases. In the newly added functionality, the dispatch layer should not be exposed. The ``ChainedPolicy`` converts the runtime PTX version to the closest compile-time one: .. 
code-block:: c++ template struct ChainedPolicy { using ActivePolicy = cub::detail::conditional_t<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>; template CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Invoke(int ptx_version, FunctorT& op) { if (ptx_version < PTX_VERSION) { return PrevPolicyT::Invoke(ptx_version, op); } return op.template Invoke(); } }; The dispatch object's ``Invoke`` method is then called with proper policy: .. code-block:: c++ template struct DispatchReduce : SelectedPolicy { template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { using MaxPolicyT = typename DispatchSegmentedReduce::MaxPolicy; return InvokePasses( DeviceReduceKernel); } }; This is where the actual work happens. Note how the kernel is used against ``MaxPolicyT`` (2) while the kernel invocation part uses ``ActivePolicyT`` (1). This is an important part: .. code-block:: c++ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) __global__ void DeviceReduceKernel(InputIteratorT d_in /* ... */) { // Thread block type for reducing input tiles using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; // Consume input tiles OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); // Output result if (threadIdx.x == 0) { d_out[blockIdx.x] = block_aggregate; } } The kernel gets compiled for each PTX version that was provided to the compiler. During the device pass, ``ChainedPolicy`` compares ``CUDA_ARCH`` against the template parameter to select ``ActivePolicy`` type alias. During the host pass, ``Invoke`` is compiled for each architecture in the tuning list. If we use ``ActivePolicy`` instead of ``MaxPolicy`` as a kernel template parameter, we will compile ``O(N^2)`` kernels instead of ``O(N)``. Finally, the tuning looks like: .. code-block:: c++ template struct DeviceReducePolicy { /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { typedef AgentReducePolicy<256, 20, InputT, 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG> ReducePolicy; }; /// SM60 struct Policy600 : ChainedPolicy<600, Policy600, Policy350> { typedef AgentReducePolicy<256, 16, InputT, 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_LDG> ReducePolicy; }; /// MaxPolicy typedef Policy600 MaxPolicy; }; The kernels in the dispatch layer shouldn't contain a lot of code. Usually, the functionality is extracted into the agent layer. All the kernel does is derive the proper policy type, unwrap the policy to initialize the agent and call one of its ``Consume`` / ``Process`` methods. Agents are frequently reused by unrelated device-scope algorithms. Temporary storage usage ==================================== It's safe to reuse storage in the stream order: .. code-block:: c++ cub::DeviceReduce::Sum(nullptr, storage_bytes, d_in, d_out, num_items, stream_1); // allocate temp storage cub::DeviceReduce::Sum(d_storage, storage_bytes, d_in, d_out, num_items, stream_1); // fine not to synchronize stream cub::DeviceReduce::Sum(d_storage, storage_bytes, d_in, d_out, num_items, stream_1); // illegal, should call cudaStreamSynchronize(stream) cub::DeviceReduce::Sum(d_storage, storage_bytes, d_in, d_out, num_items, stream_2); Temporary storage management ==================================== Often times temporary storage for device-scope algorithms has a complex structure. 
To simplify temporary storage management and make it safer, we introduced ``cub::detail::temporary_storage::layout``: .. code-block:: c++ cub::detail::temporary_storage::layout<2> storage_layout; auto slot_1 = storage_layout.get_slot(0); auto slot_2 = storage_layout.get_slot(1); auto allocation_1 = slot_1->create_alias(); auto allocation_2 = slot_1->create_alias(42); auto allocation_3 = slot_2->create_alias(12); if (condition) { allocation_1.grow(num_items); } if (d_temp_storage == nullptr) { temp_storage_bytes = storage_layout.get_size(); return; } storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes); // different slots, safe to use simultaneously use(allocation_1.get(), allocation_3.get(), stream); // `allocation_2` alias `allocation_1`, safe to use in stream order use(allocation_2.get(), stream); Symbols visibility ==================================== Using CUB/Thrust in shared libraries is a known source of issues. For a while, the solution to these issues consisted of wrapping CUB/Thrust namespaces with the ``THRUST_CUB_WRAPPED_NAMESPACE`` macro so that different shared libraries have different symbols. This solution has poor discoverability, since issues present themselves in forms of segmentation faults, hangs, wrong results, etc. To eliminate the symbol visibility issues on our end, we follow the following rules: #. Hiding symbols accpeting kernel pointers: it's important that API accepting kernel pointers (e.g. ``triple_chevron``) always reside in the same library as the code taking this pointers. #. Hiding all kernels: it's important that kernels always reside in the same library as the API using these kernels. #. Incorporating GPU architectures into symbol names: it's important that kernels compiled for a given GPU architecture are always used by the host API compiled for that architecture. To satisfy (1), ``thrust::cuda_cub::launcher::triple_chevron`` visibility is hidden. To satisfy (2), instead of annotating kernels as ``__global__`` we annotate them as ``CUB_DETAIL_KERNEL_ATTRIBUTES``. Apart from annotating a kernel as global function, the macro contains hidden visibility attribute. To satisfy (3), CUB symbols are placed inside an inline namespace containing the set of GPU architectures for which the TU is being compiled. NVTX ************************************ The `NVIDIA Tools Extension SDK (NVTX) `_ is a cross-platform API for annotating source code to provide contextual information to developer tools. All device-scope algorithms in CUB are annotated with NVTX ranges, allowing their start and stop to be visualized in profilers like `NVIDIA Nsight Systems `_. Only the public APIs available in the ```` headers are annotated, excluding direct calls to the dispatch layer. NVTX annotations can be disabled by defining ``NVTX_DISABLE`` during compilation. cccl-2.5.0/cub/docs/device_wide.rst000066400000000000000000000047611463375617100171510ustar00rootroot00000000000000.. _device-module: Device-Wide Primitives ================================================== .. 
toctree:: :glob: :hidden: :maxdepth: 2 ${repo_docs_api_path}/struct*Device* CUB device-level single-problem parallel algorithms: * :cpp:class:`cub::DeviceAdjacentDifference ` computes the difference between adjacent elements residing within device-accessible memory * :cpp:struct:`cub::DeviceFor ` provides device-wide, parallel operations for iterating over data residing within device-accessible memory * :cpp:class:`cub::DeviceHistogram ` constructs histograms from data samples residing within device-accessible memory * :cpp:struct:`cub::DevicePartition ` partitions data residing within device-accessible memory * :cpp:class:`cub::DeviceMergeSort ` sorts items residing within device-accessible memory * :cpp:class:`cub::DeviceRadixSort ` sorts items residing within device-accessible memory using radix sorting method * :cpp:struct:`cub::DeviceReduce ` computes reduction of items residing within device-accessible memory * :cpp:class:`cub::DeviceRunLengthEncode ` demarcating "runs" of same-valued items withing a sequence residing within device-accessible memory * :cpp:struct:`cub::DeviceScan ` computes a prefix scan across a sequence of data items residing within device-accessible memory * :cpp:struct:`cub::DeviceSelect ` compacts data residing within device-accessible memory CUB device-level segmented-problem (batched) parallel algorithms: * :cpp:struct:`cub::DeviceSegmentedSort ` computes batched sort across non-overlapping sequences of data residing within device-accessible memory * :cpp:struct:`cub::DeviceSegmentedRadixSort ` computes batched radix sort across non-overlapping sequences of data residing within device-accessible memory * :cpp:struct:`cub::DeviceSegmentedReduce ` computes reductions across multiple sequences of data residing within device-accessible memory * :cpp:struct:`cub::DeviceCopy ` provides device-wide, parallel operations for batched copying of data residing within device-accessible memory * :cpp:struct:`cub::DeviceMemcpy ` provides device-wide, parallel operations for batched copying of data residing within device-accessible memory cccl-2.5.0/cub/docs/gen_docs.bash000077500000000000000000000021431463375617100165630ustar00rootroot00000000000000#!/usr/bin/env bash ## This script just wraps launching a docs build within a container ## Tag is passed on as the first argument ${1} set -e SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P) cd $SCRIPT_PATH ## Clean image directory, without this any artifacts will prevent fetching rm -rf img mkdir -p img if [ ! -d cubimg ]; then git clone -b gh-pages https://github.com/NVlabs/cub.git cubimg fi if [ ! -n "$(find cubimg -name 'example_range.png')" ]; then wget -q https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png -O cubimg/example_range.png fi if [ ! -n "$(find img -name '*.png')" ]; then wget -q https://docs.nvidia.com/cuda/_static/Logo_and_CUDA.png -O img/logo.png # Parse files and collects unique names ending with .png imgs=( $(grep -R -o -h '[[:alpha:][:digit:]_]*.png' ../cub | uniq) ) imgs+=( "cub_overview.png" "nested_composition.png" "tile.png" "blocked.png" "striped.png" ) for img in "${imgs[@]}" do echo ${img} cp cubimg/${img} img/${img} done fi ./repo.sh docs || echo "!!! There were errors while generating" cccl-2.5.0/cub/docs/index.rst000066400000000000000000000523211463375617100160040ustar00rootroot00000000000000CUB ================================================== .. toctree:: :hidden: :maxdepth: 3 modules developer_overview test_overview tuning benchmarking ${repo_docs_api_path}/CUB_api .. 
the line below can be used to use the README.md file as the index page .. .. mdinclude:: ../README.md What is CUB? ================================================== CUB provides state-of-the-art, reusable software components for every layer of the CUDA programming model: * **Parallel primitives** * :ref:`Warp-wide ` "collective" primitives * Cooperative warp-wide prefix scan, reduction, etc. * Safely specialized for each underlying CUDA architecture * :ref:`Block-wide ` "collective" primitives * Cooperative I/O, sort, scan, reduction, histogram, etc. * Compatible with arbitrary thread block sizes and types * :ref:`Device-wide ` primitives * Parallel sort, prefix scan, reduction, histogram, etc. * Compatible with CUDA dynamic parallelism * **Utilities** * **Fancy iterators** * **Thread and thread block I/O** * **PTX intrinsics** * **Device, kernel, and storage management** .. _collective-primitives: CUB's collective primitives ================================================== Collective software primitives are essential for constructing high-performance, maintainable CUDA kernel code. Collectives allow complex parallel code to be re-used rather than re-implemented, and to be re-compiled rather than hand-ported. .. figure:: img/cub_overview.png :align: center :alt: Orientation of collective primitives within the CUDA software stack :name: fig_cub_overview Orientation of collective primitives within the CUDA software stack As a SIMT programming model, CUDA engenders both **scalar** and **collective** software interfaces. Traditional software interfaces are *scalar* : a single thread invokes a library routine to perform some operation (which may include spawning parallel subtasks). Alternatively, a *collective* interface is entered simultaneously by a group of parallel threads to perform some cooperative operation. CUB's collective primitives are not bound to any particular width of parallelism or data type. This flexibility makes them: * **Adaptable** to fit the needs of the enclosing kernel computation * **Trivially tunable** to different grain sizes (threads per block, items per thread, etc.) Thus CUB is *CUDA Unbound*. An example (block-wide sorting) ================================================== The following code snippet presents a CUDA kernel in which each block of ``BLOCK_THREADS`` threads will collectively load, sort, and store its own segment of (``BLOCK_THREADS * ITEMS_PER_THREAD``) integer keys: .. 
code-block:: c++ #include // // Block-sorting CUDA kernel // template __global__ void BlockSortKernel(int *d_in, int *d_out) { // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types typedef cub::BlockLoad< int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE> BlockLoadT; typedef cub::BlockStore< int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_TRANSPOSE> BlockStoreT; typedef cub::BlockRadixSort< int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT; // Allocate type-safe, repurposable shared memory for collectives __shared__ union { typename BlockLoadT::TempStorage load; typename BlockStoreT::TempStorage store; typename BlockRadixSortT::TempStorage sort; } temp_storage; // Obtain this block's segment of consecutive keys (blocked across threads) int thread_keys[ITEMS_PER_THREAD]; int block_offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD); BlockLoadT(temp_storage.load).Load(d_in + block_offset, thread_keys); __syncthreads(); // Barrier for smem reuse // Collectively sort the keys BlockRadixSortT(temp_storage.sort).Sort(thread_keys); __syncthreads(); // Barrier for smem reuse // Store the sorted segment BlockStoreT(temp_storage.store).Store(d_out + block_offset, thread_keys); } .. code-block:: c++ // Elsewhere in the host program: parameterize and launch a block-sorting // kernel in which blocks of 128 threads each sort segments of 2048 keys int *d_in = ...; int *d_out = ...; int num_blocks = ...; BlockSortKernel<128, 16><<>>(d_in, d_out); In this example, threads use ``cub::BlockLoad``, ``cub::BlockRadixSort``, and ``cub::BlockStore`` to collectively load, sort and store the block's segment of input items. Because these operations are cooperative, each primitive requires an allocation of shared memory for threads to communicate through. The typical usage pattern for a CUB collective is: #. Statically specialize the primitive for the specific problem setting at hand, e.g., the data type being sorted, the number of threads per block, the number of keys per thread, optional algorithmic alternatives, etc. (CUB primitives are also implicitly specialized by the targeted compilation architecture.) #. Allocate (or alias) an instance of the specialized primitive's nested ``TempStorage`` type within a shared memory space. #. Specify communication details (e.g., the ``TempStorage`` allocation) to construct an instance of the primitive. #. Invoke methods on the primitive instance. In particular, ``cub::BlockRadixSort`` is used to collectively sort the segment of data items that have been partitioned across the thread block. To provide coalesced accesses to device memory, we configure the cub::BlockLoad and cub::BlockStore primitives to access memory using a striped access pattern (where consecutive threads simultaneously access consecutive items) and then *transpose* the keys into a :ref:`blocked arrangement ` of elements across threads. To reuse shared memory across all three primitives, the thread block statically allocates a union of their ``TempStorage`` types. Why do you need CUB? ================================================== Writing, tuning, and maintaining kernel code is perhaps the most challenging, time-consuming aspect of CUDA programming. Kernel software is where the complexity of parallelism is expressed. Programmers must reason about deadlock, livelock, synchronization, race conditions, shared memory layout, plurality of state, granularity, throughput, latency, memory bottlenecks, etc. 
With the exception of CUB, however, there are few (if any) software libraries of *reusable* kernel primitives. In the CUDA ecosystem, CUB is unique in this regard. As a `SIMT `_ library and software abstraction layer, CUB provides: #. **Simplicity of composition**. CUB enhances programmer productivity by allowing complex parallel operations to be easily sequenced and nested. For example, ``cub::BlockRadixSort`` is constructed from cub::BlockExchange and ``cub::BlockRadixRank``. The latter is composed of cub::BlockScan which incorporates cub::WarpScan. .. figure:: img/nested_composition.png :align: center #. **High performance**. CUB simplifies high-performance program and kernel development by taking care to implement the state-of-the-art in parallel algorithms. #. **Performance portability**. CUB primitives are specialized to match the diversity of NVIDIA hardware, continuously evolving to accommodate new architecture-specific features and instructions. And because CUB's device-wide primitives are implemented using flexible block-wide and warp-wide collectives, we are able to performance-tune them to match the processor resources provided by each CUDA processor architecture. #. **Simplicity of performance tuning**: * **Resource utilization**. CUB primitives allow developers to quickly change grain sizes (threads per block, items per thread, etc.) to best match the processor resources of their target architecture * **Variant tuning**. Most CUB primitives support alternative algorithmic strategies. For example, cub::BlockHistogram is parameterized to implement either an atomic-based approach or a sorting-based approach. (The latter provides uniform performance regardless of input distribution.) * **Co-optimization**. When the enclosing kernel is similarly parameterizable, a tuning configuration can be found that optimally accommodates their combined register and shared memory pressure. #. **Robustness and durability**. CUB just works. CUB primitives are designed to function properly for arbitrary data types and widths of parallelism (not just for the built-in C++ types or for powers-of-two threads per block). #. **Reduced maintenance burden**. CUB provides a SIMT software abstraction layer over the diversity of CUDA hardware. With CUB, applications can enjoy performance-portability without intensive and costly rewriting or porting efforts. #. **A path for language evolution**. CUB primitives are designed to easily accommodate new features in the CUDA programming model, e.g., thread subgroups and named barriers, dynamic shared memory allocators, etc. How do CUB collectives work? ================================================== Four programming idioms are central to the design of CUB: #. :ref:`Generic programming `. C++ templates provide the flexibility and adaptive code generation needed for CUB primitives to be useful, reusable, and fast in arbitrary kernel settings. #. :ref:`Reflective class interfaces `. CUB collectives statically export their their resource requirements (e.g., shared memory size and layout) for a given specialization, which allows compile-time tuning decisions and resource allocation. #. :ref:`Flexible data arrangement across threads `. CUB collectives operate on data that is logically partitioned across a group of threads. For most collective operations, efficiency is increased with increased granularity (i.e., items per thread). #. :ref:`Static tuning and co-tuning `. 
Simple constants and static types dictate the granularities and algorithmic alternatives to be employed by CUB collectives. When the enclosing kernel is similarly parameterized, an optimal configuration can be determined that best accommodates the combined behavior and resource consumption of all primitives within the kernel. .. _generic-programming: Generic programming -------------------------------------------------- We use template parameters to specialize CUB primitives for the particular problem setting at hand. Until compile time, CUB primitives are not bound to any particular: * Data type (int, float, double, etc.) * Width of parallelism (threads per thread block) * Grain size (data items per thread) * Underlying processor (special instructions, warp size, rules for bank conflicts, etc.) * Tuning configuration (e.g., latency vs. throughput, algorithm selection, etc.) .. _reflective-class-interfaces: Reflective class interfaces -------------------------------------------------- Unlike traditional function-oriented interfaces, CUB exposes its collective primitives as templated C++ classes. The resource requirements for a specific parameterization are reflectively advertised as members of the class. The resources can then be statically or dynamically allocated, aliased to global or shared memory, etc. The following illustrates a CUDA kernel fragment performing a collective prefix sum across the threads of a thread block: .. code-block:: c++ #include __global__ void SomeKernelFoo(...) { // Specialize BlockScan for 128 threads on integer types typedef cub::BlockScan BlockScan; // Allocate shared memory for BlockScan __shared__ typename BlockScan::TempStorage scan_storage; ... // Obtain a segment of consecutive items that are blocked across threads int thread_data_in[4]; int thread_data_out[4]; ... // Perform an exclusive block-wide prefix sum BlockScan(scan_storage).ExclusiveSum(thread_data_in, thread_data_out); Furthermore, the CUB interface is designed to separate parameter fields by concerns. CUB primitives have three distinct parameter fields: #. *Static template parameters*. These are constants that will dictate the storage layout and the unrolling of algorithmic steps (e.g., the input data type and the number of block threads), and are used to specialize the class. #. *Constructor parameters*. These are optional parameters regarding inter-thread communication (e.g., storage allocation, thread-identifier mapping, named barriers, etc.), and are orthogonal to the functions exposed by the class. #. *Formal method parameters*. These are the operational inputs/outputs for the various functions exposed by the class. This allows CUB types to easily accommodate new programming model features (e.g., named barriers, memory allocators, etc.) without incurring a combinatorial growth of interface methods. .. _flexible-data-arrangement: Flexible data arrangement across threads -------------------------------------------------- CUDA kernels are often designed such that each thread block is assigned a segment of data items for processing. .. figure:: img/tile.png :align: center :alt: Segment of eight ordered data items :name: fig_tile Segment of eight ordered data items When the tile size equals the thread block size, the mapping of data onto threads is straightforward (one datum per thread). However, there are often performance advantages for processing more than one datum per thread. Increased granularity corresponds to decreased communication overhead. 
For these scenarios, CUB primitives will specify which of the following partitioning alternatives they accommodate: .. list-table:: :class: table-no-stripes :widths: 70 30 * - **Blocked arrangement**. The aggregate tile of items is partitioned evenly across threads in "blocked" fashion with *thread*\ :sub:`i` owning the *i*\ :sup:`th` segment of consecutive elements. Blocked arrangements are often desirable for algorithmic benefits (where long sequences of items can be processed sequentially within each thread). - .. figure:: img/blocked.png :align: center :alt: *Blocked* arrangement across four threads :name: fig_blocked *Blocked* arrangement across four threads (emphasis on items owned by *thread*\ :sub:`0`) * - **Striped arrangement**. The aggregate tile of items is partitioned across threads in "striped" fashion, i.e., the ``ITEMS_PER_THREAD`` items owned by each thread have logical stride ``BLOCK_THREADS`` between them. Striped arrangements are often desirable for data movement through global memory (where `read/write coalescing `_ is an important performance consideration). - .. figure:: img/striped.png :align: center :alt: *Striped* arrangement across four threads :name: fig_striped *Striped* arrangement across four threads (emphasis on items owned by *thread*\ :sub:`0`) The benefits of processing multiple items per thread (a.k.a., *register blocking*, *granularity coarsening*, etc.) include: * Algorithmic efficiency. Sequential work over multiple items in thread-private registers is cheaper than synchronized, cooperative work through shared memory spaces. * Data occupancy. The number of items that can be resident on-chip in thread-private register storage is often greater than the number of schedulable threads. * Instruction-level parallelism. Multiple items per thread also facilitates greater ILP for improved throughput and utilization. Finally, cub::BlockExchange provides operations for converting between blocked and striped arrangements. .. _static-tuning-and-co-tuning: Static tuning and co-tuning -------------------------------------------------- This style of flexible interface simplifies performance tuning. Most CUB primitives support alternative algorithmic strategies that can be statically targeted by a compiler-based or JIT-based autotuner. (For example, cub::BlockHistogram is parameterized to implement either an atomic-based approach or a sorting-based approach.) Algorithms are also tunable over parameters such as thread count and grain size as well. Taken together, each of the CUB algorithms provides a fairly rich tuning space. Whereas conventional libraries are optimized offline and in isolation, CUB provides interesting opportunities for whole-program optimization. For example, each CUB primitive is typically parameterized by threads-per-block and items-per-thread, both of which affect the underlying algorithm's efficiency and resource requirements. When the enclosing kernel is similarly parameterized, the coupled CUB primitives adjust accordingly. This enables autotuners to search for a single configuration that maximizes the performance of the entire kernel for a given set of hardware resources. How do I get started using CUB? ================================================== CUB is implemented as a C++ header library. There is no need to build CUB separately. To use CUB primitives in your code, simply: #. Download and unzip the latest CUB distribution #. ``#include`` the "umbrella" ```` header file in your CUDA C++ sources. 
(Or ``#include`` the particular header files that define the CUB primitives you wish to use.) #. Compile your program with NVIDIA's ``nvcc`` CUDA compiler, specifying a ``-I`` include-path flag to reference the location of the CUB header library. We also have collection of simple CUB example programs. How is CUB different than Thrust and Modern GPU? ================================================== CUB and Thrust -------------------------------------------------- CUB and `Thrust `_ share some similarities in that they both provide similar device-wide primitives for CUDA. However, they target different abstraction layers for parallel computing. Thrust abstractions are agnostic of any particular parallel framework (e.g., CUDA, TBB, OpenMP, sequential CPU, etc.). While Thrust has a "backend" for CUDA devices, Thrust interfaces themselves are not CUDA-specific and do not explicitly expose CUDA-specific details (e.g., ``cudaStream_t`` parameters). CUB, on the other hand, is slightly lower-level than Thrust. CUB is specific to CUDA C++ and its interfaces explicitly accommodate CUDA-specific features. Furthermore, CUB is also a library of SIMT collective primitives for block-wide and warp-wide kernel programming. CUB and Thrust are complementary and can be used together. In fact, the CUB project arose out of a maintenance need to achieve better performance-portability within Thrust by using reusable block-wide primitives to reduce maintenance and tuning effort. CUB and Modern GPU -------------------------------------------------- CUB and `Modern GPU `_ also share some similarities in that they both implement similar device-wide primitives for CUDA. However, they serve different purposes for the CUDA programming community. MGPU is a pedagogical tool for high-performance GPU computing, providing clear and concise exemplary code and accompanying commentary. It serves as an excellent source of educational, tutorial, CUDA-by-example material. The MGPU source code is intended to be read and studied, and often favors simplicity at the expense of portability and flexibility. CUB, on the other hand, is a production-quality library whose sources are complicated by support for every version of CUDA architecture, and is validated by an extensive suite of regression tests. Although well-documented, the CUB source text is verbose and relies heavily on C++ template metaprogramming for situational specialization. CUB and MGPU are complementary in that MGPU serves as an excellent descriptive source for many of the algorithmic techniques used by CUB. Stable releases ================================================== CUB releases are labeled using version identifiers having three fields: ``..``. The *epoch* field corresponds to support for a major change or update to the CUDA programming model. The *feature* field corresponds to a stable set of features, functionality, and interface. The *update* field corresponds to a bug-fix or performance update for that feature set. At the moment, we do not publicly provide non-stable releases such as development snapshots, beta releases or rolling releases. (Feel free to contact us if you would like access to such things.) Contributors ================================================== CUB is developed as an open-source project by NVIDIA. The primary contributor is the CCCL team. 
Open Source License ================================================== CUB is available under the `BSD 3-Clause "New" or "Revised" License `_ cccl-2.5.0/cub/docs/modules.rst000066400000000000000000000014531463375617100163450ustar00rootroot00000000000000CUB Modules ================================================== .. toctree:: :hidden: :maxdepth: 2 warp_wide block_wide device_wide CUB provides state-of-the-art, reusable software components for every layer of the CUDA programming model: * **Parallel primitives** * :ref:`Warp-wide ` "collective" primitives * Cooperative warp-wide prefix scan, reduction, etc. * Safely specialized for each underlying CUDA architecture * :ref:`Block-wide ` "collective" primitives * Cooperative I/O, sort, scan, reduction, histogram, etc. * Compatible with arbitrary thread block sizes and types * :ref:`Device-wide ` primitives * Parallel sort, prefix scan, reduction, histogram, etc. * Compatible with CUDA dynamic parallelism cccl-2.5.0/cub/docs/repo.bat000066400000000000000000000002611463375617100155740ustar00rootroot00000000000000@echo off call "%~dp0tools\packman\python.bat" %~dp0tools\repoman\repoman.py %* if %errorlevel% neq 0 ( goto Error ) :Success exit /b 0 :Error exit /b %errorlevel% cccl-2.5.0/cub/docs/repo.sh000077500000000000000000000002071463375617100154430ustar00rootroot00000000000000#!/bin/bash set -e SCRIPT_DIR=$(dirname ${BASH_SOURCE}) cd "$SCRIPT_DIR" exec "tools/packman/python.sh" tools/repoman/repoman.py $@ cccl-2.5.0/cub/docs/repo.toml000066400000000000000000000207471463375617100160140ustar00rootroot00000000000000######################################################################################################################## # Repo tool base settings ######################################################################################################################## [repo_docs] enabled = true name = "CUB" project = "CUB" logo = "img/logo.png" enhanced_search_enabled = true api_output_directory = "api" use_fast_doxygen_conversion = true sphinx_generate_doxygen_groups = true sphinx_generate_doxygen_pages = true sphinx_exclude_patterns = [ "build", "tools", "VERSION.md", "source/extensions/*/docs/Overview.md", "source/extensions/*/docs/CHANGELOG.md", ] # list of files from which to extract documentation. if a directory is specified, # it will be recursively searched. # # paths are relative to ${docs_root}. # # defaults to an empty list. doxygen_input = [ "../cub/*.cuh", "../cub/thread/*.cuh", "../cub/warp/*.cuh", "../cub/block/*.cuh", "../cub/device/*.cuh", "../cub/grid/*.cuh", "../cub/iterator/*.cuh" ] # Using wildcards is also supported in `doxygen_input`. Assuming there are no other `.h` files # in the `include/carb` directory, the above may also be specified as: # doxygen_input = [ # "include/carb/*.h", # "source/examples/example.doxygen/ExampleDoxygen.h", # ] # doxygen allows the creation of custom commands to ease in the documentation process. # for example, this adds a @carb_framework_overview command which creates a link # to a reStructuredText document. defaults to an empty list. 
# # more information on the format can be found at: # https://www.doxygen.nl/manual/config.html#cfg_aliases doxygen_aliases = [ "smemwarpreuse=A subsequent ``__syncwarp()`` warp-wide barrier should be invoked after calling this method if the collective's temporary storage (e.g., ``temp_storage``) is to be reused or repurposed.", "smemreuse=A subsequent ``__syncthreads()`` threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., ``temp_storage``) is to be reused or repurposed.", "smemreuse{1}=After any operation, a subsequent ``__syncthreads()`` barrier is required if the collective's \\1 is to be reused or repurposed", "smemstorage{1}=The operations exposed by \\1 require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the ``__shared__`` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or ``union``'d with other storage allocation types to facilitate memory reuse.", "granularity=Efficiency is increased with increased granularity ``ITEMS_PER_THREAD``. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low. Consider variants of ``cub::BlockLoad`` for efficiently gathering a :ref:`blocked arrangement ` of elements across threads.", "blocksize=The number of threads in the block is a multiple of the architecture's warp size", "ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 350 for sm_35). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)", "blockcollective{1}=Every thread in the block uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.", "warpcollective{1}=Every thread in the warp uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking or more collective member functions.", "devicestorage=When ``d_temp_storage`` is ``nullptr``, no work is done and the required allocation size is returned in ``temp_storage_bytes``.", "devicestorageP=This operation requires a relatively small allocation of temporary device storage that is ``O(P)``, where ``P`` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size ``N``).", "devicestorageNP=This operation requires an allocation of temporary device storage that is ``O(N+P)``, where ``N`` is the length of the input and ``P`` is the number of streaming multiprocessors on the device.", "devicestorageNCP=This operation requires a relatively small allocation of temporary device storage that is ``O(N/C + P)``, where ``N`` is the length of the input, ``C`` is the number of concurrent threads that can be actively scheduled on each streaming multiprocessor (typically several thousand), and ``P`` is the number of streaming multiprocessors on the device.", "cdp_class{1}= - Dynamic parallelism. \\1 methods can be called within kernel code on devices in which CUDA dynamic parallelism is supported.", "iterator=(may be a simple pointer type)", "offset_size1=(Consider using 32-bit values as offsets/lengths/etc. 
For example, ``int`` will typically yeild better performance than ``size_t`` in 64-bit memory mode.)", "offset_size2=Careful consideration should be given to the size of integer types used for offsets and lengths. Many (if not most) scenarios will only require 32-bit offsets (e.g., ``int``). 64-bit offset types (e.g., ``size_t`` on 64-bit memory mode) can consume a significant amount of thread storage resources, adversely affecting processor occupancy and performance.", "rowmajor=For multi-dimensional blocks, threads are linearly ranked in row-major order.", "blocked=Assumes a :ref:`blocked arrangement ` of (*block-threads* * *items-per-thread*) items across the thread block, where *thread*\\ :sub:`i` owns the *i*\\ :sup:`th` range of *items-per-thread* contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.", "striped=Assumes a :ref:`striped arrangement ` of (*block-threads* * *items-per-thread*) items across the thread block, where *thread*\\ :sub:`i` owns items (*i*), (*i* + *block-threads*), ..., (*i* + (*block-threads* * (*items-per-thread* - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.", "warpstriped=Assumes a *warp-striped arrangement* of elements across threads, where warp\\ :sub:`i` owns the *i*\\ :sup:`th` range of (*warp-threads* * *items-per-thread*) contiguous items, and each thread owns items (*i*), (*i* + *warp-threads*), ..., (*i* + (*warp-threads* * (*items-per-thread* - 1))).", "linear_performance{1}=The work-complexity of \\1 as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU." , "plots_below=Performance plots for other scenarios can be found in the detailed method descriptions below.", "identityzero=This operation assumes the value of obtained by the ``T``'s default constructor (or by zero-initialization if no user-defined default constructor exists) is suitable as the identity value \"zero\" for addition.", "lookback=`decoupled look-back `_" ] # doxygen sometimes gets confused by macros. the array below allows the user to # tell doxygen how to expand a macro. defaults to an empty list. # # more information on the format can be found at: # https://www.doxygen.nl/manual/config.html#cfg_predefined doxygen_predefined = [ "_CCCL_HOST", "_CCCL_DEVICE", "_CCCL_HOST_DEVICE", "_CCCL_FORCEINLINE", "_CCCL_STD_VER", "CUB_DISABLE_NAMESPACE_MAGIC", "CUB_IGNORE_NAMESPACE_MAGIC_ERROR", "CUB_NAMESPACE_BEGIN=namespace cub {", "CUB_NAMESPACE_END=}", "DOXYGEN_SHOULD_SKIP_THIS", "DOXYGEN_ACTIVE", "__device__", "__host__", "__forceinline__", "__declspec(x)=", "__align__(x)=", "CUB_DEPRECATED", "CUB_STATIC_ASSERT(cond,msg)=", "CUB_RUNTIME_FUNCTION", "CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED", "CUB_IGNORE_DEPRECATED_CPP_DIALECT" ] # make sure to use ./fetch_imgs.sh doxygen_conf_extra = """ IMAGE_PATH = ${config_root}/img DOXYFILE_ENCODING = UTF-8 INPUT_ENCODING = UTF-8 EXTENSION_MAPPING = cuh=c++ cu=c++ EXAMPLE_PATH = ../examples/device EXAMPLE_RECURSIVE = NO EXAMPLE_PATTERNS = *.cu EXCLUDE_SYMBOLS = "*detail*" "CUB_DETAIL*" AUTOLINK_SUPPORT = YES """ cccl-2.5.0/cub/docs/test_overview.rst000066400000000000000000000264001463375617100176010ustar00rootroot00000000000000CUB Testing Overview ########################### .. warning:: CUB is in the progress of migrating to [Catch2](https://github.com/catchorg/Catch2) framework. 
CUB tests rely on `CPM <https://github.com/cpm-cmake/CPM.cmake>`_ to fetch `Catch2 <https://github.com/catchorg/Catch2>`_, which is used as our main testing framework, along with `metal <https://github.com/brunocodutra/metal>`_, which is used as the template metaprogramming backend for some of the test macro implementations. Currently, legacy tests coexist with Catch2 ones. This guide is focused on new tests. .. important:: Instead of including ```` directly, use ``catch2_test_helper.h``. .. code-block:: c++ #include #include #include "catch2_test_helper.h" Directory and File Naming ************************************* Our tests can be found in the ``test`` directory. Legacy tests have the following naming scheme: ``test_SCOPE_FACILITY.cu``. For instance, here are the reduce tests: .. code-block:: c++ test/test_warp_reduce.cu test/test_block_reduce.cu test/test_device_reduce.cu Catch2-based tests have a different naming scheme: ``catch2_test_SCOPE_FACILITY.cu``. The prefix is essential since that's how CMake finds tests and distinguishes new tests from legacy ones. Test Structure ************************************* Base case ===================================== Let's start with a simple example. Say there's no need to cover many types with your test. .. code-block:: c++ // 0) Define test name and tags CUB_TEST("SCOPE FACILITY works with CONDITION", "[FACILITY][SCOPE]") { using type = std::int32_t; constexpr int threads_per_block = 256; constexpr int num_items = threads_per_block; // 1) Allocate device input c2h::device_vector<type> d_input(num_items); // 2) Generate 3 random input arrays using Catch2 helper c2h::gen(CUB_SEED(3), d_input); // 3) Allocate output array c2h::device_vector<type> d_output(d_input.size()); // 4) Copy device input to host c2h::host_vector<type> h_reference = d_input; // 5) Compute reference output std::ALGORITHM( thrust::raw_pointer_cast(h_reference.data()), thrust::raw_pointer_cast(h_reference.data()) + h_reference.size()); // 6) Compute CUB output SCOPE_ALGORITHM(d_input.data(), d_output.data(), d_input.size()); // 7) Compare device and host results REQUIRE( d_input == d_output ); } We introduce test cases with the ``CUB_TEST`` macro in (0). This macro always takes two string arguments: a free-form test name and one or more tags. Then, in (1), we allocate device memory using ``c2h::device_vector``. ``c2h::device_vector`` and ``c2h::host_vector`` behave similarly to their Thrust counterparts, but are modified to provide more stable behavior in some testing edge cases. These must be used in place of Thrust vectors and manual allocations, unless the test code is being used for documentation examples. Similarly, any Thrust algorithms that execute on the device must be invoked with the ``c2h::device_policy`` execution policy (not shown here) to support the same edge cases. The memory is filled with random data in (2). Generator ``c2h::gen`` takes two parameters. The first one is a random generator seed. Instead of providing a single value, we use the ``CUB_SEED`` macro. The macro expects the number of seeds that has to be generated. In the example above, we require three random seeds to be generated. This leads to the whole test being executed three times with different seed values. Later, in (3) and (4), we allocate the device output and the host reference. In (4) we copy the device input to the host, and in (5) we compute the reference result on the host. Then we launch the CUB algorithm in (6). At this point, we have a reference solution on the CPU and the CUB solution on the GPU. The two can be compared with the ``REQUIRE`` assertion. ..
warning:: Standard algorithms (``std::``) have to be used as much as possible when computing reference solutions. If your test has to cover floating point types, it's sufficient to replace ``REQUIRE( a == b )`` with ``REQUIRE_APPROX_EQ(a, b)``. It's strongly advised to always use ``c2h::gen`` to produce input data. Other data generation methods might be used if absolutely necessary in tests of corner cases. Do not use ``assert`` in tests. We run CUB tests in release mode. The issue with ``assert`` is that it only works in debug mode. If a custom type has to be tested, the following helper should be used: .. code-block:: c++ using type = c2h::custom_type_t<c2h::equal_comparable_t, c2h::accumulateable_t>; Here we enumerate all the type properties that we are interested in. The produced type ends up having ``operator==`` and ``operator+``. There are more properties implemented. If some property is missing, it'd be better to add one in ``c2h`` instead of writing a custom type from scratch. Type Lists ===================================== Since CUB is a generic library, it's often required to test CUB algorithms against many types. To do so, it's sufficient to define a type list and provide it to the ``CUB_TEST`` macro. .. code-block:: c++ // 0) Define type list using types = c2h::type_list<std::uint8_t, std::uint32_t>; CUB_TEST("SCOPE FACILITY works with CONDITION", "[FACILITY][SCOPE]", types) // 1) Provide it to the test case { // 2) Access current type with `c2h::get` using type = typename c2h::get<0, TestType>; // ... } This will lead to the test running two times. The first run will cause the ``type`` to be ``std::uint8_t``. The second one will cause ``type`` to be ``std::uint32_t``. .. warning:: It's important to use fixed-width types from ``std::`` (such as ``std::uint8_t``) instead of primitive types like ``char`` and ``int``. Multidimensional Configuration Spaces ===================================== In most cases, the input data type is not the only compile-time parameter we want to vary. For instance, you might need to test a block algorithm for different data types **and** different thread block sizes. To do so, you can add another type list as follows: .. code-block:: c++ using block_sizes = c2h::enum_type_list<int, 128, 256>; using types = c2h::type_list<std::uint8_t, std::int32_t>; CUB_TEST("SCOPE FACILITY works with CONDITION", "[FACILITY][SCOPE]", types, block_sizes) { using type = typename c2h::get<0, TestType>; constexpr int threads_per_block = c2h::get<1, TestType>::value; // ... } The code above leads to the following combinations being compiled: - ``type = std::uint8_t``, ``threads_per_block = 128`` - ``type = std::uint8_t``, ``threads_per_block = 256`` - ``type = std::int32_t``, ``threads_per_block = 128`` - ``type = std::int32_t``, ``threads_per_block = 256`` As an example, the following test case includes both multidimensional configuration spaces and multiple random sequence generations. .. code-block:: c++ using block_sizes = c2h::enum_type_list<int, 128, 256>; using types = c2h::type_list<std::uint8_t, std::int32_t>; CUB_TEST("SCOPE FACILITY works with CONDITION", "[FACILITY][SCOPE]", types, block_sizes) { using type = typename c2h::get<0, TestType>; constexpr int threads_per_block = c2h::get<1, TestType>::value; // ...
c2h::device_vector<type> d_input(5); c2h::gen(CUB_SEED(2), d_input); } The code above leads to the following combinations being compiled: - ``type = std::uint8_t``, ``threads_per_block = 128``, 1st randomly generated input sequence - ``type = std::uint8_t``, ``threads_per_block = 256``, 1st randomly generated input sequence - ``type = std::int32_t``, ``threads_per_block = 128``, 1st randomly generated input sequence - ``type = std::int32_t``, ``threads_per_block = 256``, 1st randomly generated input sequence - ``type = std::uint8_t``, ``threads_per_block = 128``, 2nd randomly generated input sequence - ``type = std::uint8_t``, ``threads_per_block = 256``, 2nd randomly generated input sequence - ``type = std::int32_t``, ``threads_per_block = 128``, 2nd randomly generated input sequence - ``type = std::int32_t``, ``threads_per_block = 256``, 2nd randomly generated input sequence Each new generator multiplies the number of test executions by its number of seeds. That means that if there were additional sequence generators (``c2h::gen(CUB_SEED(X), ...)``) in the example above, each of them would multiply the number of test executions by X, and so on. Speedup Compilation Time ===================================== Since type lists passed to ``CUB_TEST`` form a Cartesian product, compilation time grows quickly with every new dimension. To keep the compilation process parallelized, it's possible to rely on the ``%PARAM%`` machinery: .. code-block:: c++ // %PARAM% BLOCK_SIZE bs 128:256 using block_sizes = c2h::enum_type_list<int, 128, 256>; using types = c2h::type_list<std::uint8_t, std::int32_t>; CUB_TEST("SCOPE FACILITY works with CONDITION", "[FACILITY][SCOPE]", types, block_sizes) { using type = typename c2h::get<0, TestType>; constexpr int threads_per_block = c2h::get<1, TestType>::value; // ... } The comment with ``%PARAM%`` is recognized at CMake level. It leads to multiple executables being produced from a single test source. .. code-block:: bash bin/cub.test.scope_algorithm.bs_128 bin/cub.test.scope_algorithm.bs_256 Multiple ``%PARAM%`` comments can be specified, forming another Cartesian product. Final Test ===================================== Let's consider the final test that illustrates all of the tools we discussed above: .. code-block:: c++ // %PARAM% BLOCK_SIZE bs 128:256 using block_sizes = c2h::enum_type_list<int, 128, 256>; using types = c2h::type_list<std::uint8_t, std::int32_t>; CUB_TEST("SCOPE FACILITY works with CONDITION", "[FACILITY][SCOPE]", types, block_sizes) { using type = typename c2h::get<0, TestType>; constexpr int threads_per_block = c2h::get<1, TestType>::value; constexpr int max_num_items = threads_per_block; c2h::device_vector<type> d_input( GENERATE_COPY(take(2, random(0, max_num_items)))); c2h::gen(CUB_SEED(3), d_input); c2h::device_vector<type> d_output(d_input.size()); SCOPE_ALGORITHM(d_input.data(), d_output.data(), d_input.size()); REQUIRE( d_input == d_output ); const type expected_sum = 4; const type sum = thrust::reduce(c2h::device_policy, d_output.cbegin(), d_output.cend()); REQUIRE( sum == expected_sum ); } Apart from the tools discussed above, here we also rely on ``Catch2`` to generate random input sizes in the ``[0, max_num_items]`` range. Overall, the test will produce two executables. Each of these executables is going to generate ``2`` input problem sizes. For each problem size, ``3`` random vectors are generated. As a result, we have ``12`` different tests. This also demonstrates the syntax and usage of ``c2h::device_policy`` with a Thrust algorithm.
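As a closing illustration, the sketch below combines the ``c2h::custom_type_t`` helper described earlier with the same test machinery. It is a minimal, hypothetical example rather than an existing test: the test name and ``num_items`` value are arbitrary, and ``std::ALGORITHM``/``SCOPE_ALGORITHM`` remain placeholders for a real reference algorithm and the CUB algorithm under test, exactly as in the examples above.

.. code-block:: c++

    // Hypothetical sketch: testing a CUB algorithm with a c2h custom type.
    // The custom type provides operator== (equal_comparable_t) and operator+ (accumulateable_t).
    using type = c2h::custom_type_t<c2h::equal_comparable_t, c2h::accumulateable_t>;

    CUB_TEST("SCOPE FACILITY works with custom types", "[FACILITY][SCOPE]")
    {
      constexpr int num_items = 256;

      // Device input filled with random custom-type values (a single seed here).
      c2h::device_vector<type> d_input(num_items);
      c2h::gen(CUB_SEED(1), d_input);

      // Output buffer and a host reference copy of the input.
      c2h::device_vector<type> d_output(d_input.size());
      c2h::host_vector<type> h_reference = d_input;

      // Reference result via a standard algorithm; CUB result on the device.
      std::ALGORITHM(h_reference.begin(), h_reference.end());
      SCOPE_ALGORITHM(d_input.data(), d_output.data(), d_input.size());

      // operator== from equal_comparable_t makes the element-wise comparison valid.
      REQUIRE( h_reference == d_output );
    }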
cccl-2.5.0/cub/docs/tools/000077500000000000000000000000001463375617100153005ustar00rootroot00000000000000cccl-2.5.0/cub/docs/tools/packman/000077500000000000000000000000001463375617100167125ustar00rootroot00000000000000cccl-2.5.0/cub/docs/tools/packman/bootstrap/000077500000000000000000000000001463375617100207275ustar00rootroot00000000000000cccl-2.5.0/cub/docs/tools/packman/bootstrap/configure.bat000077500000000000000000000150171463375617100234070ustar00rootroot00000000000000:: Copyright 2019 NVIDIA CORPORATION :: :: Licensed under the Apache License, Version 2.0 (the "License"); :: you may not use this file except in compliance with the License. :: You may obtain a copy of the License at :: :: http://www.apache.org/licenses/LICENSE-2.0 :: :: Unless required by applicable law or agreed to in writing, software :: distributed under the License is distributed on an "AS IS" BASIS, :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. :: See the License for the specific language governing permissions and :: limitations under the License. set PM_PACKMAN_VERSION=6.57 :: Specify where packman command is rooted set PM_INSTALL_PATH=%~dp0.. :: The external root may already be configured and we should do minimal work in that case if defined PM_PACKAGES_ROOT goto ENSURE_DIR :: If the folder isn't set we assume that the best place for it is on the drive that we are currently :: running from set PM_DRIVE=%CD:~0,2% set PM_PACKAGES_ROOT=%PM_DRIVE%\packman-repo :: We use *setx* here so that the variable is persisted in the user environment echo Setting user environment variable PM_PACKAGES_ROOT to %PM_PACKAGES_ROOT% setx PM_PACKAGES_ROOT %PM_PACKAGES_ROOT% if %errorlevel% neq 0 ( goto ERROR ) :: The above doesn't work properly from a build step in VisualStudio because a separate process is :: spawned for it so it will be lost for subsequent compilation steps - VisualStudio must :: be launched from a new process. We catch this odd-ball case here: if defined PM_DISABLE_VS_WARNING goto ENSURE_DIR if not defined VSLANG goto ENSURE_DIR echo The above is a once-per-computer operation. Unfortunately VisualStudio cannot pick up environment change echo unless *VisualStudio is RELAUNCHED*. echo If you are launching VisualStudio from command line or command line utility make sure echo you have a fresh launch environment (relaunch the command line or utility). echo If you are using 'linkPath' and referring to packages via local folder links you can safely ignore this warning. echo You can disable this warning by setting the environment variable PM_DISABLE_VS_WARNING. echo. :: Check for the directory that we need. 
Note that mkdir will create any directories :: that may be needed in the path :ENSURE_DIR if not exist "%PM_PACKAGES_ROOT%" ( echo Creating packman packages cache at %PM_PACKAGES_ROOT% mkdir "%PM_PACKAGES_ROOT%" ) if %errorlevel% neq 0 ( goto ERROR_MKDIR_PACKAGES_ROOT ) :: The Python interpreter may already be externally configured if defined PM_PYTHON_EXT ( set PM_PYTHON=%PM_PYTHON_EXT% goto PACKMAN ) set PM_PYTHON_VERSION=3.7.13-nv1-windows-x86_64 set PM_PYTHON_BASE_DIR=%PM_PACKAGES_ROOT%\python set PM_PYTHON_DIR=%PM_PYTHON_BASE_DIR%\%PM_PYTHON_VERSION% set PM_PYTHON=%PM_PYTHON_DIR%\python.exe if exist "%PM_PYTHON%" goto PACKMAN if not exist "%PM_PYTHON_BASE_DIR%" call :CREATE_PYTHON_BASE_DIR set PM_PYTHON_PACKAGE=python@%PM_PYTHON_VERSION%.cab for /f "delims=" %%a in ('powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0\generate_temp_file_name.ps1"') do set TEMP_FILE_NAME=%%a set TARGET=%TEMP_FILE_NAME%.zip call "%~dp0fetch_file_from_packman_bootstrap.cmd" %PM_PYTHON_PACKAGE% "%TARGET%" if %errorlevel% neq 0 ( echo !!! Error fetching python from CDN !!! goto ERROR ) for /f "delims=" %%a in ('powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0\generate_temp_folder.ps1" -parentPath "%PM_PYTHON_BASE_DIR%"') do set TEMP_FOLDER_NAME=%%a echo Unpacking Python interpreter ... "%SystemRoot%\system32\expand.exe" -F:* "%TARGET%" "%TEMP_FOLDER_NAME%" 1> nul del "%TARGET%" :: Failure during extraction to temp folder name, need to clean up and abort if %errorlevel% neq 0 ( echo !!! Error unpacking python !!! call :CLEAN_UP_TEMP_FOLDER goto ERROR ) :: If python has now been installed by a concurrent process we need to clean up and then continue if exist "%PM_PYTHON%" ( call :CLEAN_UP_TEMP_FOLDER goto PACKMAN ) else ( if exist "%PM_PYTHON_DIR%" ( rd /s /q "%PM_PYTHON_DIR%" > nul ) ) :: Perform atomic rename rename "%TEMP_FOLDER_NAME%" "%PM_PYTHON_VERSION%" 1> nul :: Failure during move, need to clean up and abort if %errorlevel% neq 0 ( echo !!! Error renaming python !!! call :CLEAN_UP_TEMP_FOLDER goto ERROR ) :PACKMAN :: The packman module may already be externally configured if defined PM_MODULE_DIR_EXT ( set PM_MODULE_DIR=%PM_MODULE_DIR_EXT% ) else ( set PM_MODULE_DIR=%PM_PACKAGES_ROOT%\packman-common\%PM_PACKMAN_VERSION% ) set PM_MODULE=%PM_MODULE_DIR%\run.py if exist "%PM_MODULE%" goto ENSURE_7ZA :: Clean out broken PM_MODULE_DIR if it exists if exist "%PM_MODULE_DIR%" ( rd /s /q "%PM_MODULE_DIR%" > nul ) set PM_MODULE_PACKAGE=packman-common@%PM_PACKMAN_VERSION%.zip for /f "delims=" %%a in ('powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0\generate_temp_file_name.ps1"') do set TEMP_FILE_NAME=%%a set TARGET=%TEMP_FILE_NAME% call "%~dp0fetch_file_from_packman_bootstrap.cmd" %PM_MODULE_PACKAGE% "%TARGET%" if %errorlevel% neq 0 ( echo !!! Error fetching packman from CDN !!! goto ERROR ) echo Unpacking ... "%PM_PYTHON%" -S -s -u -E "%~dp0\install_package.py" "%TARGET%" "%PM_MODULE_DIR%" if %errorlevel% neq 0 ( echo !!! Error unpacking packman !!! goto ERROR ) del "%TARGET%" :ENSURE_7ZA set PM_7Za_VERSION=22.01-1 set PM_7Za_PATH=%PM_PACKAGES_ROOT%\7za\%PM_7ZA_VERSION% if exist "%PM_7Za_PATH%" goto END set PM_7Za_PATH=%PM_PACKAGES_ROOT%\chk\7za\%PM_7ZA_VERSION% if exist "%PM_7Za_PATH%" goto END "%PM_PYTHON%" -S -s -u -E "%PM_MODULE%" pull "%PM_MODULE_DIR%\deps.packman.xml" if %errorlevel% neq 0 ( echo !!! Error fetching packman dependencies !!! 
goto ERROR ) goto END :ERROR_MKDIR_PACKAGES_ROOT echo Failed to automatically create packman packages repo at %PM_PACKAGES_ROOT%. echo Please set a location explicitly that packman has permission to write to, by issuing: echo. echo setx PM_PACKAGES_ROOT {path-you-choose-for-storing-packman-packages-locally} echo. echo Then launch a new command console for the changes to take effect and run packman command again. exit /B %errorlevel% :ERROR echo !!! Failure while configuring local machine :( !!! exit /B %errorlevel% :CLEAN_UP_TEMP_FOLDER rd /S /Q "%TEMP_FOLDER_NAME%" exit /B :CREATE_PYTHON_BASE_DIR :: We ignore errors and clean error state - if two processes create the directory one will fail which is fine md "%PM_PYTHON_BASE_DIR%" > nul 2>&1 exit /B 0 :END cccl-2.5.0/cub/docs/tools/packman/bootstrap/download_file_from_url.ps1000066400000000000000000000026251463375617100260740ustar00rootroot00000000000000<# Copyright 2019 NVIDIA CORPORATION Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. #> param( [Parameter(Mandatory=$true)][string]$source=$null, [string]$output="out.exe" ) $filename = $output $triesLeft = 4 $delay = 2 do { $triesLeft -= 1 try { Write-Host "Downloading from bootstrap.packman.nvidia.com ..." $wc = New-Object net.webclient $wc.Downloadfile($source, $fileName) exit 0 } catch { Write-Host "Error downloading $source!" Write-Host $_.Exception|format-list -force if ($triesLeft) { Write-Host "Retrying in $delay seconds ..." Start-Sleep -seconds $delay } $delay = $delay * $delay } } while ($triesLeft -gt 0) # We only get here if the retries have been exhausted, remove any left-overs: if (Test-Path $fileName) { Remove-Item $fileName } exit 1 cccl-2.5.0/cub/docs/tools/packman/bootstrap/fetch_file_from_packman_bootstrap.cmd000077500000000000000000000026141463375617100303240ustar00rootroot00000000000000:: Copyright 2019 NVIDIA CORPORATION :: :: Licensed under the Apache License, Version 2.0 (the "License"); :: you may not use this file except in compliance with the License. :: You may obtain a copy of the License at :: :: http://www.apache.org/licenses/LICENSE-2.0 :: :: Unless required by applicable law or agreed to in writing, software :: distributed under the License is distributed on an "AS IS" BASIS, :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. :: See the License for the specific language governing permissions and :: limitations under the License. :: You need to specify as input to this command @setlocal @set PACKAGE_NAME=%1 @set TARGET_PATH=%2 @echo Fetching %PACKAGE_NAME% ... 
@powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0download_file_from_url.ps1" ^ -source "http://bootstrap.packman.nvidia.com/%PACKAGE_NAME%" -output %TARGET_PATH% :: A bug in powershell prevents the errorlevel code from being set when using the -File execution option :: We must therefore do our own failure analysis, basically make sure the file exists: @if not exist %TARGET_PATH% goto ERROR_DOWNLOAD_FAILED @endlocal @exit /b 0 :ERROR_DOWNLOAD_FAILED @echo Failed to download file from S3 @echo Most likely because endpoint cannot be reached or file %PACKAGE_NAME% doesn't exist @endlocal @exit /b 1 cccl-2.5.0/cub/docs/tools/packman/bootstrap/generate_temp_file_name.ps1000066400000000000000000000240271463375617100261770ustar00rootroot00000000000000<# Copyright 2019 NVIDIA CORPORATION Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. #> $out = [System.IO.Path]::GetTempFileName() Write-Host $out # SIG # Begin signature block # MIIaVwYJKoZIhvcNAQcCoIIaSDCCGkQCAQExDzANBglghkgBZQMEAgEFADB5Bgor # BgEEAYI3AgEEoGswaTA0BgorBgEEAYI3AgEeMCYCAwEAAAQQH8w7YFlLCE63JNLG # KX7zUQIBAAIBAAIBAAIBAAIBADAxMA0GCWCGSAFlAwQCAQUABCAK+Ewup1N0/mdf # 1l4R58rxyumHgZvTmEhrYTb2Zf0zd6CCCiIwggTTMIIDu6ADAgECAhBi50XpIWUh # PJcfXEkK6hKlMA0GCSqGSIb3DQEBCwUAMIGEMQswCQYDVQQGEwJVUzEdMBsGA1UE # ChMUU3ltYW50ZWMgQ29ycG9yYXRpb24xHzAdBgNVBAsTFlN5bWFudGVjIFRydXN0 # IE5ldHdvcmsxNTAzBgNVBAMTLFN5bWFudGVjIENsYXNzIDMgU0hBMjU2IENvZGUg # U2lnbmluZyBDQSAtIEcyMB4XDTE4MDcwOTAwMDAwMFoXDTIxMDcwOTIzNTk1OVow # gYMxCzAJBgNVBAYTAlVTMRMwEQYDVQQIDApDYWxpZm9ybmlhMRQwEgYDVQQHDAtT # YW50YSBDbGFyYTEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMQ8wDQYDVQQL # DAZJVC1NSVMxGzAZBgNVBAMMEk5WSURJQSBDb3Jwb3JhdGlvbjCCASIwDQYJKoZI # hvcNAQEBBQADggEPADCCAQoCggEBALEZN63dA47T4i90jZ84CJ/aWUwVtLff8AyP # YspFfIZGdZYiMgdb8A5tBh7653y0G/LZL6CVUkgejcpvBU/Dl/52a+gSWy2qJ2bH # jMFMKCyQDhdpCAKMOUKSC9rfzm4cFeA9ct91LQCAait4LhLlZt/HF7aG+r0FgCZa # HJjJvE7KNY9G4AZXxjSt8CXS8/8NQMANqjLX1r+F+Hl8PzQ1fVx0mMsbdtaIV4Pj # 5flAeTUnz6+dCTx3vTUo8MYtkS2UBaQv7t7H2B7iwJDakEQKk1XHswJdeqG0osDU # z6+NVks7uWE1N8UIhvzbw0FEX/U2kpfyWaB/J3gMl8rVR8idPj8CAwEAAaOCAT4w # ggE6MAkGA1UdEwQCMAAwDgYDVR0PAQH/BAQDAgeAMBMGA1UdJQQMMAoGCCsGAQUF # BwMDMGEGA1UdIARaMFgwVgYGZ4EMAQQBMEwwIwYIKwYBBQUHAgEWF2h0dHBzOi8v # ZC5zeW1jYi5jb20vY3BzMCUGCCsGAQUFBwICMBkMF2h0dHBzOi8vZC5zeW1jYi5j # b20vcnBhMB8GA1UdIwQYMBaAFNTABiJJ6zlL3ZPiXKG4R3YJcgNYMCsGA1UdHwQk # MCIwIKAeoByGGmh0dHA6Ly9yYi5zeW1jYi5jb20vcmIuY3JsMFcGCCsGAQUFBwEB # BEswSTAfBggrBgEFBQcwAYYTaHR0cDovL3JiLnN5bWNkLmNvbTAmBggrBgEFBQcw # AoYaaHR0cDovL3JiLnN5bWNiLmNvbS9yYi5jcnQwDQYJKoZIhvcNAQELBQADggEB # AIJKh5vKJdhHJtMzATmc1BmXIQ3RaJONOZ5jMHn7HOkYU1JP0OIzb4pXXkH8Xwfr # K6bnd72IhcteyksvKsGpSvK0PBBwzodERTAu1Os2N+EaakxQwV/xtqDm1E3IhjHk # fRshyKKzmFk2Ci323J4lHtpWUj5Hz61b8gd72jH7xnihGi+LORJ2uRNZ3YuqMNC3 # SBC8tAyoJqEoTJirULUCXW6wX4XUm5P2sx+htPw7szGblVKbQ+PFinNGnsSEZeKz # D8jUb++1cvgTKH59Y6lm43nsJjkZU77tNqyq4ABwgQRk6lt8cS2PPwjZvTmvdnla # ZhR0K4of+pQaUQHXVIBdji8wggVHMIIEL6ADAgECAhB8GzU1SufbdOdBXxFpymuo # MA0GCSqGSIb3DQEBCwUAMIG9MQswCQYDVQQGEwJVUzEXMBUGA1UEChMOVmVyaVNp # 
Z24sIEluYy4xHzAdBgNVBAsTFlZlcmlTaWduIFRydXN0IE5ldHdvcmsxOjA4BgNV # BAsTMShjKSAyMDA4IFZlcmlTaWduLCBJbmMuIC0gRm9yIGF1dGhvcml6ZWQgdXNl # IG9ubHkxODA2BgNVBAMTL1ZlcmlTaWduIFVuaXZlcnNhbCBSb290IENlcnRpZmlj # YXRpb24gQXV0aG9yaXR5MB4XDTE0MDcyMjAwMDAwMFoXDTI0MDcyMTIzNTk1OVow # gYQxCzAJBgNVBAYTAlVTMR0wGwYDVQQKExRTeW1hbnRlYyBDb3Jwb3JhdGlvbjEf # MB0GA1UECxMWU3ltYW50ZWMgVHJ1c3QgTmV0d29yazE1MDMGA1UEAxMsU3ltYW50 # ZWMgQ2xhc3MgMyBTSEEyNTYgQ29kZSBTaWduaW5nIENBIC0gRzIwggEiMA0GCSqG # SIb3DQEBAQUAA4IBDwAwggEKAoIBAQDXlUPU3N9nrjn7UqS2JjEEcOm3jlsqujdp # NZWPu8Aw54bYc7vf69F2P4pWjustS/BXGE6xjaUz0wt1I9VqeSfdo9P3Dodltd6t # HPH1NbQiUa8iocFdS5B/wFlOq515qQLXHkmxO02H/sJ4q7/vUq6crwjZOeWaUT5p # XzAQTnFjbFjh8CAzGw90vlvLEuHbjMSAlHK79kWansElC/ujHJ7YpglwcezAR0yP # fcPeGc4+7gRyjhfT//CyBTIZTNOwHJ/+pXggQnBBsCaMbwDIOgARQXpBsKeKkQSg # mXj0d7TzYCrmbFAEtxRg/w1R9KiLhP4h2lxeffUpeU+wRHRvbXL/AgMBAAGjggF4 # MIIBdDAuBggrBgEFBQcBAQQiMCAwHgYIKwYBBQUHMAGGEmh0dHA6Ly9zLnN5bWNk # LmNvbTASBgNVHRMBAf8ECDAGAQH/AgEAMGYGA1UdIARfMF0wWwYLYIZIAYb4RQEH # FwMwTDAjBggrBgEFBQcCARYXaHR0cHM6Ly9kLnN5bWNiLmNvbS9jcHMwJQYIKwYB # BQUHAgIwGRoXaHR0cHM6Ly9kLnN5bWNiLmNvbS9ycGEwNgYDVR0fBC8wLTAroCmg # J4YlaHR0cDovL3Muc3ltY2IuY29tL3VuaXZlcnNhbC1yb290LmNybDATBgNVHSUE # DDAKBggrBgEFBQcDAzAOBgNVHQ8BAf8EBAMCAQYwKQYDVR0RBCIwIKQeMBwxGjAY # BgNVBAMTEVN5bWFudGVjUEtJLTEtNzI0MB0GA1UdDgQWBBTUwAYiSes5S92T4lyh # uEd2CXIDWDAfBgNVHSMEGDAWgBS2d/ppSEefUxLVwuoHMnYH0ZcHGTANBgkqhkiG # 9w0BAQsFAAOCAQEAf+vKp+qLdkLrPo4gVDDjt7nc+kg+FscPRZUQzSeGo2bzAu1x # +KrCVZeRcIP5Un5SaTzJ8eCURoAYu6HUpFam8x0AkdWG80iH4MvENGggXrTL+QXt # nK9wUye56D5+UaBpcYvcUe2AOiUyn0SvbkMo0yF1u5fYi4uM/qkERgSF9xWcSxGN # xCwX/tVuf5riVpLxlrOtLfn039qJmc6yOETA90d7yiW5+ipoM5tQct6on9TNLAs0 # vYsweEDgjY4nG5BvGr4IFYFd6y/iUedRHsl4KeceZb847wFKAQkkDhbEFHnBQTc0 # 0D2RUpSd4WjvCPDiaZxnbpALGpNx1CYCw8BaIzGCD4swgg+HAgEBMIGZMIGEMQsw # CQYDVQQGEwJVUzEdMBsGA1UEChMUU3ltYW50ZWMgQ29ycG9yYXRpb24xHzAdBgNV # BAsTFlN5bWFudGVjIFRydXN0IE5ldHdvcmsxNTAzBgNVBAMTLFN5bWFudGVjIENs # YXNzIDMgU0hBMjU2IENvZGUgU2lnbmluZyBDQSAtIEcyAhBi50XpIWUhPJcfXEkK # 6hKlMA0GCWCGSAFlAwQCAQUAoHwwEAYKKwYBBAGCNwIBDDECMAAwGQYJKoZIhvcN # AQkDMQwGCisGAQQBgjcCAQQwHAYKKwYBBAGCNwIBCzEOMAwGCisGAQQBgjcCARUw # LwYJKoZIhvcNAQkEMSIEIPW+EpFrZSdzrjFFo0UT+PzFeYn/GcWNyWFaU/JMrMfR # MA0GCSqGSIb3DQEBAQUABIIBAA8fmU/RJcF9t60DZZAjf8FB3EZddOaHgI9z40nV # CnfTGi0OEYU48Pe9jkQQV2fABpACfW74xmNv3QNgP2qP++mkpKBVv28EIAuINsFt # YAITEljLN/VOVul8lvjxar5GSFFgpE5F6j4xcvI69LuCWbN8cteTVsBGg+eGmjfx # QZxP252z3FqPN+mihtFegF2wx6Mg6/8jZjkO0xjBOwSdpTL4uyQfHvaPBKXuWxRx # ioXw4ezGAwkuBoxWK8UG7Qu+7CSfQ3wMOjvyH2+qn30lWEsvRMdbGAp7kvfr3EGZ # a3WN7zXZ+6KyZeLeEH7yCDzukAjptaY/+iLVjJsuzC6tCSqhgg1EMIINQAYKKwYB # BAGCNwMDATGCDTAwgg0sBgkqhkiG9w0BBwKggg0dMIINGQIBAzEPMA0GCWCGSAFl # AwQCAQUAMHcGCyqGSIb3DQEJEAEEoGgEZjBkAgEBBglghkgBhv1sBwEwMTANBglg # hkgBZQMEAgEFAAQg14BnPazQkW9whhZu1d0bC3lqqScvxb3SSb1QT8e3Xg0CEFhw # aMBZ2hExXhr79A9+bXEYDzIwMjEwNDA4MDkxMTA5WqCCCjcwggT+MIID5qADAgEC # AhANQkrgvjqI/2BAIc4UAPDdMA0GCSqGSIb3DQEBCwUAMHIxCzAJBgNVBAYTAlVT # MRUwEwYDVQQKEwxEaWdpQ2VydCBJbmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5j # b20xMTAvBgNVBAMTKERpZ2lDZXJ0IFNIQTIgQXNzdXJlZCBJRCBUaW1lc3RhbXBp # bmcgQ0EwHhcNMjEwMTAxMDAwMDAwWhcNMzEwMTA2MDAwMDAwWjBIMQswCQYDVQQG # EwJVUzEXMBUGA1UEChMORGlnaUNlcnQsIEluYy4xIDAeBgNVBAMTF0RpZ2lDZXJ0 # IFRpbWVzdGFtcCAyMDIxMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA # wuZhhGfFivUNCKRFymNrUdc6EUK9CnV1TZS0DFC1JhD+HchvkWsMlucaXEjvROW/ # m2HNFZFiWrj/ZwucY/02aoH6KfjdK3CF3gIY83htvH35x20JPb5qdofpir34hF0e # dsnkxnZ2OlPR0dNaNo/Go+EvGzq3YdZz7E5tM4p8XUUtS7FQ5kE6N1aG3JMjjfdQ # Jehk5t3Tjy9XtYcg6w6OLNUj2vRNeEbjA4MxKUpcDDGKSoyIxfcwWvkUrxVfbENJ # 
Cf0mI1P2jWPoGqtbsR0wwptpgrTb/FZUvB+hh6u+elsKIC9LCcmVp42y+tZji06l # chzun3oBc/gZ1v4NSYS9AQIDAQABo4IBuDCCAbQwDgYDVR0PAQH/BAQDAgeAMAwG # A1UdEwEB/wQCMAAwFgYDVR0lAQH/BAwwCgYIKwYBBQUHAwgwQQYDVR0gBDowODA2 # BglghkgBhv1sBwEwKTAnBggrBgEFBQcCARYbaHR0cDovL3d3dy5kaWdpY2VydC5j # b20vQ1BTMB8GA1UdIwQYMBaAFPS24SAd/imu0uRhpbKiJbLIFzVuMB0GA1UdDgQW # BBQ2RIaOpLqwZr68KC0dRDbd42p6vDBxBgNVHR8EajBoMDKgMKAuhixodHRwOi8v # Y3JsMy5kaWdpY2VydC5jb20vc2hhMi1hc3N1cmVkLXRzLmNybDAyoDCgLoYsaHR0 # cDovL2NybDQuZGlnaWNlcnQuY29tL3NoYTItYXNzdXJlZC10cy5jcmwwgYUGCCsG # AQUFBwEBBHkwdzAkBggrBgEFBQcwAYYYaHR0cDovL29jc3AuZGlnaWNlcnQuY29t # ME8GCCsGAQUFBzAChkNodHRwOi8vY2FjZXJ0cy5kaWdpY2VydC5jb20vRGlnaUNl # cnRTSEEyQXNzdXJlZElEVGltZXN0YW1waW5nQ0EuY3J0MA0GCSqGSIb3DQEBCwUA # A4IBAQBIHNy16ZojvOca5yAOjmdG/UJyUXQKI0ejq5LSJcRwWb4UoOUngaVNFBUZ # B3nw0QTDhtk7vf5EAmZN7WmkD/a4cM9i6PVRSnh5Nnont/PnUp+Tp+1DnnvntN1B # Ion7h6JGA0789P63ZHdjXyNSaYOC+hpT7ZDMjaEXcw3082U5cEvznNZ6e9oMvD0y # 0BvL9WH8dQgAdryBDvjA4VzPxBFy5xtkSdgimnUVQvUtMjiB2vRgorq0Uvtc4GEk # JU+y38kpqHNDUdq9Y9YfW5v3LhtPEx33Sg1xfpe39D+E68Hjo0mh+s6nv1bPull2 # YYlffqe0jmd4+TaY4cso2luHpoovMIIFMTCCBBmgAwIBAgIQCqEl1tYyG35B5AXa # NpfCFTANBgkqhkiG9w0BAQsFADBlMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGln # aUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMSQwIgYDVQQDExtE # aWdpQ2VydCBBc3N1cmVkIElEIFJvb3QgQ0EwHhcNMTYwMTA3MTIwMDAwWhcNMzEw # MTA3MTIwMDAwWjByMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5j # MRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMTEwLwYDVQQDEyhEaWdpQ2VydCBT # SEEyIEFzc3VyZWQgSUQgVGltZXN0YW1waW5nIENBMIIBIjANBgkqhkiG9w0BAQEF # AAOCAQ8AMIIBCgKCAQEAvdAy7kvNj3/dqbqCmcU5VChXtiNKxA4HRTNREH3Q+X1N # aH7ntqD0jbOI5Je/YyGQmL8TvFfTw+F+CNZqFAA49y4eO+7MpvYyWf5fZT/gm+vj # RkcGGlV+Cyd+wKL1oODeIj8O/36V+/OjuiI+GKwR5PCZA207hXwJ0+5dyJoLVOOo # CXFr4M8iEA91z3FyTgqt30A6XLdR4aF5FMZNJCMwXbzsPGBqrC8HzP3w6kfZiFBe # /WZuVmEnKYmEUeaC50ZQ/ZQqLKfkdT66mA+Ef58xFNat1fJky3seBdCEGXIX8RcG # 7z3N1k3vBkL9olMqT4UdxB08r8/arBD13ays6Vb/kwIDAQABo4IBzjCCAcowHQYD # VR0OBBYEFPS24SAd/imu0uRhpbKiJbLIFzVuMB8GA1UdIwQYMBaAFEXroq/0ksuC # MS1Ri6enIZ3zbcgPMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgGG # MBMGA1UdJQQMMAoGCCsGAQUFBwMIMHkGCCsGAQUFBwEBBG0wazAkBggrBgEFBQcw # AYYYaHR0cDovL29jc3AuZGlnaWNlcnQuY29tMEMGCCsGAQUFBzAChjdodHRwOi8v # Y2FjZXJ0cy5kaWdpY2VydC5jb20vRGlnaUNlcnRBc3N1cmVkSURSb290Q0EuY3J0 # MIGBBgNVHR8EejB4MDqgOKA2hjRodHRwOi8vY3JsNC5kaWdpY2VydC5jb20vRGln # aUNlcnRBc3N1cmVkSURSb290Q0EuY3JsMDqgOKA2hjRodHRwOi8vY3JsMy5kaWdp # Y2VydC5jb20vRGlnaUNlcnRBc3N1cmVkSURSb290Q0EuY3JsMFAGA1UdIARJMEcw # OAYKYIZIAYb9bAACBDAqMCgGCCsGAQUFBwIBFhxodHRwczovL3d3dy5kaWdpY2Vy # dC5jb20vQ1BTMAsGCWCGSAGG/WwHATANBgkqhkiG9w0BAQsFAAOCAQEAcZUS6VGH # VmnN793afKpjerN4zwY3QITvS4S/ys8DAv3Fp8MOIEIsr3fzKx8MIVoqtwU0HWqu # mfgnoma/Capg33akOpMP+LLR2HwZYuhegiUexLoceywh4tZbLBQ1QwRostt1AuBy # x5jWPGTlH0gQGF+JOGFNYkYkh2OMkVIsrymJ5Xgf1gsUpYDXEkdws3XVk4WTfraS # Z/tTYYmo9WuWwPRYaQ18yAGxuSh1t5ljhSKMYcp5lH5Z/IwP42+1ASa2bKXuh1Eh # 5Fhgm7oMLSttosR+u8QlK0cCCHxJrhO24XxCQijGGFbPQTS2Zl22dHv1VjMiLyI2 # skuiSpXY9aaOUjGCAk0wggJJAgEBMIGGMHIxCzAJBgNVBAYTAlVTMRUwEwYDVQQK # EwxEaWdpQ2VydCBJbmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5jb20xMTAvBgNV # BAMTKERpZ2lDZXJ0IFNIQTIgQXNzdXJlZCBJRCBUaW1lc3RhbXBpbmcgQ0ECEA1C # SuC+Ooj/YEAhzhQA8N0wDQYJYIZIAWUDBAIBBQCggZgwGgYJKoZIhvcNAQkDMQ0G # CyqGSIb3DQEJEAEEMBwGCSqGSIb3DQEJBTEPFw0yMTA0MDgwOTExMDlaMCsGCyqG # SIb3DQEJEAIMMRwwGjAYMBYEFOHXgqjhkb7va8oWkbWqtJSmJJvzMC8GCSqGSIb3 # DQEJBDEiBCCHEAmNNj2zWjWYRfEi4FgzZvrI16kv/U2b9b3oHw6UVDANBgkqhkiG # 9w0BAQEFAASCAQCdefEKh6Qmwx7xGCkrYi/A+/Cla6LdnYJp38eMs3fqTTvjhyDw # HffXrwdqWy5/fgW3o3qJXqa5o7hLxYIoWSULOCpJRGdt+w7XKPAbZqHrN9elAhWJ # 
vpBTCEaj7dVxr1Ka4NsoPSYe0eidDBmmvGvp02J4Z1j8+ImQPKN6Hv/L8Ixaxe7V # mH4VtXIiBK8xXdi4wzO+A+qLtHEJXz3Gw8Bp3BNtlDGIUkIhVTM3Q1xcSEqhOLqo # PGdwCw9acxdXNWWPjOJkNH656Bvmkml+0p6MTGIeG4JCeRh1Wpqm1ZGSoEcXNaof # wOgj48YzI+dNqBD9i7RSWCqJr2ygYKRTxnuU # SIG # End signature block cccl-2.5.0/cub/docs/tools/packman/bootstrap/generate_temp_folder.ps1000066400000000000000000000242641463375617100255360ustar00rootroot00000000000000<# Copyright 2019 NVIDIA CORPORATION Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. #> param( [Parameter(Mandatory=$true)][string]$parentPath=$null ) [string] $name = [System.Guid]::NewGuid() $out = Join-Path $parentPath $name New-Item -ItemType Directory -Path ($out) | Out-Null Write-Host $out # SIG # Begin signature block # MIIaVwYJKoZIhvcNAQcCoIIaSDCCGkQCAQExDzANBglghkgBZQMEAgEFADB5Bgor # BgEEAYI3AgEEoGswaTA0BgorBgEEAYI3AgEeMCYCAwEAAAQQH8w7YFlLCE63JNLG # KX7zUQIBAAIBAAIBAAIBAAIBADAxMA0GCWCGSAFlAwQCAQUABCB29nsqMEu+VmSF # 7ckeVTPrEZ6hsXjOgPFlJm9ilgHUB6CCCiIwggTTMIIDu6ADAgECAhBi50XpIWUh # PJcfXEkK6hKlMA0GCSqGSIb3DQEBCwUAMIGEMQswCQYDVQQGEwJVUzEdMBsGA1UE # ChMUU3ltYW50ZWMgQ29ycG9yYXRpb24xHzAdBgNVBAsTFlN5bWFudGVjIFRydXN0 # IE5ldHdvcmsxNTAzBgNVBAMTLFN5bWFudGVjIENsYXNzIDMgU0hBMjU2IENvZGUg # U2lnbmluZyBDQSAtIEcyMB4XDTE4MDcwOTAwMDAwMFoXDTIxMDcwOTIzNTk1OVow # gYMxCzAJBgNVBAYTAlVTMRMwEQYDVQQIDApDYWxpZm9ybmlhMRQwEgYDVQQHDAtT # YW50YSBDbGFyYTEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMQ8wDQYDVQQL # DAZJVC1NSVMxGzAZBgNVBAMMEk5WSURJQSBDb3Jwb3JhdGlvbjCCASIwDQYJKoZI # hvcNAQEBBQADggEPADCCAQoCggEBALEZN63dA47T4i90jZ84CJ/aWUwVtLff8AyP # YspFfIZGdZYiMgdb8A5tBh7653y0G/LZL6CVUkgejcpvBU/Dl/52a+gSWy2qJ2bH # jMFMKCyQDhdpCAKMOUKSC9rfzm4cFeA9ct91LQCAait4LhLlZt/HF7aG+r0FgCZa # HJjJvE7KNY9G4AZXxjSt8CXS8/8NQMANqjLX1r+F+Hl8PzQ1fVx0mMsbdtaIV4Pj # 5flAeTUnz6+dCTx3vTUo8MYtkS2UBaQv7t7H2B7iwJDakEQKk1XHswJdeqG0osDU # z6+NVks7uWE1N8UIhvzbw0FEX/U2kpfyWaB/J3gMl8rVR8idPj8CAwEAAaOCAT4w # ggE6MAkGA1UdEwQCMAAwDgYDVR0PAQH/BAQDAgeAMBMGA1UdJQQMMAoGCCsGAQUF # BwMDMGEGA1UdIARaMFgwVgYGZ4EMAQQBMEwwIwYIKwYBBQUHAgEWF2h0dHBzOi8v # ZC5zeW1jYi5jb20vY3BzMCUGCCsGAQUFBwICMBkMF2h0dHBzOi8vZC5zeW1jYi5j # b20vcnBhMB8GA1UdIwQYMBaAFNTABiJJ6zlL3ZPiXKG4R3YJcgNYMCsGA1UdHwQk # MCIwIKAeoByGGmh0dHA6Ly9yYi5zeW1jYi5jb20vcmIuY3JsMFcGCCsGAQUFBwEB # BEswSTAfBggrBgEFBQcwAYYTaHR0cDovL3JiLnN5bWNkLmNvbTAmBggrBgEFBQcw # AoYaaHR0cDovL3JiLnN5bWNiLmNvbS9yYi5jcnQwDQYJKoZIhvcNAQELBQADggEB # AIJKh5vKJdhHJtMzATmc1BmXIQ3RaJONOZ5jMHn7HOkYU1JP0OIzb4pXXkH8Xwfr # K6bnd72IhcteyksvKsGpSvK0PBBwzodERTAu1Os2N+EaakxQwV/xtqDm1E3IhjHk # fRshyKKzmFk2Ci323J4lHtpWUj5Hz61b8gd72jH7xnihGi+LORJ2uRNZ3YuqMNC3 # SBC8tAyoJqEoTJirULUCXW6wX4XUm5P2sx+htPw7szGblVKbQ+PFinNGnsSEZeKz # D8jUb++1cvgTKH59Y6lm43nsJjkZU77tNqyq4ABwgQRk6lt8cS2PPwjZvTmvdnla # ZhR0K4of+pQaUQHXVIBdji8wggVHMIIEL6ADAgECAhB8GzU1SufbdOdBXxFpymuo # MA0GCSqGSIb3DQEBCwUAMIG9MQswCQYDVQQGEwJVUzEXMBUGA1UEChMOVmVyaVNp # Z24sIEluYy4xHzAdBgNVBAsTFlZlcmlTaWduIFRydXN0IE5ldHdvcmsxOjA4BgNV # BAsTMShjKSAyMDA4IFZlcmlTaWduLCBJbmMuIC0gRm9yIGF1dGhvcml6ZWQgdXNl # IG9ubHkxODA2BgNVBAMTL1ZlcmlTaWduIFVuaXZlcnNhbCBSb290IENlcnRpZmlj # 
YXRpb24gQXV0aG9yaXR5MB4XDTE0MDcyMjAwMDAwMFoXDTI0MDcyMTIzNTk1OVow # gYQxCzAJBgNVBAYTAlVTMR0wGwYDVQQKExRTeW1hbnRlYyBDb3Jwb3JhdGlvbjEf # MB0GA1UECxMWU3ltYW50ZWMgVHJ1c3QgTmV0d29yazE1MDMGA1UEAxMsU3ltYW50 # ZWMgQ2xhc3MgMyBTSEEyNTYgQ29kZSBTaWduaW5nIENBIC0gRzIwggEiMA0GCSqG # SIb3DQEBAQUAA4IBDwAwggEKAoIBAQDXlUPU3N9nrjn7UqS2JjEEcOm3jlsqujdp # NZWPu8Aw54bYc7vf69F2P4pWjustS/BXGE6xjaUz0wt1I9VqeSfdo9P3Dodltd6t # HPH1NbQiUa8iocFdS5B/wFlOq515qQLXHkmxO02H/sJ4q7/vUq6crwjZOeWaUT5p # XzAQTnFjbFjh8CAzGw90vlvLEuHbjMSAlHK79kWansElC/ujHJ7YpglwcezAR0yP # fcPeGc4+7gRyjhfT//CyBTIZTNOwHJ/+pXggQnBBsCaMbwDIOgARQXpBsKeKkQSg # mXj0d7TzYCrmbFAEtxRg/w1R9KiLhP4h2lxeffUpeU+wRHRvbXL/AgMBAAGjggF4 # MIIBdDAuBggrBgEFBQcBAQQiMCAwHgYIKwYBBQUHMAGGEmh0dHA6Ly9zLnN5bWNk # LmNvbTASBgNVHRMBAf8ECDAGAQH/AgEAMGYGA1UdIARfMF0wWwYLYIZIAYb4RQEH # FwMwTDAjBggrBgEFBQcCARYXaHR0cHM6Ly9kLnN5bWNiLmNvbS9jcHMwJQYIKwYB # BQUHAgIwGRoXaHR0cHM6Ly9kLnN5bWNiLmNvbS9ycGEwNgYDVR0fBC8wLTAroCmg # J4YlaHR0cDovL3Muc3ltY2IuY29tL3VuaXZlcnNhbC1yb290LmNybDATBgNVHSUE # DDAKBggrBgEFBQcDAzAOBgNVHQ8BAf8EBAMCAQYwKQYDVR0RBCIwIKQeMBwxGjAY # BgNVBAMTEVN5bWFudGVjUEtJLTEtNzI0MB0GA1UdDgQWBBTUwAYiSes5S92T4lyh # uEd2CXIDWDAfBgNVHSMEGDAWgBS2d/ppSEefUxLVwuoHMnYH0ZcHGTANBgkqhkiG # 9w0BAQsFAAOCAQEAf+vKp+qLdkLrPo4gVDDjt7nc+kg+FscPRZUQzSeGo2bzAu1x # +KrCVZeRcIP5Un5SaTzJ8eCURoAYu6HUpFam8x0AkdWG80iH4MvENGggXrTL+QXt # nK9wUye56D5+UaBpcYvcUe2AOiUyn0SvbkMo0yF1u5fYi4uM/qkERgSF9xWcSxGN # xCwX/tVuf5riVpLxlrOtLfn039qJmc6yOETA90d7yiW5+ipoM5tQct6on9TNLAs0 # vYsweEDgjY4nG5BvGr4IFYFd6y/iUedRHsl4KeceZb847wFKAQkkDhbEFHnBQTc0 # 0D2RUpSd4WjvCPDiaZxnbpALGpNx1CYCw8BaIzGCD4swgg+HAgEBMIGZMIGEMQsw # CQYDVQQGEwJVUzEdMBsGA1UEChMUU3ltYW50ZWMgQ29ycG9yYXRpb24xHzAdBgNV # BAsTFlN5bWFudGVjIFRydXN0IE5ldHdvcmsxNTAzBgNVBAMTLFN5bWFudGVjIENs # YXNzIDMgU0hBMjU2IENvZGUgU2lnbmluZyBDQSAtIEcyAhBi50XpIWUhPJcfXEkK # 6hKlMA0GCWCGSAFlAwQCAQUAoHwwEAYKKwYBBAGCNwIBDDECMAAwGQYJKoZIhvcN # AQkDMQwGCisGAQQBgjcCAQQwHAYKKwYBBAGCNwIBCzEOMAwGCisGAQQBgjcCARUw # LwYJKoZIhvcNAQkEMSIEIG5YDmcpqLxn4SB0H6OnuVkZRPh6OJ77eGW/6Su/uuJg # MA0GCSqGSIb3DQEBAQUABIIBAA3N2vqfA6WDgqz/7EoAKVIE5Hn7xpYDGhPvFAMV # BslVpeqE3apTcYFCEcwLtzIEc/zmpULxsX8B0SUT2VXbJN3zzQ80b+gbgpq62Zk+ # dQLOtLSiPhGW7MXLahgES6Oc2dUFaQ+wDfcelkrQaOVZkM4wwAzSapxuf/13oSIk # ZX2ewQEwTZrVYXELO02KQIKUR30s/oslGVg77ALnfK9qSS96Iwjd4MyT7PzCkHUi # ilwyGJi5a4ofiULiPSwUQNynSBqxa+JQALkHP682b5xhjoDfyG8laR234FTPtYgs # P/FaeviwENU5Pl+812NbbtRD+gKlWBZz+7FKykOT/CG8sZahgg1EMIINQAYKKwYB # BAGCNwMDATGCDTAwgg0sBgkqhkiG9w0BBwKggg0dMIINGQIBAzEPMA0GCWCGSAFl # AwQCAQUAMHcGCyqGSIb3DQEJEAEEoGgEZjBkAgEBBglghkgBhv1sBwEwMTANBglg # hkgBZQMEAgEFAAQgJhABfkDIPbI+nWYnA30FLTyaPK+W3QieT21B/vK+CMICEDF0 # worcGsdd7OxpXLP60xgYDzIwMjEwNDA4MDkxMTA5WqCCCjcwggT+MIID5qADAgEC # AhANQkrgvjqI/2BAIc4UAPDdMA0GCSqGSIb3DQEBCwUAMHIxCzAJBgNVBAYTAlVT # MRUwEwYDVQQKEwxEaWdpQ2VydCBJbmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5j # b20xMTAvBgNVBAMTKERpZ2lDZXJ0IFNIQTIgQXNzdXJlZCBJRCBUaW1lc3RhbXBp # bmcgQ0EwHhcNMjEwMTAxMDAwMDAwWhcNMzEwMTA2MDAwMDAwWjBIMQswCQYDVQQG # EwJVUzEXMBUGA1UEChMORGlnaUNlcnQsIEluYy4xIDAeBgNVBAMTF0RpZ2lDZXJ0 # IFRpbWVzdGFtcCAyMDIxMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA # wuZhhGfFivUNCKRFymNrUdc6EUK9CnV1TZS0DFC1JhD+HchvkWsMlucaXEjvROW/ # m2HNFZFiWrj/ZwucY/02aoH6KfjdK3CF3gIY83htvH35x20JPb5qdofpir34hF0e # dsnkxnZ2OlPR0dNaNo/Go+EvGzq3YdZz7E5tM4p8XUUtS7FQ5kE6N1aG3JMjjfdQ # Jehk5t3Tjy9XtYcg6w6OLNUj2vRNeEbjA4MxKUpcDDGKSoyIxfcwWvkUrxVfbENJ # Cf0mI1P2jWPoGqtbsR0wwptpgrTb/FZUvB+hh6u+elsKIC9LCcmVp42y+tZji06l # chzun3oBc/gZ1v4NSYS9AQIDAQABo4IBuDCCAbQwDgYDVR0PAQH/BAQDAgeAMAwG # A1UdEwEB/wQCMAAwFgYDVR0lAQH/BAwwCgYIKwYBBQUHAwgwQQYDVR0gBDowODA2 # 
BglghkgBhv1sBwEwKTAnBggrBgEFBQcCARYbaHR0cDovL3d3dy5kaWdpY2VydC5j # b20vQ1BTMB8GA1UdIwQYMBaAFPS24SAd/imu0uRhpbKiJbLIFzVuMB0GA1UdDgQW # BBQ2RIaOpLqwZr68KC0dRDbd42p6vDBxBgNVHR8EajBoMDKgMKAuhixodHRwOi8v # Y3JsMy5kaWdpY2VydC5jb20vc2hhMi1hc3N1cmVkLXRzLmNybDAyoDCgLoYsaHR0 # cDovL2NybDQuZGlnaWNlcnQuY29tL3NoYTItYXNzdXJlZC10cy5jcmwwgYUGCCsG # AQUFBwEBBHkwdzAkBggrBgEFBQcwAYYYaHR0cDovL29jc3AuZGlnaWNlcnQuY29t # ME8GCCsGAQUFBzAChkNodHRwOi8vY2FjZXJ0cy5kaWdpY2VydC5jb20vRGlnaUNl # cnRTSEEyQXNzdXJlZElEVGltZXN0YW1waW5nQ0EuY3J0MA0GCSqGSIb3DQEBCwUA # A4IBAQBIHNy16ZojvOca5yAOjmdG/UJyUXQKI0ejq5LSJcRwWb4UoOUngaVNFBUZ # B3nw0QTDhtk7vf5EAmZN7WmkD/a4cM9i6PVRSnh5Nnont/PnUp+Tp+1DnnvntN1B # Ion7h6JGA0789P63ZHdjXyNSaYOC+hpT7ZDMjaEXcw3082U5cEvznNZ6e9oMvD0y # 0BvL9WH8dQgAdryBDvjA4VzPxBFy5xtkSdgimnUVQvUtMjiB2vRgorq0Uvtc4GEk # JU+y38kpqHNDUdq9Y9YfW5v3LhtPEx33Sg1xfpe39D+E68Hjo0mh+s6nv1bPull2 # YYlffqe0jmd4+TaY4cso2luHpoovMIIFMTCCBBmgAwIBAgIQCqEl1tYyG35B5AXa # NpfCFTANBgkqhkiG9w0BAQsFADBlMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGln # aUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMSQwIgYDVQQDExtE # aWdpQ2VydCBBc3N1cmVkIElEIFJvb3QgQ0EwHhcNMTYwMTA3MTIwMDAwWhcNMzEw # MTA3MTIwMDAwWjByMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5j # MRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMTEwLwYDVQQDEyhEaWdpQ2VydCBT # SEEyIEFzc3VyZWQgSUQgVGltZXN0YW1waW5nIENBMIIBIjANBgkqhkiG9w0BAQEF # AAOCAQ8AMIIBCgKCAQEAvdAy7kvNj3/dqbqCmcU5VChXtiNKxA4HRTNREH3Q+X1N # aH7ntqD0jbOI5Je/YyGQmL8TvFfTw+F+CNZqFAA49y4eO+7MpvYyWf5fZT/gm+vj # RkcGGlV+Cyd+wKL1oODeIj8O/36V+/OjuiI+GKwR5PCZA207hXwJ0+5dyJoLVOOo # CXFr4M8iEA91z3FyTgqt30A6XLdR4aF5FMZNJCMwXbzsPGBqrC8HzP3w6kfZiFBe # /WZuVmEnKYmEUeaC50ZQ/ZQqLKfkdT66mA+Ef58xFNat1fJky3seBdCEGXIX8RcG # 7z3N1k3vBkL9olMqT4UdxB08r8/arBD13ays6Vb/kwIDAQABo4IBzjCCAcowHQYD # VR0OBBYEFPS24SAd/imu0uRhpbKiJbLIFzVuMB8GA1UdIwQYMBaAFEXroq/0ksuC # MS1Ri6enIZ3zbcgPMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgGG # MBMGA1UdJQQMMAoGCCsGAQUFBwMIMHkGCCsGAQUFBwEBBG0wazAkBggrBgEFBQcw # AYYYaHR0cDovL29jc3AuZGlnaWNlcnQuY29tMEMGCCsGAQUFBzAChjdodHRwOi8v # Y2FjZXJ0cy5kaWdpY2VydC5jb20vRGlnaUNlcnRBc3N1cmVkSURSb290Q0EuY3J0 # MIGBBgNVHR8EejB4MDqgOKA2hjRodHRwOi8vY3JsNC5kaWdpY2VydC5jb20vRGln # aUNlcnRBc3N1cmVkSURSb290Q0EuY3JsMDqgOKA2hjRodHRwOi8vY3JsMy5kaWdp # Y2VydC5jb20vRGlnaUNlcnRBc3N1cmVkSURSb290Q0EuY3JsMFAGA1UdIARJMEcw # OAYKYIZIAYb9bAACBDAqMCgGCCsGAQUFBwIBFhxodHRwczovL3d3dy5kaWdpY2Vy # dC5jb20vQ1BTMAsGCWCGSAGG/WwHATANBgkqhkiG9w0BAQsFAAOCAQEAcZUS6VGH # VmnN793afKpjerN4zwY3QITvS4S/ys8DAv3Fp8MOIEIsr3fzKx8MIVoqtwU0HWqu # mfgnoma/Capg33akOpMP+LLR2HwZYuhegiUexLoceywh4tZbLBQ1QwRostt1AuBy # x5jWPGTlH0gQGF+JOGFNYkYkh2OMkVIsrymJ5Xgf1gsUpYDXEkdws3XVk4WTfraS # Z/tTYYmo9WuWwPRYaQ18yAGxuSh1t5ljhSKMYcp5lH5Z/IwP42+1ASa2bKXuh1Eh # 5Fhgm7oMLSttosR+u8QlK0cCCHxJrhO24XxCQijGGFbPQTS2Zl22dHv1VjMiLyI2 # skuiSpXY9aaOUjGCAk0wggJJAgEBMIGGMHIxCzAJBgNVBAYTAlVTMRUwEwYDVQQK # EwxEaWdpQ2VydCBJbmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5jb20xMTAvBgNV # BAMTKERpZ2lDZXJ0IFNIQTIgQXNzdXJlZCBJRCBUaW1lc3RhbXBpbmcgQ0ECEA1C # SuC+Ooj/YEAhzhQA8N0wDQYJYIZIAWUDBAIBBQCggZgwGgYJKoZIhvcNAQkDMQ0G # CyqGSIb3DQEJEAEEMBwGCSqGSIb3DQEJBTEPFw0yMTA0MDgwOTExMDlaMCsGCyqG # SIb3DQEJEAIMMRwwGjAYMBYEFOHXgqjhkb7va8oWkbWqtJSmJJvzMC8GCSqGSIb3 # DQEJBDEiBCDvFxQ6lYLr8vB+9czUl19rjCw1pWhhUXw/SqOmvIa/VDANBgkqhkiG # 9w0BAQEFAASCAQB9ox2UrcUXQsBI4Uycnhl4AMpvhVXJME62tygFMppW1l7QftDy # LvfPKRYm2YUioak/APxAS6geRKpeMkLvXuQS/Jlv0kY3BjxkeG0eVjvyjF4SvXbZ # 3JCk9m7wLNE+xqOo0ICjYlIJJgRLudjWkC5Skpb1NpPS8DOaIYwRV+AWaSOUPd9P # O5yVcnbl7OpK3EAEtwDrybCVBMPn2MGhAXybIHnth3+MFp1b6Blhz3WlReQyarjq # 1f+zaFB79rg6JswXoOTJhwICBP3hO2Ua3dMAswbfl+QNXF+igKLJPYnaeSVhBbm6 # 
VCu2io27t4ixqvoD0RuPObNX/P3oVA38afiM # SIG # End signature block cccl-2.5.0/cub/docs/tools/packman/bootstrap/install_package.py000066400000000000000000000134521463375617100244270ustar00rootroot00000000000000# Copyright 2019 NVIDIA CORPORATION # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import zipfile import tempfile import sys import os import stat import time from typing import Any, Callable RENAME_RETRY_COUNT = 100 RENAME_RETRY_DELAY = 0.1 logging.basicConfig(level=logging.WARNING, format="%(message)s") logger = logging.getLogger("install_package") def remove_directory_item(path): if os.path.islink(path) or os.path.isfile(path): try: os.remove(path) except PermissionError: # make sure we have access and try again: os.chmod(path, stat.S_IRWXU) os.remove(path) else: # try first to delete the dir because this will work for folder junctions, otherwise we would follow the junctions and cause destruction! clean_out_folder = False try: # make sure we have access preemptively - this is necessary because recursing into a directory without permissions # will only lead to heart ache os.chmod(path, stat.S_IRWXU) os.rmdir(path) except OSError: clean_out_folder = True if clean_out_folder: # we should make sure the directory is empty names = os.listdir(path) for name in names: fullname = os.path.join(path, name) remove_directory_item(fullname) # now try to again get rid of the folder - and not catch if it raises: os.rmdir(path) class StagingDirectory: def __init__(self, staging_path): self.staging_path = staging_path self.temp_folder_path = None os.makedirs(staging_path, exist_ok=True) def __enter__(self): self.temp_folder_path = tempfile.mkdtemp(prefix="ver-", dir=self.staging_path) return self def get_temp_folder_path(self): return self.temp_folder_path # this function renames the temp staging folder to folder_name, it is required that the parent path exists! 
def promote_and_rename(self, folder_name): abs_dst_folder_name = os.path.join(self.staging_path, folder_name) os.rename(self.temp_folder_path, abs_dst_folder_name) def __exit__(self, type, value, traceback): # Remove temp staging folder if it's still there (something went wrong): path = self.temp_folder_path if os.path.isdir(path): remove_directory_item(path) def rename_folder(staging_dir: StagingDirectory, folder_name: str): try: staging_dir.promote_and_rename(folder_name) except OSError as exc: # if we failed to rename because the folder now exists we can assume that another packman process # has managed to update the package before us - in all other cases we re-raise the exception abs_dst_folder_name = os.path.join(staging_dir.staging_path, folder_name) if os.path.exists(abs_dst_folder_name): logger.warning( f"Directory {abs_dst_folder_name} already present, package installation already completed" ) else: raise def call_with_retry( op_name: str, func: Callable, retry_count: int = 3, retry_delay: float = 20 ) -> Any: retries_left = retry_count while True: try: return func() except (OSError, IOError) as exc: logger.warning(f"Failure while executing {op_name} [{str(exc)}]") if retries_left: retry_str = "retry" if retries_left == 1 else "retries" logger.warning( f"Retrying after {retry_delay} seconds" f" ({retries_left} {retry_str} left) ..." ) time.sleep(retry_delay) else: logger.error("Maximum retries exceeded, giving up") raise retries_left -= 1 def rename_folder_with_retry(staging_dir: StagingDirectory, folder_name): dst_path = os.path.join(staging_dir.staging_path, folder_name) call_with_retry( f"rename {staging_dir.get_temp_folder_path()} -> {dst_path}", lambda: rename_folder(staging_dir, folder_name), RENAME_RETRY_COUNT, RENAME_RETRY_DELAY, ) def install_package(package_path, install_path): staging_path, version = os.path.split(install_path) with StagingDirectory(staging_path) as staging_dir: output_folder = staging_dir.get_temp_folder_path() with zipfile.ZipFile(package_path, allowZip64=True) as zip_file: zip_file.extractall(output_folder) # attempt the rename operation rename_folder_with_retry(staging_dir, version) print(f"Package successfully installed to {install_path}") if __name__ == "__main__": executable_paths = os.getenv("PATH") paths_list = executable_paths.split(os.path.pathsep) if executable_paths else [] target_path_np = os.path.normpath(sys.argv[2]) target_path_np_nc = os.path.normcase(target_path_np) for exec_path in paths_list: if os.path.normcase(os.path.normpath(exec_path)) == target_path_np_nc: raise RuntimeError(f"packman will not install to executable path '{exec_path}'") install_package(sys.argv[1], target_path_np) cccl-2.5.0/cub/docs/tools/packman/config.packman.xml000066400000000000000000000003231463375617100223100ustar00rootroot00000000000000 cccl-2.5.0/cub/docs/tools/packman/packman000077500000000000000000000127341463375617100202610ustar00rootroot00000000000000#!/bin/bash # Copyright 2019-2020 NVIDIA CORPORATION # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
SAVED_SETTINGS="$-" set -eu if echo ${PM_VERBOSITY-} | grep -i "debug" > /dev/null ; then set -x else PM_CURL_SILENT="-s -S" PM_WGET_QUIET="--quiet" fi PM_PACKMAN_VERSION=6.57 # This is necessary for newer macOS if [ `uname` == 'Darwin' ]; then export LC_ALL=en_US.UTF-8 export LANG=en_US.UTF-8 fi # We cannot rely on realpath, it isn't installed on macOS and some Linux distros get_abs_filename() { echo "$(cd "$(dirname "$1")" && pwd)/$(basename "$1")" } # Specify where packman command exists export PM_INSTALL_PATH="$(get_abs_filename "$(dirname "${BASH_SOURCE}")")" # The packages root may already be configured by the user if [ -z "${PM_PACKAGES_ROOT:-}" ]; then # Set variable temporarily in this process so that the following execution will work if [ `uname` == 'Darwin' ]; then export PM_PACKAGES_ROOT="${HOME}/Library/Application Support/packman-cache" else if [ -z "${XDG_CACHE_HOME:-}" ]; then export PM_PACKAGES_ROOT="${HOME}/.cache/packman" else export PM_PACKAGES_ROOT="${XDG_CACHE_HOME}/packman" fi fi fi # Ensure the packages root path exists: if [ ! -d "$PM_PACKAGES_ROOT" ]; then echo "Creating packman packages cache at $PM_PACKAGES_ROOT" mkdir -p -m a+rwx "$PM_PACKAGES_ROOT" fi fetch_file_from_s3() { SOURCE=$1 SOURCE_URL=http://bootstrap.packman.nvidia.com/$SOURCE TARGET=$2 echo "Fetching $SOURCE from bootstrap.packman.nvidia.com ..." if command -v wget >/dev/null 2>&1; then wget $PM_WGET_QUIET -O$TARGET $SOURCE_URL else curl -o $TARGET $SOURCE_URL $PM_CURL_SILENT fi } generate_temp_file_name() { if [ `uname` == "Darwin" ]; then local tmpfile=`mktemp -t packman` else local tmpfile=`mktemp -t packman.XXXXXXXX` fi echo "$tmpfile" } install_python() { PLATFORM=`uname` PROCESSOR=`uname -m` PYTHON_VERSION=3.7.13-nv1 if [ $PLATFORM == 'Darwin' ]; then PYTHON_PACKAGE=$PYTHON_VERSION-macos-x86_64 elif [ $PLATFORM == 'Linux' ] && [ $PROCESSOR == 'x86_64' ]; then PYTHON_PACKAGE=$PYTHON_VERSION-linux-x86_64 elif [ $PLATFORM == 'Linux' ] && [ $PROCESSOR == 'aarch64' ]; then PYTHON_PACKAGE=$PYTHON_VERSION-linux-aarch64 else echo "Operating system not supported" exit 1 fi PYTHON_INSTALL_FOLDER="$PM_PACKAGES_ROOT/python/$PYTHON_PACKAGE" if [ ! -d "$PYTHON_INSTALL_FOLDER" ]; then mkdir -p "$PYTHON_INSTALL_FOLDER" fi export PM_PYTHON="$PYTHON_INSTALL_FOLDER/python" if [ ! -f "$PM_PYTHON" ]; then PYTHON_PACKAGE_TMP=$(generate_temp_file_name) fetch_file_from_s3 "python@$PYTHON_PACKAGE.tar.gz" "$PYTHON_PACKAGE_TMP" if [ "$?" -eq "0" ]; then echo "Unpacking python" tar -xf "$PYTHON_PACKAGE_TMP" -C "$PYTHON_INSTALL_FOLDER" rm "$PYTHON_PACKAGE_TMP" else echo "Failed downloading the Python interpreter" exit $? fi fi } # Ensure python is available: if [ -z "${PM_PYTHON_EXT:-}" ]; then install_python else PM_PYTHON="$PM_PYTHON_EXT" fi # The packman module may be externally configured if [ -z "${PM_MODULE_DIR_EXT:-}" ]; then PM_MODULE_DIR="$PM_PACKAGES_ROOT/packman-common/$PM_PACKMAN_VERSION" else PM_MODULE_DIR="$PM_MODULE_DIR_EXT" fi export PM_MODULE="$PM_MODULE_DIR/run.py" # Ensure the packman package exists: if [ ! -f "$PM_MODULE" ]; then # Remove a previously corrupt packman-common if it's there if [ -d "$PM_MODULE_DIR" ]; then rm -rf "$PM_MODULE_DIR" fi PM_MODULE_PACKAGE="packman-common@$PM_PACKMAN_VERSION.zip" TARGET=$(generate_temp_file_name) # We always fetch packman from S3: fetch_file_from_s3 "$PM_MODULE_PACKAGE" "$TARGET" if [ "$?" -eq "0" ]; then echo "Unpacking ..." 
"$PM_PYTHON" -S -s -u -E "$PM_INSTALL_PATH/bootstrap/install_package.py" "$TARGET" "$PM_MODULE_DIR" rm "$TARGET" else echo "Failure while fetching packman module from S3!" exit 1 fi fi # Ensure 7za package exists: PM_7za_VERSION=22.01-1 export PM_7za_PATH="$PM_PACKAGES_ROOT/7za/$PM_7za_VERSION" if [ ! -d "$PM_7za_PATH" ]; then export PM_7za_PATH="$PM_PACKAGES_ROOT/chk/7za/$PM_7za_VERSION" if [ ! -d "$PM_7za_PATH" ]; then "$PM_PYTHON" -S -s -u -E "$PM_MODULE" pull "$PM_MODULE_DIR/deps.packman.xml" if [ "$?" -ne 0 ]; then echo "Failure while installing required 7za package" exit 1 fi fi fi # Generate temporary file name for environment variables: PM_VAR_PATH=`mktemp -u -t tmp.$$.pmvars.XXXXXX` if [ $# -ne 0 ] then PM_VAR_PATH_ARG=--var-path="$PM_VAR_PATH" fi "$PM_PYTHON" -S -s -u -E "$PM_MODULE" "$@" ${PM_VAR_PATH_ARG:-} exit_code=$? # Export the variables if the file was used and remove the file: if [ -f "$PM_VAR_PATH" ]; then while read -r line do if [ ${#line} -gt 0 ]; then export "$line" fi done < "$PM_VAR_PATH" rm -f "$PM_VAR_PATH" fi # avoid leaking -e and -u into the host script if they weren't originally set if [[ ! ( "$SAVED_SETTINGS" =~ e ) ]]; then set +e fi if [[ ! ( "$SAVED_SETTINGS" =~ u ) ]]; then set +u fi # Return the exit code from python if [ "$exit_code" != 0 ]; then exit "$exit_code" fi cccl-2.5.0/cub/docs/tools/packman/packman.cmd000077500000000000000000000037631463375617100210250ustar00rootroot00000000000000:: RUN_PM_MODULE must always be at the same spot for packman update to work (batch reloads file during update!) :: [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx] :: Reset errorlevel status (don't inherit from caller) @call :ECHO_AND_RESET_ERROR :: You can remove the call below if you do your own manual configuration of the dev machines call "%~dp0\bootstrap\configure.bat" if %errorlevel% neq 0 ( exit /b %errorlevel% ) :: Everything below is mandatory if not defined PM_PYTHON goto :PYTHON_ENV_ERROR if not defined PM_MODULE goto :MODULE_ENV_ERROR set PM_VAR_PATH_ARG= if "%1"=="pull" goto :SET_VAR_PATH if "%1"=="install" goto :SET_VAR_PATH :RUN_PM_MODULE "%PM_PYTHON%" -S -s -u -E "%PM_MODULE%" %* %PM_VAR_PATH_ARG% if %errorlevel% neq 0 ( exit /b %errorlevel% ) :: Marshall environment variables into the current environment if they have been generated and remove temporary file if exist "%PM_VAR_PATH%" ( for /F "usebackq tokens=*" %%A in ("%PM_VAR_PATH%") do set "%%A" ) if %errorlevel% neq 0 ( goto :VAR_ERROR ) if exist "%PM_VAR_PATH%" ( del /F "%PM_VAR_PATH%" ) if %errorlevel% neq 0 ( goto :VAR_ERROR ) set PM_VAR_PATH= goto :eof :: Subroutines below :PYTHON_ENV_ERROR @echo User environment variable PM_PYTHON is not set! Please configure machine for packman or call configure.bat. exit /b 1 :MODULE_ENV_ERROR @echo User environment variable PM_MODULE is not set! Please configure machine for packman or call configure.bat. exit /b 1 :VAR_ERROR @echo Error while processing and setting environment variables! 
exit /b 1 :ECHO_AND_RESET_ERROR @echo off if /I "%PM_VERBOSITY%"=="debug" ( @echo on ) exit /b 0 :SET_VAR_PATH :: Generate temporary path for variable file for /f "delims=" %%a in ('%PM_PYTHON% -S -s -u -E -c "import tempfile;file = tempfile.NamedTemporaryFile(mode='w+t', delete=False);print(file.name)"') do (set PM_VAR_PATH=%%a) set PM_VAR_PATH_ARG=--var-path="%PM_VAR_PATH%" goto :RUN_PM_MODULE cccl-2.5.0/cub/docs/tools/packman/packmanconf.py000066400000000000000000000077051463375617100215550ustar00rootroot00000000000000# Use this file to bootstrap packman into your Python environment (3.7.x). Simply # add the path by doing sys.insert to where packmanconf.py is located and then execute: # # >>> import packmanconf # >>> packmanconf.init() # # It will use the configured remote(s) and the version of packman in the same folder, # giving you full access to the packman API via the following module # # >> import packmanapi # >> dir(packmanapi) import os import platform import sys def init(): """Call this function to initialize the packman configuration. Calls to the packman API will work after successfully calling this function. Note: This function only needs to be called once during the execution of your program. Calling it repeatedly is harmless but wasteful. Compatibility with your Python interpreter is checked and upon failure the function will report what is required. Example: >>> import packmanconf >>> packmanconf.init() >>> import packmanapi >>> packmanapi.set_verbosity_level(packmanapi.VERBOSITY_HIGH) """ major = sys.version_info[0] minor = sys.version_info[1] if major != 3 or minor != 7: raise RuntimeError( f"This version of packman requires Python 3.7.x, but {major}.{minor} was provided" ) conf_dir = os.path.dirname(os.path.abspath(__file__)) os.environ["PM_INSTALL_PATH"] = conf_dir packages_root = get_packages_root(conf_dir) version = get_version(conf_dir) module_dir = get_module_dir(conf_dir, packages_root, version) sys.path.insert(1, module_dir) def get_packages_root(conf_dir: str) -> str: root = os.getenv("PM_PACKAGES_ROOT") if not root: platform_name = platform.system() if platform_name == "Windows": drive, _ = os.path.splitdrive(conf_dir) root = os.path.join(drive, "packman-repo") elif platform_name == "Darwin": # macOS root = os.path.join( os.path.expanduser("~"), "/Library/Application Support/packman-cache" ) elif platform_name == "Linux": try: cache_root = os.environ["XDG_HOME_CACHE"] except KeyError: cache_root = os.path.join(os.path.expanduser("~"), ".cache") return os.path.join(cache_root, "packman") else: raise RuntimeError(f"Unsupported platform '{platform_name}'") # make sure the path exists: os.makedirs(root, exist_ok=True) return root def get_module_dir(conf_dir, packages_root: str, version: str) -> str: module_dir = os.path.join(packages_root, "packman-common", version) if not os.path.exists(module_dir): import tempfile tf = tempfile.NamedTemporaryFile(delete=False) target_name = tf.name tf.close() url = f"http://bootstrap.packman.nvidia.com/packman-common@{version}.zip" print(f"Downloading '{url}' ...") import urllib.request urllib.request.urlretrieve(url, target_name) from importlib.machinery import SourceFileLoader # import module from path provided script_path = os.path.join(conf_dir, "bootstrap", "install_package.py") ip = SourceFileLoader("install_package", script_path).load_module() print("Unpacking ...") ip.install_package(target_name, module_dir) os.unlink(tf.name) return module_dir def get_version(conf_dir: str): path = os.path.join(conf_dir, "packman") if 
not os.path.exists(path): # in dev repo fallback path += ".sh" with open(path, "rt", encoding="utf8") as launch_file: for line in launch_file.readlines(): if line.startswith("PM_PACKMAN_VERSION"): _, value = line.split("=") return value.strip() raise RuntimeError(f"Unable to find 'PM_PACKMAN_VERSION' in '{path}'") cccl-2.5.0/cub/docs/tools/packman/python.bat000077500000000000000000000013641463375617100207320ustar00rootroot00000000000000:: Copyright 2019-2020 NVIDIA CORPORATION :: :: Licensed under the Apache License, Version 2.0 (the "License"); :: you may not use this file except in compliance with the License. :: You may obtain a copy of the License at :: :: http://www.apache.org/licenses/LICENSE-2.0 :: :: Unless required by applicable law or agreed to in writing, software :: distributed under the License is distributed on an "AS IS" BASIS, :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. :: See the License for the specific language governing permissions and :: limitations under the License. @echo off setlocal call "%~dp0\packman" init set "PYTHONPATH=%PM_MODULE_DIR%;%PYTHONPATH%" set PYTHONNOUSERSITE=1 "%PM_PYTHON%" -u %* cccl-2.5.0/cub/docs/tools/packman/python.sh000077500000000000000000000017341463375617100205770ustar00rootroot00000000000000#!/bin/bash # Copyright 2019-2020 NVIDIA CORPORATION # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. set -e PACKMAN_CMD="$(dirname "${BASH_SOURCE}")/packman" if [ ! -f "$PACKMAN_CMD" ]; then PACKMAN_CMD="${PACKMAN_CMD}.sh" fi source "$PACKMAN_CMD" init export PYTHONPATH="${PM_MODULE_DIR}:${PYTHONPATH}" export PYTHONNOUSERSITE=1 # workaround for our python not shipping with certs if [[ -z ${SSL_CERT_DIR:-} ]]; then export SSL_CERT_DIR=/etc/ssl/certs/ fi "${PM_PYTHON}" -u "$@" cccl-2.5.0/cub/docs/tools/repoman/000077500000000000000000000000001463375617100167415ustar00rootroot00000000000000cccl-2.5.0/cub/docs/tools/repoman/omni/000077500000000000000000000000001463375617100177035ustar00rootroot00000000000000cccl-2.5.0/cub/docs/tools/repoman/omni/repo/000077500000000000000000000000001463375617100206505ustar00rootroot00000000000000cccl-2.5.0/cub/docs/tools/repoman/omni/repo/format/000077500000000000000000000000001463375617100221405ustar00rootroot00000000000000cccl-2.5.0/cub/docs/tools/repoman/omni/repo/format/.gitignore000066400000000000000000000003011463375617100241220ustar00rootroot00000000000000# Dummy omni.repo.format Python module so we don't have to pull down the format package. # Ignore everything in this directory, except this file to ensure the folder is created. * !.gitignore cccl-2.5.0/cub/docs/tools/repoman/repoman.py000066400000000000000000000012771463375617100207630ustar00rootroot00000000000000import os import sys import io import contextlib import packmanapi REPO_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..") REPO_DEPS_FILE = os.path.join(REPO_ROOT, "deps/repo-deps.packman.xml") def bootstrap(): """ Bootstrap all omni.repo modules. 
Pull with packman from repo.packman.xml and add them all to python sys.path to enable importing. """ #with contextlib.redirect_stdout(io.StringIO()): deps = packmanapi.pull(REPO_DEPS_FILE) for dep_path in deps.values(): if dep_path not in sys.path: sys.path.append(dep_path) if __name__ == "__main__": bootstrap() import omni.repo.man omni.repo.man.main(REPO_ROOT) cccl-2.5.0/cub/docs/tuning.rst000066400000000000000000000245121463375617100162020ustar00rootroot00000000000000CUB Tuning Infrastructure ================================================================================ Device-scope algorithms in CUB have many knobs that do not affect the algorithms' correctness but can significantly impact performance. For instance, the number of threads per block and items per thread can be tuned to maximize performance for a given device and data type. This document describes CUB Tuning Infrastructure, a set of tools facilitating the process of selecting optimal tuning parameters for a given device and data type. Definitions -------------------------------------------------------------------------------- Terms might be ambiguous in a generic context. Below, we omit the word "tuning" but assume it in all definitions. Algorithms are tuned for different workloads. For instance, radix sort can be tuned for different key types, different number of keys, and different distribution of keys. We separate tuning parameters into two categories: * **Compile-time (ct) Workload** - a workload that can be recognized at compile time. For instance, the combination of key type and offset type is a compile-time workload for radix sort. * **Runtime (rt) Workload** - a workload that can be recognized only at runtime. For instance, the number of keys along with their distribution is a runtime workload for radix sort. * **Parameter** - a parameter that can be tuned to maximize performance for a given device and data type. For instance, the number of threads per block and items per thread are tuning parameters. * **Parameter Space** - the set of all possible values for a given tuning parameter. Parameter Space is specific to algorithm. For instance, the parameter space for the number of threads per block is :math:`\{32, 64, 96, 128, \dots, 1024\}` for radix sort, but :math:`\{32, 64, 128, 256, 512\}` for merge sort. * **Parameter Point** - a concrete value of a tuning parameter. For instance, the parameter point for the number of threads per block is :math:`threads\_per\_block=128`. * **Search Space** - Cartesian product of parameter spaces. For instance, search space for an algorithm with tunable items per thread and threads per block might look like :math:`\{(ipt \times tpb) | ipt \in \{1, \dots, 25\} \text{and} tpb \in \{32, 64, 96, 128, \dots, 1024\}\}`. * **Variant** - a point from corresponding search space. * **Base** - a variant that CUB uses by default. * **Score** - a single number representing the performance for a given compile-time workload and all runtime workloads. For instance, a weighted-sum of speedups of a given variant compared to its base for all runtime workloads is a score. * **Search** - a process consisting of covering all variants for all compile-time workloads to find a variant with maximal score. Contributing Benchmarks -------------------------------------------------------------------------------- There are a few constraints on benchmarks. First of all, all benchmarks in a single file should share type axes. 
Only alphabetical characters, numbers, and underscores are allowed in the benchmark name. The name of the file represents the name of the algorithm. For instance, the :code:`benchmarks/bench/radix_sort/keys.cu` file name is going to be transformed into :code:`cub.bench.radix_sort.keys` that's further used in the infrastructure. You start writing a benchmark by including the :code:`nvbench_helper.cuh` file. It contains all necessary includes and definitions. .. code:: c++ #include The next step is to define a search space. The search space is represented by a number of comments. The format consists of the :code:`%RANGE%` keyword, a parameter name, and a range. The range is represented by three numbers: start, end, and step. For instance, the following code defines a search space for the number of threads per block and items per thread. .. code:: c++ // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 Next, you need to define a benchmark function. The function accepts :code:`nvbench::state &state` and a :code:`nvbench::type_list`. For more details on the benchmark signature, take a look at the nvbench docs. .. code:: c++ template void algname(nvbench::state &state, nvbench::type_list) { Now we have to specialize the dispatch layer. The tuning infrastructure will use the `TUNE_BASE` macro to distinguish between the base and the variant. When base is used, do not specify the policy, so that the default one is used. If the macro is not defined, specify a custom policy using the macro names defined at the search space specification step (see the policy hub sketch at the end of this guide). .. code:: c++ #if TUNE_BASE using dispatch_t = cub::DispatchReduce; #else using policy_t = policy_hub_t; using dispatch_t = cub::DispatchReduce; #endif If possible, do not initialize the input data in the benchmark function. Instead, use the :code:`gen` function. This function will fill the input vector with random data on the GPU with no compile-time overhead. .. code:: c++ const auto elements = static_cast(state.get_int64("Elements{io}")); thrust::device_vector in(elements); thrust::device_vector out(1); gen(seed_t{}, in); You can optionally add memory usage to the state: .. code:: c++ state.add_element_count(elements); state.add_global_memory_reads(elements, "Size"); state.add_global_memory_writes(1); Now we are ready to allocate temporary storage: .. code:: c++ std::size_t temp_size; dispatch_t::Dispatch(nullptr, temp_size, d_in, d_out, static_cast(elements), 0 /* stream */); thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); Finally, we can run the algorithm: .. code:: c++ state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, d_out, static_cast(elements), launch.get_stream()); }); } Having the benchmark function, we can tell nvbench about it. A few things to note here. First of all, compile-time axes should be annotated as :code:`{ct}`. The runtime axes might be optionally annotated as :code:`{io}`, which stands for importance-ordered. This will tell the tuning infrastructure that the later values on the axis are more important. If the axis is not annotated, each value will be treated as equally important. ..
code:: c++ NVBENCH_BENCH_TYPES(algname, NVBENCH_TYPE_AXES(all_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); When you define a type axis that's annotated as :code:`{ct}`, you might want to consider optimizing the build time. Many variants are going to be build, but the search is considering one compile-time use case at a time. This means, that if you have many types to tune for, you'll end up having many specializations that you don't need. To avoid this, for each compile time axis, you can expect a `TUNE_AxisName` macro with the type that's currently being tuned. For instance, if you have a type axes :code:`T{ct}` and :code:`OffsetT` (as shown above), you can use the following construct: .. code:: c++ #ifdef TUNE_T using types = nvbench::type_list; #else using types = all_types; #endif #ifdef TUNE_OffsetT using offset_types = nvbench::type_list; #else using offset_types = nvbench::type_list; #endif This logic is automatically applied to :code:`all_types`, :code:`offset_types`, and :code:`fundamental_types` lists when you use matching names for the axes. You can define your own axis names and use the logic above for them (see sort pairs example). Search Process -------------------------------------------------------------------------------- To get started with tuning / benchmarking, you need to configure CMake. The following options are available: * :code:`CUB_ENABLE_BENCHMARKS` - enable bases (default: OFF). * :code:`CUB_ENABLE_TUNING` - enable variants (default: OFF). Having configured CMake, you can start the search process. Note that the search has to be started from the build directory. .. code:: bash $ cd build $ cmake -DThrust_DIR=path-to-thrust/thrust/cmake -DCUB_ENABLE_TUNING=YES -DCUB_ENABLE_BENCHMARKS=YES -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="90" .. $ ../benchmarks/scripts/search.py -a "T{ct}=[I8,I16]" -R ".*algname.*" Both :code:`-a` and :code:`-R` options are optional. The first one is used to specify types to tune for. The second one is used to specify benchmarks to be tuned. If not specified, all benchmarks are going to be tuned. The result of the search is stored in the :code:`build/cccl_meta_bench.db` file. To analyze the result you can use the :code:`analyze.py` script: .. code:: bash $ ../benchmarks/scripts/analyze.py --coverage cub.bench.radix_sort.keys[T{ct}=I8, OffsetT{ct}=I32] coverage: 167 / 522 (31.9923%) cub.bench.radix_sort.keys[T{ct}=I8, OffsetT{ct}=I64] coverage: 152 / 522 (29.1188%) $ ../benchmarks/scripts/analyze.py --top=5 cub.bench.radix_sort.keys[T{ct}=I8, OffsetT{ct}=I32]: variant score mins means maxs 97 ipt_19.tpb_512 1.141015 1.039052 1.243448 1.679558 84 ipt_18.tpb_512 1.136463 1.030434 1.245825 1.668038 68 ipt_17.tpb_512 1.132696 1.020470 1.250665 1.688889 41 ipt_15.tpb_576 1.124077 1.011560 1.245011 1.722379 52 ipt_16.tpb_512 1.121044 0.995238 1.252378 1.717514 cub.bench.radix_sort.keys[T{ct}=I8, OffsetT{ct}=I64]: variant score mins means maxs 71 ipt_19.tpb_512 1.250941 1.155738 1.321665 1.647868 86 ipt_20.tpb_512 1.250840 1.128940 1.308591 1.612382 55 ipt_17.tpb_512 1.244399 1.152033 1.327424 1.692091 98 ipt_21.tpb_448 1.231045 1.152798 1.298332 1.621110 85 ipt_20.tpb_480 1.229382 1.135447 1.294937 1.631225 $ ../benchmarks/scripts/analyze.py --variant='ipt_(18|19).tpb_512' The last command plots distribution of the elapsed times for the specified variants. 
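As a reference for the variant definition described above, a policy hub that consumes the :code:`%RANGE%` macros might look like the following sketch. This is illustrative only: the names :code:`policy_hub_t` and :code:`accum_t` as well as the fixed vector load length are assumptions, and the exact set of nested policies depends on the algorithm being tuned.

.. code:: c++

    #if !TUNE_BASE
    struct policy_hub_t
    {
      struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
      {
        // Values injected by the tuning infrastructure, one pair per variant
        static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK;
        static constexpr int items_per_thread  = TUNE_ITEMS_PER_THREAD;

        using ReducePolicy =
          cub::AgentReducePolicy<threads_per_block,
                                 items_per_thread,
                                 accum_t, // accumulator type used by the benchmark
                                 4,       // vector load length, fixed here for brevity
                                 cub::BLOCK_REDUCE_WARP_REDUCTIONS,
                                 cub::LOAD_DEFAULT>;

        using SingleTilePolicy      = ReducePolicy;
        using SegmentedReducePolicy = ReducePolicy;
      };

      using MaxPolicy = policy_t;
    };
    #endif // !TUNE_BASE

The variant branch of the dispatch specialization shown earlier then passes this hub as the policy template argument of :code:`cub::DispatchReduce`.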
cccl-2.5.0/cub/docs/warp_wide.rst000066400000000000000000000017751463375617100166650ustar00rootroot00000000000000.. _warp-module: Warp-Wide "Collective" Primitives ================================================== .. toctree:: :glob: :hidden: :maxdepth: 2 ${repo_docs_api_path}/*Warp* CUB warp-level algorithms are specialized for execution by threads in the same CUDA warp. These algorithms may only be invoked by ``1 <= n <= 32`` *consecutive* threads in the same warp: * :cpp:struct:`cub::WarpExchange ` rearranges data partitioned across a CUDA warp * :cpp:class:`cub::WarpLoad ` loads a linear segment of items from memory into a CUDA warp * :cpp:class:`cub::WarpMergeSort ` sorts items partitioned across a CUDA warp * :cpp:struct:`cub::WarpReduce ` computes reduction of items partitioned across a CUDA warp * :cpp:struct:`cub::WarpScan ` computes a prefix scan of items partitioned across a CUDA warp * :cpp:class:`cub::WarpStore ` stores items partitioned across a CUDA warp to a linear segment of memory cccl-2.5.0/cub/examples/000077500000000000000000000000001463375617100150265ustar00rootroot00000000000000cccl-2.5.0/cub/examples/CMakeLists.txt000066400000000000000000000043261463375617100175730ustar00rootroot00000000000000# Create meta targets that build all examples for a single configuration: foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) set(config_meta_target ${config_prefix}.examples) add_custom_target(${config_meta_target}) add_dependencies(${config_prefix}.all ${config_meta_target}) endforeach() ## cub_add_example # # Add an example executable and register it with ctest. # # target_name_var: Variable name to overwrite with the name of the example # target. Useful for post-processing target information per-backend. # example_name: The name of the example minus ".example." For # instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu # would be "cuda.copy". # example_src: The source file that implements the example. # cub_target: The reference cub target with configuration information. 
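#
# Example invocation (illustrative; mirrors how the block/ and device/
# subdirectories below call this helper):
#
#   cub_add_example(target_name block.reduce example_block_reduce.cu ${cub_target})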
# function(cub_add_example target_name_var example_name example_src cub_target) cub_get_target_property(config_prefix ${cub_target} PREFIX) # The actual name of the test's target: set(example_target ${config_prefix}.example.${example_name}) set(${target_name_var} ${example_target} PARENT_SCOPE) # Related target names: set(config_meta_target ${config_prefix}.examples) set(example_meta_target cub.all.example.${example_name}) add_executable(${example_target} "${example_src}") target_link_libraries(${example_target} ${cub_target}) cub_clone_target_properties(${example_target} ${cub_target}) cub_configure_cuda_target(${example_target} RDC ${CUB_FORCE_RDC}) target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples") if (CUB_IN_THRUST) thrust_fix_clang_nvcc_build_for(${example_target}) endif() # Add to the active configuration's meta target add_dependencies(${config_meta_target} ${example_target}) # Meta target that builds examples with this name for all configurations: if (NOT TARGET ${example_meta_target}) add_custom_target(${example_meta_target}) endif() add_dependencies(${example_meta_target} ${example_target}) add_test(NAME ${example_target} COMMAND "$" ) endfunction() add_subdirectory(cmake) add_subdirectory(block) add_subdirectory(device) cccl-2.5.0/cub/examples/block/000077500000000000000000000000001463375617100161205ustar00rootroot00000000000000cccl-2.5.0/cub/examples/block/.gitignore000066400000000000000000000001051463375617100201040ustar00rootroot00000000000000/bin /Debug /Release /cuda55.sdf /cuda55.suo /cuda60.sdf /cuda60.suo cccl-2.5.0/cub/examples/block/CMakeLists.txt000066400000000000000000000007271463375617100206660ustar00rootroot00000000000000file(GLOB_RECURSE example_srcs RELATIVE "${CMAKE_CURRENT_LIST_DIR}" CONFIGURE_DEPENDS example_*.cu ) foreach (cub_target IN LISTS CUB_TARGETS) foreach (example_src IN LISTS example_srcs) get_filename_component(example_name "${example_src}" NAME_WE) string(REGEX REPLACE "^example_block_" "block." example_name "${example_name}" ) cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) endforeach() endforeach() cccl-2.5.0/cub/examples/block/example_block_radix_sort.cu000066400000000000000000000241421463375617100235170ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockRadixSort * * To compile using the command line: * nvcc -arch=sm_XX example_block_radix_sort.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Timing iterations int g_timing_iterations = 100; /// Default grid size int g_grid_size = 1; /// Uniform key samples bool g_uniform_keys; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide sorting over integers */ template __launch_bounds__(BLOCK_THREADS) __global__ void BlockSortKernel(Key* d_in, // Tile of input Key* d_out, // Tile of output clock_t* d_elapsed) // Elapsed cycle count of block scan { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared // memory to a blocked arrangement) typedef BlockLoad BlockLoadT; // Specialize BlockRadixSort type for our thread block typedef BlockRadixSort BlockRadixSortT; // Shared memory __shared__ union TempStorage { typename BlockLoadT::TempStorage load; typename BlockRadixSortT::TempStorage sort; } temp_storage; // Per-thread tile items Key items[ITEMS_PER_THREAD]; // Our current block's offset int block_offset = blockIdx.x * TILE_SIZE; // Load items into a blocked arrangement BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); // Barrier for smem reuse __syncthreads(); // Start cycle timer clock_t start = clock(); // Sort keys BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); // Stop cycle timer clock_t stop = clock(); // Store output in striped fashion StoreDirectStriped(threadIdx.x, d_out + block_offset, items); // Store elapsed clocks if (threadIdx.x == 0) { d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start; } } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize sorting problem (and solution). 
*/ template void Initialize(Key* h_in, Key* h_reference, int num_items, int tile_size) { for (int i = 0; i < num_items; ++i) { if (g_uniform_keys) { h_in[i] = 0; } else { RandomBits(h_in[i]); } h_reference[i] = h_in[i]; } // Only sort the first tile std::sort(h_reference, h_reference + tile_size); } /** * Test BlockScan */ template void Test() { constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays Key* h_in = new Key[TILE_SIZE * g_grid_size]; Key* h_reference = new Key[TILE_SIZE * g_grid_size]; clock_t* h_elapsed = new clock_t[g_grid_size]; // Initialize problem and reference output on host Initialize(h_in, h_reference, TILE_SIZE * g_grid_size, TILE_SIZE); // Initialize device arrays Key* d_in = NULL; Key* d_out = NULL; clock_t* d_elapsed = NULL; CubDebugExit(cudaMalloc((void**) &d_in, sizeof(Key) * TILE_SIZE * g_grid_size)); CubDebugExit(cudaMalloc((void**) &d_out, sizeof(Key) * TILE_SIZE * g_grid_size)); CubDebugExit(cudaMalloc((void**) &d_elapsed, sizeof(clock_t) * g_grid_size)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) { std::cout << h_in[i] << ", "; } printf("\n\n"); } // Kernel props int max_sm_occupancy; CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel, BLOCK_THREADS)); // Copy problem to device CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, cudaMemcpyHostToDevice)); printf( "BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n", TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); fflush(stdout); // Run kernel once to prime caches and check result BlockSortKernel<<>>(d_in, d_out, d_elapsed); // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Check results printf("\tOutput items: "); int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); fflush(stdout); // Run this several times and average the performance results GpuTimer timer; float elapsed_millis = 0.0; unsigned long long elapsed_clocks = 0; for (int i = 0; i < g_timing_iterations; ++i) { timer.Start(); // Run kernel BlockSortKernel<<>>(d_in, d_out, d_elapsed); timer.Stop(); elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost)); for (int j = 0; j < g_grid_size; j++) { elapsed_clocks += h_elapsed[j]; } } // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; double avg_clocks = double(elapsed_clocks) / g_timing_iterations / g_grid_size; double avg_clocks_per_item = avg_clocks / TILE_SIZE; printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks); printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item); printf("\tAverage kernel millis: %.4f\n", avg_millis); printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); fflush(stdout); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (h_elapsed) { delete[] h_elapsed; } if (d_in) { CubDebugExit(cudaFree(d_in)); } if (d_out) { CubDebugExit(cudaFree(d_out)); } if (d_elapsed) { CubDebugExit(cudaFree(d_elapsed)); } } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); g_uniform_keys = args.CheckCmdLineFlag("uniform"); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i=]" "[--grid-size=]" "[--v] " "\n", argv[0], g_timing_iterations, g_grid_size); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); fflush(stdout); // Run tests printf("\nuint32:\n"); fflush(stdout); Test(); printf("\n"); fflush(stdout); printf("\nfp32:\n"); fflush(stdout); Test(); printf("\n"); fflush(stdout); printf("\nuint8:\n"); fflush(stdout); Test(); printf("\n"); fflush(stdout); return 0; } cccl-2.5.0/cub/examples/block/example_block_reduce.cu000066400000000000000000000220231463375617100226040ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockReduce * * To compile using the command line: * nvcc -arch=sm_XX example_block_reduce.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Timing iterations int g_timing_iterations = 100; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide reduction. */ template __global__ void BlockReduceKernel(int* d_in, // Tile of input int* d_out, // Tile aggregate clock_t* d_elapsed) // Elapsed cycle count of block reduction { // Specialize BlockReduce type for our thread block typedef BlockReduce BlockReduceT; // Shared memory __shared__ typename BlockReduceT::TempStorage temp_storage; // Per-thread tile data int data[ITEMS_PER_THREAD]; LoadDirectStriped(threadIdx.x, d_in, data); // Start cycle timer clock_t start = clock(); // Compute sum int aggregate = BlockReduceT(temp_storage).Sum(data); // Stop cycle timer clock_t stop = clock(); // Store aggregate and elapsed clocks if (threadIdx.x == 0) { *d_elapsed = (start > stop) ? start - stop : stop - start; *d_out = aggregate; } } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize reduction problem (and solution). 
* Returns the aggregate */ int Initialize(int* h_in, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; inclusive += h_in[i]; } return inclusive; } /** * Test thread block reduction */ template void Test() { constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays int* h_in = new int[TILE_SIZE]; int* h_gpu = new int[TILE_SIZE + 1]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, TILE_SIZE); // Initialize device arrays int* d_in = NULL; int* d_out = NULL; clock_t* d_elapsed = NULL; cudaMalloc((void**) &d_in, sizeof(int) * TILE_SIZE); cudaMalloc((void**) &d_out, sizeof(int) * 1); cudaMalloc((void**) &d_elapsed, sizeof(clock_t)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) { printf("%d, ", h_in[i]); } printf("\n\n"); } // Kernel props int max_sm_occupancy; CubDebugExit( MaxSmOccupancy(max_sm_occupancy, BlockReduceKernel, BLOCK_THREADS)); // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d " "SM occupancy):\n", (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS", TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); // Run kernel BlockReduceKernel<<>>(d_in, d_out, d_elapsed); // Check total aggregate printf("\tAggregate: "); int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Run this several times and average the performance results GpuTimer timer; float elapsed_millis = 0.0; clock_t elapsed_clocks = 0; for (int i = 0; i < g_timing_iterations; ++i) { // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); timer.Start(); // Run kernel BlockReduceKernel <<>>(d_in, d_out, d_elapsed); timer.Stop(); elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device clock_t clocks; CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost)); elapsed_clocks += clocks; } // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; float avg_clocks = float(elapsed_clocks) / g_timing_iterations; float avg_clocks_per_item = avg_clocks / TILE_SIZE; printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks); printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item); printf("\tAverage kernel millis: %.4f\n", avg_millis); printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); // Cleanup if (h_in) { delete[] h_in; } if (h_gpu) { delete[] h_gpu; } if (d_in) { cudaFree(d_in); } if (d_out) { cudaFree(d_out); } if (d_elapsed) { cudaFree(d_elapsed); } } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i=] " "[--grid-size=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device 
CubDebugExit(args.DeviceInit()); // Run tests Test<1024, 1, BLOCK_REDUCE_RAKING>(); Test<512, 2, BLOCK_REDUCE_RAKING>(); Test<256, 4, BLOCK_REDUCE_RAKING>(); Test<128, 8, BLOCK_REDUCE_RAKING>(); Test<64, 16, BLOCK_REDUCE_RAKING>(); Test<32, 32, BLOCK_REDUCE_RAKING>(); Test<16, 64, BLOCK_REDUCE_RAKING>(); printf("-------------\n"); Test<1024, 1, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<512, 2, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<256, 4, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<128, 8, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<64, 16, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<32, 32, BLOCK_REDUCE_WARP_REDUCTIONS>(); Test<16, 64, BLOCK_REDUCE_WARP_REDUCTIONS>(); return 0; } cccl-2.5.0/cub/examples/block/example_block_reduce_dyn_smem.cu000066400000000000000000000156461463375617100245140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockReduce with dynamic shared memory * * To compile using the command line: * nvcc -arch=sm_XX example_block_reduce_dyn_smem.cu -I../.. 
-lcudart -O3 -std=c++14 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include "../../test/test_util.h" #include // Some implementation details rely on c++14 #if _CCCL_STD_VER >= 2014 using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide reduction. */ template __global__ void BlockReduceKernel(int* d_in, // Tile of input int* d_out // Tile aggregate ) { // Specialize BlockReduce type for our thread block using BlockReduceT = cub::BlockReduce; using TempStorageT = typename BlockReduceT::TempStorage; union ShmemLayout { TempStorageT reduce; int aggregate; }; // shared memory byte-array extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; // cast to lvalue reference of expected type auto& temp_storage = reinterpret_cast(smem); int data = d_in[threadIdx.x]; // Compute sum int aggregate = BlockReduceT(temp_storage).Sum(data); // block-wide sync barrier necessary to re-use shared mem safely __syncthreads(); int* smem_integers = reinterpret_cast(smem); if (threadIdx.x == 0) { smem_integers[0] = aggregate; } // sync to make new shared value available to all threads __syncthreads(); aggregate = smem_integers[0]; // all threads write the aggregate to output d_out[threadIdx.x] = aggregate; } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize reduction problem (and solution). * Returns the aggregate */ int Initialize(int* h_in, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; inclusive += h_in[i]; } return inclusive; } /** * Test thread block reduction */ template void Test() { // Allocate host arrays int* h_in = new int[BLOCK_THREADS]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, BLOCK_THREADS); // Initialize device arrays int* d_in = NULL; int* d_out = NULL; cudaMalloc((void**) &d_in, sizeof(int) * BLOCK_THREADS); cudaMalloc((void**) &d_out, sizeof(int) * BLOCK_THREADS); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < BLOCK_THREADS; i++) { printf("%d, ", h_in[i]); } printf("\n\n"); } // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice); // determine necessary storage size: auto block_reduce_temp_bytes = sizeof(typename cub::BlockReduce::TempStorage); // finally, we need to make sure that we can hold at least one integer // needed in the kernel to exchange data after reduction auto smem_size = (std::max)(1 * sizeof(int), block_reduce_temp_bytes); // use default stream cudaStream_t stream = NULL; // Run reduction kernel BlockReduceKernel<<>>(d_in, d_out); // Check total aggregate printf("\tAggregate: "); int compare = 0; for (int i = 0; i < BLOCK_THREADS; i++) { compare = compare || CompareDeviceResults(&h_aggregate, d_out + i, 1, g_verbose, g_verbose); } printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Cleanup if (h_in) { delete[] h_in; } if (d_in) { cudaFree(d_in); } if (d_out) { cudaFree(d_out); } } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--grid-size=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Run tests Test<1024>(); Test<512>(); Test<256>(); Test<128>(); Test<64>(); Test<32>(); Test<16>(); return 0; } #else // < C++14 int main() {} #endif cccl-2.5.0/cub/examples/block/example_block_scan.cu000066400000000000000000000253121463375617100222650ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockScan * * To compile using the command line: * nvcc -arch=sm_XX example_block_scan.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Timing iterations int g_timing_iterations = 100; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide exclusive prefix sum over integers */ template __global__ void BlockPrefixSumKernel(int* d_in, // Tile of input int* d_out, // Tile of output clock_t* d_elapsed) // Elapsed cycle count of block scan { // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared // memory to a blocked arrangement) typedef BlockLoad BlockLoadT; // Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared // memory to a blocked arrangement) typedef BlockStore BlockStoreT; // Specialize BlockScan type for our thread block typedef BlockScan BlockScanT; // Shared memory __shared__ union TempStorage { typename BlockLoadT::TempStorage load; typename BlockStoreT::TempStorage store; typename BlockScanT::TempStorage scan; } temp_storage; // Per-thread tile data int data[ITEMS_PER_THREAD]; // Load items into a blocked arrangement BlockLoadT(temp_storage.load).Load(d_in, data); // Barrier for smem reuse __syncthreads(); // Start cycle timer clock_t start = clock(); // Compute exclusive prefix sum int aggregate; BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate); // Stop cycle timer clock_t stop = clock(); // Barrier for smem reuse __syncthreads(); // Store items from a blocked arrangement BlockStoreT(temp_storage.store).Store(d_out, data); // Store aggregate and elapsed clocks if (threadIdx.x == 0) { *d_elapsed = (start > stop) ? start - stop : stop - start; d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate; } } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize exclusive prefix sum problem (and solution). 
* Returns the aggregate */ int Initialize(int* h_in, int* h_reference, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; h_reference[i] = inclusive; inclusive += h_in[i]; } return inclusive; } /** * Test thread block scan */ template void Test() { constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays int* h_in = new int[TILE_SIZE]; int* h_reference = new int[TILE_SIZE]; int* h_gpu = new int[TILE_SIZE + 1]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE); // Initialize device arrays int* d_in = NULL; int* d_out = NULL; clock_t* d_elapsed = NULL; cudaMalloc((void**) &d_in, sizeof(int) * TILE_SIZE); cudaMalloc((void**) &d_out, sizeof(int) * (TILE_SIZE + 1)); cudaMalloc((void**) &d_elapsed, sizeof(clock_t)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) { printf("%d, ", h_in[i]); } printf("\n\n"); } // Kernel props int max_sm_occupancy; CubDebugExit( MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel, BLOCK_THREADS)); // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); printf( "BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM " "occupancy):\n", (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS", TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); // Run aggregate/prefix kernel BlockPrefixSumKernel <<>>(d_in, d_out, d_elapsed); // Check results printf("\tOutput items: "); int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check total aggregate printf("\tAggregate: "); compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose); printf("%s\n", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Run this several times and average the performance results GpuTimer timer; float elapsed_millis = 0.0; clock_t elapsed_clocks = 0; for (int i = 0; i < g_timing_iterations; ++i) { // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); timer.Start(); // Run aggregate/prefix kernel BlockPrefixSumKernel <<>>(d_in, d_out, d_elapsed); timer.Stop(); elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device clock_t clocks; CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost)); elapsed_clocks += clocks; } // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; float avg_clocks = float(elapsed_clocks) / g_timing_iterations; float avg_clocks_per_item = avg_clocks / TILE_SIZE; printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks); printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item); printf("\tAverage kernel millis: %.4f\n", avg_millis); printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (h_gpu) { delete[] h_gpu; } if (d_in) { cudaFree(d_in); } if (d_out) { cudaFree(d_out); } if (d_elapsed) { cudaFree(d_elapsed); } } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i=]" "[--grid-size=]" "[--v] " "\n", argv[0], g_timing_iterations, g_grid_size); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Run tests Test<1024, 1, BLOCK_SCAN_RAKING>(); Test<512, 2, BLOCK_SCAN_RAKING>(); Test<256, 4, BLOCK_SCAN_RAKING>(); Test<128, 8, BLOCK_SCAN_RAKING>(); Test<64, 16, BLOCK_SCAN_RAKING>(); Test<32, 32, BLOCK_SCAN_RAKING>(); printf("-------------\n"); Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>(); printf("-------------\n"); Test<1024, 1, BLOCK_SCAN_WARP_SCANS>(); Test<512, 2, BLOCK_SCAN_WARP_SCANS>(); Test<256, 4, BLOCK_SCAN_WARP_SCANS>(); Test<128, 8, BLOCK_SCAN_WARP_SCANS>(); Test<64, 16, BLOCK_SCAN_WARP_SCANS>(); Test<32, 32, BLOCK_SCAN_WARP_SCANS>(); return 0; } cccl-2.5.0/cub/examples/cmake/000077500000000000000000000000001463375617100161065ustar00rootroot00000000000000cccl-2.5.0/cub/examples/cmake/CMakeLists.txt000066400000000000000000000005471463375617100206540ustar00rootroot00000000000000add_test( NAME cub.example.cmake.add_subdir COMMAND "${CMAKE_COMMAND}" --log-level=VERBOSE -G "${CMAKE_GENERATOR}" -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir" -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir" -D "CUB_ROOT=${CUB_SOURCE_DIR}" -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" ) 
cccl-2.5.0/cub/examples/cmake/add_subdir/000077500000000000000000000000001463375617100202065ustar00rootroot00000000000000cccl-2.5.0/cub/examples/cmake/add_subdir/CMakeLists.txt000066400000000000000000000015261463375617100227520ustar00rootroot00000000000000# This example demonstrates / tests adding CUB via a CMake add_subdirectory # call from a parent project. cmake_minimum_required(VERSION 3.15) # Silence warnings about empty CUDA_ARCHITECTURES properties on example targets: if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) cmake_policy(SET CMP0104 OLD) endif() project(CubAddSubDirExample LANGUAGES CUDA) # Use your project's checkout of CUB here, for most cases # `add_subdirectory(cub)` will be sufficient. add_subdirectory("${CUB_ROOT}" cub) # Link the CUB::CUB target to your project's targets add_executable(HelloCUB dummy.cu) target_link_libraries(HelloCUB CUB::CUB) # # Validation # function(assert_target target_name) if (NOT TARGET "${target_name}") message(FATAL_ERROR "Target '${target_name}' not defined.") endif() endfunction() assert_target(CUB::CUB) assert_target(HelloCUB) cccl-2.5.0/cub/examples/cmake/add_subdir/dummy.cu000066400000000000000000000002011463375617100216630ustar00rootroot00000000000000#include #include int main() { std::cout << "Hello from CUB version " << CUB_VERSION << ":\n"; } cccl-2.5.0/cub/examples/device/000077500000000000000000000000001463375617100162655ustar00rootroot00000000000000cccl-2.5.0/cub/examples/device/.gitignore000066400000000000000000000001131463375617100202500ustar00rootroot00000000000000/bin /Debug /ipch /Release /cuda55.sdf /cuda55.suo /cuda60.sdf /cuda60.suo cccl-2.5.0/cub/examples/device/CMakeLists.txt000066400000000000000000000007311463375617100210260ustar00rootroot00000000000000file(GLOB_RECURSE example_srcs RELATIVE "${CMAKE_CURRENT_LIST_DIR}" CONFIGURE_DEPENDS example_*.cu ) foreach (cub_target IN LISTS CUB_TARGETS) foreach (example_src IN LISTS example_srcs) get_filename_component(example_name "${example_src}" NAME_WE) string(REGEX REPLACE "^example_device_" "device." example_name "${example_name}" ) cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) endforeach() endforeach() cccl-2.5.0/cub/examples/device/example_device_decoupled_look_back.cu000066400000000000000000000112701463375617100256210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include template __global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid) { tile_state.InitializeStatus(blocks_in_grid); } template __global__ void decoupled_look_back_kernel(cub::ScanTileState tile_state) { using scan_op_t = cub::Sum; using scan_tile_state_t = cub::ScanTileState; using tile_prefix_op = cub::TilePrefixCallbackOp; using temp_storage_t = typename tile_prefix_op::TempStorage; // Allocate temp storage in shared memory __shared__ temp_storage_t temp_storage; scan_op_t scan_op{}; constexpr unsigned int threads_in_warp = 32; const unsigned int tid = threadIdx.x; // Construct prefix op tile_prefix_op prefix(tile_state, temp_storage, scan_op); const unsigned int tile_idx = prefix.GetTileIdx(); // Compute block aggregate MessageT block_aggregate = blockIdx.x; if (tile_idx == 0) { // There are no blocks to look back to, immediately set the inclusive state if (tid == 0) { tile_state.SetInclusive(tile_idx, block_aggregate); printf("tile %d: inclusive = %d\n", tile_idx, block_aggregate); } } else { // Only the first warp in the block can perform the look back const unsigned int warp_id = tid / threads_in_warp; if (warp_id == 0) { // Perform the decoupled look-back // Invocation of the prefix will block until the look-back is complete. MessageT exclusive_prefix = prefix(block_aggregate); if (tid == 0) { MessageT inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); printf("tile %d: exclusive = %d inclusive = %d\n", tile_idx, exclusive_prefix, inclusive_prefix); } } } } template void decoupled_look_back_example(int blocks_in_grid) { using scan_tile_state_t = cub::ScanTileState; // Query temporary storage requirements std::size_t temp_storage_bytes{}; scan_tile_state_t::AllocationSize(blocks_in_grid, temp_storage_bytes); // Allocate temporary storage thrust::device_vector temp_storage(temp_storage_bytes); std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // Initialize temporary storage scan_tile_state_t tile_status; tile_status.Init(blocks_in_grid, d_temp_storage, temp_storage_bytes); constexpr unsigned int threads_in_init_block = 256; const unsigned int blocks_in_init_grid = cub::DivideAndRoundUp(blocks_in_grid, threads_in_init_block); init_kernel<<>>(tile_status, blocks_in_grid); // Launch decoupled look-back constexpr unsigned int threads_in_block = 256; decoupled_look_back_kernel<<>>(tile_status); // Wait for kernel to finish cudaDeviceSynchronize(); } int main() { decoupled_look_back_example(14); } cccl-2.5.0/cub/examples/device/example_device_partition_flagged.cu000066400000000000000000000172321463375617100253370ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DevicePartition::Flagged(). * * Partition flagged items from from a sequence of int keys using a * corresponding sequence of unsigned char flags. * * To compile using the command line: * nvcc -arch=sm_XX example_device_partition_flagged.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting flags at distances of random length * chosen from [1..max_segment] */ void Initialize(int* h_in, unsigned char* h_flags, int num_items, int max_segment) { unsigned short max_short = (unsigned short) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_flags[j] = 0; h_in[j] = key; j++; } h_flags[i] = 1; i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("Flags:\n"); DisplayResults(h_flags, num_items); printf("\n\n"); } } /** * Solve unique problem */ int Solve(int* h_in, unsigned char* h_flags, int* h_reference, int num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if (h_flags[i]) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; unsigned char* h_flags = new unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); int num_selected = Solve(h_in, h_flags, h_reference, num_items); printf("cub::DevicePartition::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? 
num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int* d_in = NULL; unsigned char* d_flags = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_in, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_flags, sizeof(unsigned char) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int* d_out = NULL; int* d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_num_selected_out, sizeof(int))); // Allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit( DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit( DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_num_selected_out) { CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } if (d_in) { CubDebugExit(g_allocator.DeviceFree(d_in)); } if (d_flags) { CubDebugExit(g_allocator.DeviceFree(d_flags)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_partition_if.cu000066400000000000000000000173601463375617100243460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
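// A minimal host sketch of the output layout that the flagged-partition example above checks
// against: selected items keep their input order at the front of the output, while rejected
// items are written from the back (and therefore appear in reverse input order). This mirrors
// the example's Solve() reference; the function name and sample values are illustrative only.
#include <cstddef>
#include <vector>

std::vector<int> reference_flagged_partition(const std::vector<int>& in, const std::vector<unsigned char>& flags)
{
  std::vector<int> out(in.size());
  std::size_t num_selected = 0;
  std::size_t tail         = in.size();
  for (std::size_t i = 0; i < in.size(); ++i)
  {
    if (flags[i])
    {
      out[num_selected++] = in[i]; // selected items: stable order, front of the output
    }
    else
    {
      out[--tail] = in[i]; // rejected items: filled from the back, reverse input order
    }
  }
  return out; // e.g. in = {0, 1, 2, 3}, flags = {1, 0, 1, 0}  ->  out = {0, 2, 3, 1}
}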
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DevicePartition::If(). * * Partitions items from from a sequence of int keys using a * section functor (greater-than) * * To compile using the command line: * nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory /// Selection functor type struct GreaterThan { int compare; __host__ __device__ __forceinline__ GreaterThan(int compare) : compare(compare) {} __host__ __device__ __forceinline__ bool operator()(const int& a) const { return (a > compare); } }; //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting runs of random length chosen from [1..max_segment] */ void Initialize(int* h_in, int num_items, int max_segment) { int key = 0; int i = 0; while (i < num_items) { // Randomly select number of repeating occurrences uniformly from [1..max_segment] unsigned short max_short = (unsigned short) -1; unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_in[j] = key; j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve unique problem */ template int Solve(int* h_in, SelectOp select_op, int* h_reference, int num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if (select_op(h_in[i])) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device 
CubDebugExit(args.DeviceInit()); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; // DevicePartition a pivot index unsigned int pivot_index; unsigned int max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); fflush(stdout); // Initialize problem and solution Initialize(h_in, num_items, max_segment); GreaterThan select_op(h_in[pivot_index]); int num_selected = Solve(h_in, select_op, h_reference, num_items); printf("cub::DevicePartition::If %d items, %d selected (avg run length %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int* d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int* d_out = NULL; int* d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_num_selected_out, sizeof(int))); // Allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit( DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit( DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (d_in) { CubDebugExit(g_allocator.DeviceFree(d_in)); } if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_num_selected_out) { CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_radix_sort.cu000066400000000000000000000167401463375617100240360ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
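// A hedged host cross-check sketch for the If-based partition example above: the selected
// prefix and the selected count can be reproduced with std::stable_partition, which keeps the
// selected items in their original order just like the example's reference. The rejected tail
// is deliberately not compared here, since the reference stores it in reverse input order.
// The type and function names below are illustrative only.
#include <algorithm>
#include <cstddef>
#include <vector>

struct greater_than_host
{
  int compare;
  bool operator()(int a) const
  {
    return a > compare;
  }
};

std::size_t selected_prefix_reference(std::vector<int> data, int pivot_value, std::vector<int>& selected_out)
{
  greater_than_host op{pivot_value};
  auto middle = std::stable_partition(data.begin(), data.end(), op); // selected items stay in input order
  selected_out.assign(data.begin(), middle);
  return selected_out.size(); // should agree with *d_num_selected_out in the example
}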
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceRadixSort::SortPairs(). * * Sorts an array of float keys paired with a corresponding array of int values. * * To compile using the command line: * nvcc -arch=sm_XX example_device_radix_sort.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Simple key-value pairing for floating point types. * Treats positive and negative zero as equivalent. */ struct Pair { float key; int value; bool operator<(const Pair& b) const { return key < b.key; } }; /** * Initialize key-value sorting problem. 
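// A short host sketch of the stability property the radix-sort reference above relies on:
// std::stable_sort keeps the original relative order of pairs whose keys compare equal, which
// is what makes it a valid reference for a stable key-value radix sort. A std::pair-based
// stand-in is used here so the snippet stays self-contained; it is illustrative only.
#include <algorithm>
#include <utility>
#include <vector>

void sort_pairs_reference(std::vector<std::pair<float, int>>& pairs)
{
  std::stable_sort(pairs.begin(), pairs.end(), [](const std::pair<float, int>& a, const std::pair<float, int>& b) {
    return a.first < b.first; // compare keys only; equal keys keep their input value order
  });
}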
*/ void Initialize(float* h_keys, int* h_values, float* h_reference_keys, int* h_reference_values, int num_items) { Pair* h_pairs = new Pair[num_items]; for (int i = 0; i < num_items; ++i) { RandomBits(h_keys[i]); RandomBits(h_values[i]); h_pairs[i].key = h_keys[i]; h_pairs[i].value = h_values[i]; } if (g_verbose) { printf("Input keys:\n"); DisplayResults(h_keys, num_items); printf("\n\n"); printf("Input values:\n"); DisplayResults(h_values, num_items); printf("\n\n"); } std::stable_sort(h_pairs, h_pairs + num_items); for (int i = 0; i < num_items; ++i) { h_reference_keys[i] = h_pairs[i].key; h_reference_values[i] = h_pairs[i].value; } delete[] h_pairs; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("cub::DeviceRadixSort::SortPairs() %d items (%d-byte keys %d-byte values)\n", num_items, int(sizeof(float)), int(sizeof(int))); fflush(stdout); // Allocate host arrays float* h_keys = new float[num_items]; float* h_reference_keys = new float[num_items]; int* h_values = new int[num_items]; int* h_reference_values = new int[num_items]; // Initialize problem and solution on host Initialize(h_keys, h_values, h_reference_keys, h_reference_values, num_items); // Allocate device arrays DoubleBuffer d_keys; DoubleBuffer d_values; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_keys.d_buffers[0], sizeof(float) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_keys.d_buffers[1], sizeof(float) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_values.d_buffers[0], sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_values.d_buffers[1], sizeof(int) * num_items)); // Allocate temporary storage size_t temp_storage_bytes = 0; void* d_temp_storage = NULL; CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Initialize device arrays CubDebugExit( cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice)); CubDebugExit( cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Run CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose); printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS"); AssertEquals(0, compare); compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose); printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_keys) { delete[] h_keys; } if (h_reference_keys) { delete[] h_reference_keys; } if (h_values) { delete[] h_values; } if (h_reference_values) { delete[] h_reference_values; } if (d_keys.d_buffers[0]) { CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0])); } if (d_keys.d_buffers[1]) { CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1])); } if (d_values.d_buffers[0]) { CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0])); } if (d_values.d_buffers[1]) { CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1])); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_radix_sort_custom.cu000066400000000000000000000231421463375617100254220ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include "cub/block/radix_rank_sort_operations.cuh" struct custom_t { std::uint16_t i; float f; }; struct decomposer_t { __host__ __device__ // ::cuda::std::tuple operator()(custom_t& key) const { return {key.i, key.f}; } }; std::bitset<64> to_binary_representation(custom_t value) { std::uint64_t bits{}; memcpy(&bits, &value, sizeof(custom_t)); return std::bitset<64>{bits}; } int main() { std::cout << "This example illustrates use of radix sort with custom type.\n"; std::cout << "Let's define a simple structure of the following form:\n\n"; std::cout << "\tstruct custom_t {\n"; std::cout << "\t std::uint32_t i;\n"; std::cout << "\t float f;\n"; std::cout << "\t};\n\n"; std::cout << "The `i` field is already stored in the bit-lexicographical order.\n"; std::cout << "The `f` field, however, isn't. 
Therefore, to feed this structure \n"; std::cout << "into the radix sort, we have to convert `f` into bit ordered representation.\n"; std::cout << "The `custom_t{65535, -4.2f}` has the following binary representation:\n\n"; auto print_segment = [](std::string msg, std::size_t segment_size, char filler = '-') { std::string spaces((segment_size - msg.size()) / 2 - 1, filler); std::cout << '<' << spaces << msg << spaces << '>'; }; std::cout << '\t'; print_segment(" `.f` ", 32); print_segment(" padding -", 16); print_segment(" `.s` ", 16); std::cout << '\n'; std::cout << "\ts"; print_segment(" exp. ", 8); print_segment(" mantissa -", 23); print_segment(" padding -", 16); print_segment(" short -", 16); std::cout << '\n'; custom_t the_answer{65535, -4.2f}; std::cout << '\t' << to_binary_representation(the_answer); std::cout << "\n\t"; print_segment(" <---- higher bits / lower bits ----> ", 64, ' '); std::cout << "\n\n"; std::cout << "Let's say we are trying to compare l={42, -4.2f} with g={42, 4.2f}:\n"; std::cout << "\n\t"; print_segment(" `.f` ", 32); print_segment(" padding -", 16); print_segment(" `.s` ", 16); std::cout << '\n'; custom_t l{42, -4.2f}; custom_t g{42, 4.2f}; std::cout << "l:\t" << to_binary_representation(l) << '\n'; std::cout << "g:\t" << to_binary_representation(g) << "\n\n"; std::cout << "As you can see, `l` key happened to be larger in the bit-lexicographicl order.\n"; std::cout << "Since there's no reflection in C++, we can't inspect the type and convert \n"; std::cout << "each field into the bit-lexicographicl order. You can tell CUB how to do that\n"; std::cout << "by specializing cub::RadixTraits for the `custom_t`:\n\n"; std::cout << "\tstruct decomposer_t \n"; std::cout << "\t{\n"; std::cout << "\t __host__ __device__ \n"; std::cout << "\t ::cuda::std::tuple operator()(custom_t &key) const \n"; std::cout << "\t {\n"; std::cout << "\t return {key.i, key.f};\n"; std::cout << "\t }\n"; std::cout << "\t};\n"; std::cout << "\n"; std::cout << "Decomposer allows you to specify which fields are most significant and which\n"; std::cout << "are least significant. In our case, `f` is the most significant field and\n"; std::cout << "`i` is the least significant field. 
The decomposer is then used by CUB to convert\n"; std::cout << "the `custom_t` into the bit-lexicographicl order:\n\n"; using conversion_policy = cub::detail::radix::traits_t::bit_ordered_conversion_policy; l = conversion_policy::to_bit_ordered(decomposer_t{}, l); g = conversion_policy::to_bit_ordered(decomposer_t{}, g); std::cout << "\n\t"; print_segment(" `.f` ", 32); print_segment(" padding -", 16); print_segment(" `.s` ", 16); std::cout << '\n'; std::cout << "l:\t" << to_binary_representation(l) << '\n'; std::cout << "g:\t" << to_binary_representation(g) << "\n\n"; std::cout << '\n'; std::cout << "As you can see, `g` is now actually larger than `l` in the bit-lexicographicl order.\n"; std::cout << "After binning, CUB is able to restore the original key:\n\n"; l = conversion_policy::from_bit_ordered(decomposer_t{}, l); g = conversion_policy::from_bit_ordered(decomposer_t{}, g); std::cout << "\n\t"; print_segment(" `.f` ", 32); print_segment(" padding -", 16); print_segment(" `.s` ", 16); std::cout << '\n'; std::cout << "l:\t" << to_binary_representation(l) << '\n'; std::cout << "g:\t" << to_binary_representation(g) << "\n\n"; using inversion_policy = cub::detail::radix::traits_t::bit_ordered_inversion_policy; std::cout << '\n'; std::cout << "We are also able to inverse differentiating bits:\n"; l = inversion_policy::inverse(decomposer_t{}, l); g = inversion_policy::inverse(decomposer_t{}, g); std::cout << "\n\t"; print_segment(" `.f` ", 32); print_segment(" padding -", 16); print_segment(" `.s` ", 16); std::cout << '\n'; std::cout << "l:\t" << to_binary_representation(l) << '\n'; std::cout << "g:\t" << to_binary_representation(g) << "\n\n"; std::cout << '\n'; std::cout << "We as well can compute the minimal and minimal / maximal keys:\n"; l = cub::detail::radix::traits_t::min_raw_binary_key(decomposer_t{}); g = cub::detail::radix::traits_t::max_raw_binary_key(decomposer_t{}); std::cout << "\n\t"; print_segment(" `.f` ", 32); print_segment(" padding -", 16); print_segment(" `.s` ", 16); std::cout << '\n'; std::cout << "l:\t" << to_binary_representation(l) << '\n'; std::cout << "g:\t" << to_binary_representation(g) << "\n\n"; std::cout << "We can even compute the number of differentiating bits:\n\n"; std::cout << "end:\t"; std::cout << cub::detail::radix::traits_t::default_end_bit(decomposer_t{}); std::cout << '\n'; std::cout << "size:\t"; std::cout << sizeof(custom_t) * CHAR_BIT; std::cout << "\n\n"; std::cout << "All of these operations are used behind the scenes by CUB to sort custom types:\n\n"; constexpr int num_items = 6; thrust::device_vector in = {{4, +2.5f}, {0, -2.5f}, {3, +1.1f}, {1, +0.0f}, {2, -0.0f}, {5, +3.7f}}; std::cout << "in:\n"; for (custom_t key : in) { std::cout << "\t{.i = " << key.i << ", .f = " << key.f << "},\n"; } thrust::device_vector out(num_items); const custom_t* d_in = thrust::raw_pointer_cast(in.data()); custom_t* d_out = thrust::raw_pointer_cast(out.data()); // 1) Get temp storage size std::uint8_t* d_temp_storage{}; std::size_t temp_storage_bytes{}; cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, decomposer_t{}); // 2) Allocate temp storage thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // 3) Sort keys cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, decomposer_t{}); cudaDeviceSynchronize(); std::cout << "\n"; std::cout << "sort:\n"; std::cout << "\n"; std::cout << 
"\tcub::DeviceRadixSort::SortKeys(d_temp_storage,\n"; std::cout << "\t temp_storage_bytes,\n"; std::cout << "\t d_in,\n"; std::cout << "\t d_out,\n"; std::cout << "\t num_items,\n"; std::cout << "\t decomposer_t{});\n\n"; std::cout << "out:\n"; for (custom_t key : out) { std::cout << "\t{.i = " << key.i << ", .f = " << key.f << "},\n"; } std::cout << '\n'; std::cout << "If you have any issues with radix sort support of custom types, \n"; std::cout << "please feel free to use this example to identify the problem.\n\n"; } cccl-2.5.0/cub/examples/device/example_device_reduce.cu000066400000000000000000000130261463375617100231210ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceReduce::Sum(). * * Sums an array of int keys. * * To compile using the command line: * nvcc -arch=sm_XX example_device_reduce.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ void Initialize(int* h_in, int num_items) { for (int i = 0; i < num_items; ++i) { h_in[i] = i; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Compute solution */ void Solve(int* h_in, int& h_reference, int num_items) { for (int i = 0; i < num_items; ++i) { if (i == 0) { h_reference = h_in[0]; } else { h_reference += h_in[i]; } } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n", num_items, (int) sizeof(int)); fflush(stdout); // Allocate host arrays int* h_in = new int[num_items]; int h_reference{}; // Initialize problem and solution Initialize(h_in, num_items); Solve(h_in, h_reference, num_items); // Allocate problem device arrays int* d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array int* d_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_out, sizeof(int) * 1)); // Request and allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose); printf("\t%s", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) { delete[] h_in; } if (d_in) { CubDebugExit(g_allocator.DeviceFree(d_in)); } if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_scan.cu000066400000000000000000000132711463375617100226000ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
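// A condensed sketch of the two-phase temporary-storage idiom used by the reduction example
// above (and by the other device-wide examples in this directory): the first call with a null
// d_temp_storage only reports the required scratch size, and the second call does the work.
// Error handling and the allocator choice are simplified here purely for illustration.
#include <cub/cub.cuh>

#include <cuda_runtime.h>

cudaError_t sum_with_two_phase_idiom(const int* d_in, int* d_out, int num_items)
{
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;

  // Phase 1: query how much scratch memory the algorithm needs
  cudaError_t error = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  if (error != cudaSuccess)
  {
    return error;
  }

  // Phase 2: allocate the scratch memory and run the reduction for real
  error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (error != cudaSuccess)
  {
    return error;
  }
  error = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
  cudaFree(d_temp_storage);
  return error;
}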
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceScan::ExclusiveSum(). * * Computes an exclusive sum of int keys. * * To compile using the command line: * nvcc -arch=sm_XX example_device_scan.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem */ void Initialize(int* h_in, int num_items) { for (int i = 0; i < num_items; ++i) { h_in[i] = i; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve exclusive-scan problem */ int Solve(int* h_in, int* h_reference, int num_items) { int inclusive = 0; int aggregate = 0; for (int i = 0; i < num_items; ++i) { h_reference[i] = inclusive; inclusive += h_in[i]; aggregate += h_in[i]; } return aggregate; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); printf("cub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n", num_items, (int) sizeof(int)); fflush(stdout); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; // Initialize problem and solution Initialize(h_in, num_items); Solve(h_in, h_reference, num_items); // Allocate problem device arrays int* d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array int* d_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_out, sizeof(int) * num_items)); // Allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); printf("\t%s", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (d_in) { CubDebugExit(g_allocator.DeviceFree(d_in)); } if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_select_flagged.cu000066400000000000000000000172111463375617100246020ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceSelect::Flagged(). * * Selects flagged items from from a sequence of int keys using a * corresponding sequence of unsigned char flags. * * To compile using the command line: * nvcc -arch=sm_XX example_device_select_flagged.cu -I../.. 
-lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting flags at distances of random length * chosen from [1..max_segment] */ void Initialize(int* h_in, unsigned char* h_flags, int num_items, int max_segment) { unsigned short max_short = (unsigned short) -1; int key = 0; int i = 0; while (i < num_items) { // Select number of repeating occurrences unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_flags[j] = 0; h_in[j] = key; j++; } h_flags[i] = 1; i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("Flags:\n"); DisplayResults(h_flags, num_items); printf("\n\n"); } } /** * Solve unique problem */ int Solve(int* h_in, unsigned char* h_flags, int* h_reference, int num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if (h_flags[i]) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; unsigned char* h_flags = new unsigned char[num_items]; // Initialize problem and solution Initialize(h_in, h_flags, num_items, max_segment); int num_selected = Solve(h_in, h_flags, h_reference, num_items); printf("cub::DeviceSelect::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? 
num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int* d_in = NULL; unsigned char* d_flags = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_in, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_flags, sizeof(unsigned char) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int* d_out = NULL; int* d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_num_selected_out, sizeof(int))); // Allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit( DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit( DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_num_selected_out) { CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } if (d_in) { CubDebugExit(g_allocator.DeviceFree(d_in)); } if (d_flags) { CubDebugExit(g_allocator.DeviceFree(d_flags)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_select_if.cu000066400000000000000000000173301463375617100236110ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
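// A brief host sketch of the difference the checks above rely on: the Flagged selection is a
// compaction, so only the first num_selected outputs are compared against the reference,
// whereas the flagged-partition example earlier in this directory also validates the rejected
// tail. The helper below is illustrative only.
#include <cstddef>
#include <vector>

std::vector<int> compact_flagged_host(const std::vector<int>& in, const std::vector<unsigned char>& flags)
{
  std::vector<int> selected;
  for (std::size_t i = 0; i < in.size(); ++i)
  {
    if (flags[i])
    {
      selected.push_back(in[i]); // keep only flagged items, in their original order
    }
  }
  return selected; // selected.size() corresponds to *d_num_selected_out
}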
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceSelect::If(). * * Selects items from from a sequence of int keys using a * section functor (greater-than) * * To compile using the command line: * nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory /// Selection functor type struct GreaterThan { int compare; __host__ __device__ __forceinline__ GreaterThan(int compare) : compare(compare) {} __host__ __device__ __forceinline__ bool operator()(const int& a) const { return (a > compare); } }; //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting runs of random length chosen from [1..max_segment] */ void Initialize(int* h_in, int num_items, int max_segment) { int key = 0; int i = 0; while (i < num_items) { // Randomly select number of repeating occurrences uniformly from [1..max_segment] unsigned short max_short = (unsigned short) -1; unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_in[j] = key; j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve unique problem */ template int Solve(int* h_in, SelectOp select_op, int* h_reference, int num_items) { int num_selected = 0; for (int i = 0; i < num_items; ++i) { if (select_op(h_in[i])) { h_reference[num_selected] = h_in[i]; num_selected++; } else { h_reference[num_items - (i - num_selected) - 1] = h_in[i]; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device 
CubDebugExit(args.DeviceInit()); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; // Select a pivot index unsigned int pivot_index; unsigned int max_int = (unsigned int) -1; RandomBits(pivot_index); pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int)))); printf("Pivot idx: %d\n", pivot_index); fflush(stdout); // Initialize problem and solution Initialize(h_in, num_items, max_segment); GreaterThan select_op(h_in[pivot_index]); int num_selected = Solve(h_in, select_op, h_reference, num_items); printf("cub::DeviceSelect::If %d items, %d selected (avg run length %d), %d-byte elements\n", num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int)); fflush(stdout); // Allocate problem device arrays int* d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int* d_out = NULL; int* d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_num_selected_out, sizeof(int))); // Allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit( DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit( DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (d_in) { CubDebugExit(g_allocator.DeviceFree(d_in)); } if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_num_selected_out) { CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_select_unique.cu000066400000000000000000000160561463375617100245250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
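// A hedged host equivalent of the selection in the If example above: std::copy_if keeps the
// same items in the same order that the example's reference stores in its selected prefix, so
// it can serve as an independent cross-check of the selected output and count. The function
// name is illustrative only.
#include <algorithm>
#include <iterator>
#include <vector>

std::vector<int> select_if_host(const std::vector<int>& in, int pivot_value)
{
  std::vector<int> selected;
  std::copy_if(in.begin(), in.end(), std::back_inserter(selected), [pivot_value](int a) {
    return a > pivot_value; // same predicate as the GreaterThan functor in the example
  });
  return selected;
}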
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of DeviceSelect::Unique(). * * Selects the first element from each run of identical values from a sequence * of int keys. * * To compile using the command line: * nvcc -arch=sm_XX example_device_select_unique.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Initialize problem, setting runs of random length chosen from [1..max_segment] */ void Initialize(int* h_in, int num_items, int max_segment) { int key = 0; int i = 0; while (i < num_items) { // Randomly select number of repeating occurrences uniformly from [1..max_segment] unsigned short max_short = (unsigned short) -1; unsigned short repeat; RandomBits(repeat); repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short)))); repeat = CUB_MAX(1, repeat); int j = i; while (j < CUB_MIN(i + repeat, num_items)) { h_in[j] = key; j++; } i = j; key++; } if (g_verbose) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /** * Solve unique problem */ int Solve(int* h_in, int* h_reference, int num_items) { int num_selected = 0; if (num_items > 0) { h_reference[num_selected] = h_in[0]; num_selected++; } for (int i = 1; i < num_items; ++i) { if (h_in[i] != h_in[i - 1]) { h_reference[num_selected] = h_in[i]; num_selected++; } } return num_selected; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { int num_items = 150; int max_segment = 40; // Maximum segment length // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxseg", max_segment); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--device=] " "[--maxseg=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device 
CubDebugExit(args.DeviceInit()); // Allocate host arrays int* h_in = new int[num_items]; int* h_reference = new int[num_items]; // Initialize problem and solution Initialize(h_in, num_items, max_segment); int num_selected = Solve(h_in, h_reference, num_items); printf("cub::DeviceSelect::Unique %d items (%d-byte elements), %d selected (avg run length %d)\n", num_items, (int) sizeof(int), num_selected, num_items / num_selected); fflush(stdout); // Allocate problem device arrays int* d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_in, sizeof(int) * num_items)); // Initialize device input CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Allocate device output array and num selected int* d_out = NULL; int* d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_num_selected_out, sizeof(int))); // Allocate temporary storage void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose); printf("\t Data %s ", compare ? "FAIL" : "PASS"); compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); printf("\t Count %s ", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_in) { delete[] h_in; } if (h_reference) { delete[] h_reference; } if (d_in) { CubDebugExit(g_allocator.DeviceFree(d_in)); } if (d_out) { CubDebugExit(g_allocator.DeviceFree(d_out)); } if (d_num_selected_out) { CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } printf("\n\n"); return 0; } cccl-2.5.0/cub/examples/device/example_device_sort_find_non_trivial_runs.cu000066400000000000000000000304141463375617100273140ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple example of sorting a sequence of keys and values (each pair is a * randomly-selected int32 paired with its original offset in the unsorted sequence), and then * isolating all maximal, non-trivial (having length > 1) "runs" of duplicates. * * To compile using the command line: * nvcc -arch=sm_XX example_device_sort_find_non_trivial_runs.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include "../../test/test_util.h" #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- /** * Simple key-value pairing for using std::sort on key-value pairs. */ template struct Pair { Key key; Value value; bool operator<(const Pair& b) const { return (key < b.key); } }; /** * Pair ostream operator */ template std::ostream& operator<<(std::ostream& os, const Pair& val) { os << '<' << val.key << ',' << val.value << '>'; return os; } /** * Initialize problem */ template void Initialize(Key* h_keys, Value* h_values, int num_items, int max_key) { float scale = float(max_key) / float(UINT_MAX); for (int i = 0; i < num_items; ++i) { Key sample; RandomBits(sample); h_keys[i] = (max_key == -1) ? i : (Key) (scale * sample); h_values[i] = i; } if (g_verbose) { printf("Keys:\n"); DisplayResults(h_keys, num_items); printf("\n\n"); printf("Values:\n"); DisplayResults(h_values, num_items); printf("\n\n"); } } /** * Solve sorted non-trivial subrange problem. Returns the number * of non-trivial runs found. 
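 *
 * This host reference implementation stable-sorts the key/value pairs with
 * std::stable_sort, then scans the sorted keys and records the offset and
 * length of every maximal run longer than one element.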
*/ template int Solve(Key* h_keys, Value* h_values, int num_items, int* h_offsets_reference, int* h_lengths_reference) { // Sort Pair* h_pairs = new Pair[num_items]; for (int i = 0; i < num_items; ++i) { h_pairs[i].key = h_keys[i]; h_pairs[i].value = h_values[i]; } std::stable_sort(h_pairs, h_pairs + num_items); if (g_verbose) { printf("Sorted pairs:\n"); DisplayResults(h_pairs, num_items); printf("\n\n"); } // Find non-trivial runs Key previous = h_pairs[0].key; int length = 1; int num_runs = 0; int run_begin = 0; for (int i = 1; i < num_items; ++i) { if (previous != h_pairs[i].key) { if (length > 1) { h_offsets_reference[num_runs] = run_begin; h_lengths_reference[num_runs] = length; num_runs++; } length = 1; run_begin = i; } else { length++; } previous = h_pairs[i].key; } if (length > 1) { h_offsets_reference[num_runs] = run_begin; h_lengths_reference[num_runs] = length; num_runs++; } delete[] h_pairs; return num_runs; } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { typedef unsigned int Key; typedef int Value; int timing_iterations = 0; int num_items = 40; Key max_key = 20; // Max item // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("n", num_items); args.GetCmdLineArgument("maxkey", max_key); args.GetCmdLineArgument("i", timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i= " "[--n= " "[--maxkey=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate host arrays (problem and reference solution) Key* h_keys = new Key[num_items]; Value* h_values = new Value[num_items]; int* h_offsets_reference = new int[num_items]; int* h_lengths_reference = new int[num_items]; // Initialize key-value pairs and compute reference solution (sort them, and identify non-trivial runs) printf("Computing reference solution on CPU for %d items (max key %d)\n", num_items, max_key); fflush(stdout); Initialize(h_keys, h_values, num_items, max_key); int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference); printf("%d non-trivial runs\n", num_runs); fflush(stdout); // Repeat for performance timing GpuTimer gpu_timer; GpuTimer gpu_rle_timer; float elapsed_millis = 0.0; float elapsed_rle_millis = 0.0; for (int i = 0; i <= timing_iterations; ++i) { // Allocate and initialize device arrays for sorting DoubleBuffer d_keys; DoubleBuffer d_values; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_keys.d_buffers[0], sizeof(Key) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_keys.d_buffers[1], sizeof(Key) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_values.d_buffers[0], sizeof(Value) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_values.d_buffers[1], sizeof(Value) * num_items)); CubDebugExit( cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice)); CubDebugExit( cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice)); // Start timer gpu_timer.Start(); // Allocate temporary storage for sorting size_t temp_storage_bytes = 0; void* d_temp_storage = NULL; CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); 
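// Note: as with other cub::Device* algorithms, the call above is the size-query phase of
// CUB's two-phase temporary-storage idiom: when d_temp_storage is NULL, no sorting is
// performed and only the required allocation size is written to temp_storage_bytes.
// The storage is allocated below and the same call is then repeated to do the actual work.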
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Do the sort CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items)); // Free unused buffers and sorting temporary storage if (d_keys.d_buffers[d_keys.selector ^ 1]) { CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1])); } if (d_values.d_buffers[d_values.selector ^ 1]) { CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1])); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } // Start timer gpu_rle_timer.Start(); // Allocate device arrays for enumerating non-trivial runs int* d_offests_out = NULL; int* d_lengths_out = NULL; int* d_num_runs = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**) &d_offests_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_lengths_out, sizeof(int) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**) &d_num_runs, sizeof(int) * 1)); // Allocate temporary storage for isolating non-trivial runs d_temp_storage = NULL; CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns( d_temp_storage, temp_storage_bytes, d_keys.d_buffers[d_keys.selector], d_offests_out, d_lengths_out, d_num_runs, num_items)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Do the isolation CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns( d_temp_storage, temp_storage_bytes, d_keys.d_buffers[d_keys.selector], d_offests_out, d_lengths_out, d_num_runs, num_items)); // Free keys buffer if (d_keys.d_buffers[d_keys.selector]) { CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector])); } // // Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys // // Stop sort timer gpu_timer.Stop(); gpu_rle_timer.Stop(); if (i == 0) { // First iteration is a warmup: // Check for correctness (and display results, if specified) printf("\nRUN OFFSETS: \n"); int compare = CompareDeviceResults(h_offsets_reference, d_offests_out, num_runs, true, g_verbose); printf("\t\t %s ", compare ? "FAIL" : "PASS"); printf("\nRUN LENGTHS: \n"); compare |= CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose); printf("\t\t %s ", compare ? "FAIL" : "PASS"); printf("\nNUM RUNS: \n"); compare |= CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose); printf("\t\t %s ", compare ? 
"FAIL" : "PASS"); AssertEquals(0, compare); } else { elapsed_millis += gpu_timer.ElapsedMillis(); elapsed_rle_millis += gpu_rle_timer.ElapsedMillis(); } // GPU cleanup if (d_values.d_buffers[d_values.selector]) { CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector])); } if (d_offests_out) { CubDebugExit(g_allocator.DeviceFree(d_offests_out)); } if (d_lengths_out) { CubDebugExit(g_allocator.DeviceFree(d_lengths_out)); } if (d_num_runs) { CubDebugExit(g_allocator.DeviceFree(d_num_runs)); } if (d_temp_storage) { CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); } } // Host cleanup if (h_keys) { delete[] h_keys; } if (h_values) { delete[] h_values; } if (h_offsets_reference) { delete[] h_offsets_reference; } if (h_lengths_reference) { delete[] h_lengths_reference; } printf("\n\n"); if (timing_iterations > 0) { printf("%d timing iterations, average time to sort and isolate non-trivial duplicates: %.3f ms (%.3f ms spent in " "RLE isolation)\n", timing_iterations, elapsed_millis / timing_iterations, elapsed_rle_millis / timing_iterations); } return 0; } cccl-2.5.0/cub/test/000077500000000000000000000000001463375617100141675ustar00rootroot00000000000000cccl-2.5.0/cub/test/.gitignore000066400000000000000000000000341463375617100161540ustar00rootroot00000000000000/bin /link_main.obj /dummy/ cccl-2.5.0/cub/test/CMakeLists.txt000066400000000000000000000376351463375617100167450ustar00rootroot00000000000000if(CMAKE_GENERATOR MATCHES "^Visual Studio") if(CUB_ENABLE_RDC_TESTS) if("${CMAKE_VERSION}" VERSION_LESS 3.27.5) # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/8794 message(WARNING "CMake 3.27.5 or newer is required to enable RDC tests in Visual Studio.") cmake_minimum_required(VERSION 3.27.5) endif() endif() endif() if ("NVHPC" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") # NVBugs 200770766 set(CUB_SEPARATE_CATCH2 ON) else() option(CUB_SEPARATE_CATCH2 "Build each catch2 test as a separate executable." OFF ) endif() include("${CUB_SOURCE_DIR}/cmake/CPM.cmake") CPMAddPackage("gh:catchorg/Catch2@2.13.9") option(METAL_BUILD_DOC OFF) option(METAL_BUILD_EXAMPLES OFF) option(METAL_BUILD_TESTS OFF) CPMAddPackage("gh:brunocodutra/metal@2.1.4") CPMAddPackage( NAME NVTX GITHUB_REPOSITORY NVIDIA/NVTX GIT_TAG release-v3 DOWNLOAD_ONLY SYSTEM ) include("${NVTX_SOURCE_DIR}/c/nvtxImportedTargets.cmake") find_package(CUDAToolkit) set(curand_default OFF) if (CUDA_curand_LIBRARY) set(curand_default ON) endif() option(CUB_C2H_ENABLE_CURAND "Use CUDA CURAND library" ${curand_default}) # The function below reads the filepath `src`, extracts the %PARAM% comments, # and fills `labels_var` with a list of `label1_value1.label2_value2...` # strings, and puts the corresponding `DEFINITION=value1:DEFINITION=value2` # entries into `defs_var`. # # See the README.md file in this directory for background info. 
function(cub_get_test_params src labels_var defs_var) file(READ "${src}" file_data) set(param_regex "//[ ]+%PARAM%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)") string(REGEX MATCHALL "${param_regex}" matches "${file_data}" ) set(variant_labels) set(variant_defs) foreach(match IN LISTS matches) string(REGEX MATCH "${param_regex}" unused "${match}" ) set(def ${CMAKE_MATCH_1}) set(label ${CMAKE_MATCH_2}) set(values "${CMAKE_MATCH_3}") string(REPLACE ":" ";" values "${values}") # Build lists of test name suffixes (labels) and preprocessor definitions # (defs) containing the cartesian product of all param values: if (NOT variant_labels) foreach(value IN LISTS values) list(APPEND variant_labels ${label}_${value}) endforeach() else() set(tmp_labels) foreach(old_label IN LISTS variant_labels) foreach(value IN LISTS values) list(APPEND tmp_labels ${old_label}.${label}_${value}) endforeach() endforeach() set(variant_labels "${tmp_labels}") endif() if (NOT variant_defs) foreach(value IN LISTS values) list(APPEND variant_defs ${def}=${value}) endforeach() else() set(tmp_defs) foreach(old_def IN LISTS variant_defs) foreach(value IN LISTS values) list(APPEND tmp_defs ${old_def}:${def}=${value}) endforeach() endforeach() set(variant_defs "${tmp_defs}") endif() endforeach() set(${labels_var} "${variant_labels}" PARENT_SCOPE) set(${defs_var} "${variant_defs}" PARENT_SCOPE) endfunction() # Create meta targets that build all tests for a single configuration: foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) set(config_meta_target ${config_prefix}.tests) add_custom_target(${config_meta_target}) add_dependencies(${config_prefix}.all ${config_meta_target}) endforeach() file(GLOB test_srcs RELATIVE "${CUB_SOURCE_DIR}/test" CONFIGURE_DEPENDS test_*.cu catch2_test_*.cu ) ## _cub_is_catch2_test # # If the test_src contains the substring "catch2_test_", `result_var` will # be set to TRUE. function(_cub_is_catch2_test result_var test_src) string(FIND "${test_src}" "catch2_test_" idx) if (idx EQUAL -1) set(${result_var} FALSE PARENT_SCOPE) else() set(${result_var} TRUE PARENT_SCOPE) endif() endfunction() ## _cub_is_fail_test # # If the test_src contains the substring "_fail", `result_var` will # be set to TRUE. function(_cub_is_fail_test result_var test_src) string(FIND "${test_src}" "_fail" idx) if (idx EQUAL -1) set(${result_var} FALSE PARENT_SCOPE) else() set(${result_var} TRUE PARENT_SCOPE) endif() endfunction() ## _cub_launcher_requires_rdc # # If given launcher id corresponds to a CDP launcher, set `out_var` to 1. function(_cub_launcher_requires_rdc out_var launcher_id) if ("${launcher_id}" STREQUAL "1") set(${out_var} 1 PARENT_SCOPE) else() set(${out_var} 0 PARENT_SCOPE) endif() endfunction() ## cub_add_test # # Add a test executable and register it with ctest. # # target_name_var: Variable name to overwrite with the name of the test # target. Useful for post-processing target information. # test_name: The name of the test minus ".test." For example, # testing/vector.cu will be "vector", and testing/cuda/copy.cu will be # "cuda.copy". # test_src: The source file that implements the test. # cub_target: The reference cub target with configuration information. 
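# launcher_id: The launcher variant id used to label and configure the test
#   (per README.md: 0 = host launch, 1 = device/CDP launch built with RDC,
#   2 = graph capture launch).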
# function(cub_add_test target_name_var test_name test_src cub_target launcher_id) cub_get_target_property(config_prefix ${cub_target} PREFIX) _cub_is_catch2_test(is_catch2_test "${test_src}") _cub_launcher_requires_rdc(cdp_val "${launcher_id}") # The actual name of the test's target: set(test_target ${config_prefix}.test.${test_name}) set(${target_name_var} ${test_target} PARENT_SCOPE) set(config_meta_target ${config_prefix}.tests) if (is_catch2_test) # Per config helper library: set(config_c2h_target ${config_prefix}.test.catch2_helper.lid_${launcher_id}) if (NOT TARGET ${config_c2h_target}) add_library(${config_c2h_target} STATIC c2h/generators.cu) target_include_directories(${config_c2h_target} PUBLIC "${CUB_SOURCE_DIR}/test") cub_clone_target_properties(${config_c2h_target} ${cub_target}) cub_configure_cuda_target(${config_c2h_target} RDC ${cdp_val}) target_link_libraries(${config_c2h_target} PRIVATE ${cub_target} PUBLIC CUDA::nvrtc CUDA::cuda_driver) if (CUB_C2H_ENABLE_CURAND) target_link_libraries(${config_c2h_target} PRIVATE CUDA::curand) target_compile_definitions(${config_c2h_target} PRIVATE C2H_HAS_CURAND=1) else() target_compile_definitions(${config_c2h_target} PRIVATE C2H_HAS_CURAND=0) endif() if (CUB_IN_THRUST) thrust_fix_clang_nvcc_build_for(${config_c2h_target}) endif() endif() # config_c2h_target if (CUB_SEPARATE_CATCH2) set(catch2_main_objects ${config_prefix}.catch2_main_objects) if (NOT TARGET ${catch2_main_objects}) add_library(${catch2_main_objects} OBJECT catch2_runner.cpp catch2_runner_helper.cu) target_link_libraries(${catch2_main_objects} PUBLIC Catch2::Catch2) endif() add_executable(${test_target} "${test_src}") target_link_libraries(${test_target} PRIVATE ${catch2_main_objects}) add_dependencies(${config_meta_target} ${test_target}) add_test(NAME ${test_target} COMMAND "$") else() # Not CUB_SEPARATE_CATCH2 # Per config catch2 runner set(config_c2run_target ${config_prefix}.catch2_test.lid_${launcher_id}) if (NOT TARGET ${config_c2run_target}) add_executable(${config_c2run_target} catch2_runner.cpp catch2_runner_helper.cu) target_link_libraries(${config_c2run_target} PRIVATE ${cub_target} ${config_c2h_target} Metal Catch2::Catch2) cub_clone_target_properties(${config_c2run_target} ${cub_target}) cub_configure_cuda_target(${config_c2run_target} RDC ${cdp_val}) add_dependencies(${config_meta_target} ${config_c2run_target}) target_include_directories(${config_c2run_target} PRIVATE "${CUB_SOURCE_DIR}/test" ) if ("NVHPC" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") target_link_options(${config_c2run_target} PRIVATE "-cuda") endif() if (CUB_IN_THRUST) thrust_fix_clang_nvcc_build_for(${config_c2run_target}) endif() add_test(NAME ${config_c2run_target} COMMAND "$" ) endif() # per config catch2 runner add_library(${test_target} OBJECT "${test_src}") if(CMAKE_GENERATOR MATCHES "^Visual Studio") target_link_libraries(${config_c2run_target} PRIVATE $) else() target_link_libraries(${config_c2run_target} PRIVATE ${test_target}) endif() endif() # CUB_SEPARATE_CATCH2 if ("${test_target}" MATCHES "nvrtc") target_compile_definitions(${test_target} PRIVATE NVRTC_CUB_PATH="-I${CMAKE_SOURCE_DIR}/cub") target_compile_definitions(${test_target} PRIVATE NVRTC_THRUST_PATH="-I${CMAKE_SOURCE_DIR}/thrust") target_compile_definitions(${test_target} PRIVATE NVRTC_LIBCUDACXX_PATH="-I${CMAKE_SOURCE_DIR}/libcudacxx/include") target_compile_definitions(${test_target} PRIVATE NVRTC_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}") endif() if (CUB_IN_THRUST) thrust_fix_clang_nvcc_build_for(${test_target}) endif() 
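# Configuration shared by both Catch2 modes: link CUB, the per-config c2h helper,
# Metal and Catch2 into the test target, clone the CUB target's properties, and
# expose the test/ directory on the include path.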
target_link_libraries(${test_target} PRIVATE ${cub_target} ${config_c2h_target} Metal Catch2::Catch2 ) cub_clone_target_properties(${test_target} ${cub_target}) target_include_directories(${test_target} PUBLIC "${CUB_SOURCE_DIR}/test" ) else() # Not catch2: # Related target names: set(test_meta_target cub.all.test.${test_name}) add_executable(${test_target} "${test_src}") target_link_libraries(${test_target} ${cub_target}) cub_clone_target_properties(${test_target} ${cub_target}) target_include_directories(${test_target} PRIVATE "${CUB_SOURCE_DIR}/test") target_compile_definitions(${test_target} PRIVATE CUB_DETAIL_DEBUG_ENABLE_SYNC) if ("${test_target}" MATCHES "nvtx_in_usercode") target_link_libraries(${test_target} nvtx3-cpp) endif() if (CUB_IN_THRUST) thrust_fix_clang_nvcc_build_for(${test_target}) endif() _cub_is_fail_test(is_fail_test "${test_src}") if (is_fail_test) set_target_properties(${test_target} PROPERTIES EXCLUDE_FROM_ALL true EXCLUDE_FROM_DEFAULT_BUILD true) add_test(NAME ${test_target} COMMAND ${CMAKE_COMMAND} --build "${CMAKE_BINARY_DIR}" --target ${test_target} --config $) string(REGEX MATCH "err_([0-9]+)" MATCH_RESULT "${test_name}") file(READ ${test_src} test_content) if(MATCH_RESULT) string(REGEX MATCH "// expected-error-${CMAKE_MATCH_1}+ {{\"([^\"]+)\"}}" expected_errors_matches ${test_content}) if (expected_errors_matches) set_tests_properties(${test_target} PROPERTIES PASS_REGULAR_EXPRESSION "${CMAKE_MATCH_1}") else() set_tests_properties(${test_target} PROPERTIES WILL_FAIL true) endif() else() string(REGEX MATCH "// expected-error {{\"([^\"]+)\"}}" expected_errors_matches ${test_content}) if (expected_errors_matches) set_tests_properties(${test_target} PROPERTIES PASS_REGULAR_EXPRESSION "${CMAKE_MATCH_1}") else() set_tests_properties(${test_target} PROPERTIES WILL_FAIL true) endif() endif() else() # Add to the active configuration's meta target add_dependencies(${config_meta_target} ${test_target}) # Meta target that builds tests with this name for all configurations: if (NOT TARGET ${test_meta_target}) add_custom_target(${test_meta_target}) endif() add_dependencies(${test_meta_target} ${test_target}) add_test(NAME ${test_target} COMMAND "$") endif() endif() # Not catch2 test endfunction() # Sets out_var to launch id if the label contains launch variants function(_cub_has_lid_variant out_var label) string(FIND "${label}" "lid_" idx) if (idx EQUAL -1) set(${out_var} 0 PARENT_SCOPE) else() set(${out_var} 1 PARENT_SCOPE) endif() endfunction() # Sets out_var to 1 if the label contains "lid_1", e.g. launch id corresponds # to device-side (CDP) launch. 
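# (More precisely, the helper below extracts the numeric launcher id N from a label
# containing `lid_N`, e.g. a label such as `foo_1.lid_2` yields 2, and falls back to
# 0 when no `lid_` variant is present; per README.md, 0 = host launch, 1 = device/CDP
# launch, 2 = graph capture launch.)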
function(_cub_launcher_id out_var label) string(REGEX MATCH "lid_([0-9]+)" MATCH_RESULT "${label}") if(MATCH_RESULT) set(${out_var} ${CMAKE_MATCH_1} PARENT_SCOPE) else() set(${out_var} 0 PARENT_SCOPE) endif() endfunction() foreach (test_src IN LISTS test_srcs) get_filename_component(test_name "${test_src}" NAME_WE) string(REGEX REPLACE "^catch2_test_" "" test_name "${test_name}") string(REGEX REPLACE "^test_" "" test_name "${test_name}") cub_get_test_params("${test_src}" variant_labels variant_defs) list(LENGTH variant_labels num_variants) # Subtract 1 to support the inclusive endpoint of foreach(...RANGE...): math(EXPR range_end "${num_variants} - 1") # Verbose output: if (num_variants GREATER 0) message(VERBOSE "Detected ${num_variants} variants of test '${test_src}':") foreach(var_idx RANGE ${range_end}) math(EXPR i "${var_idx} + 1") list(GET variant_labels ${var_idx} label) list(GET variant_defs ${var_idx} defs) message(VERBOSE " ${i}: ${test_name} ${label} ${defs}") endforeach() endif() foreach(cub_target IN LISTS CUB_TARGETS) cub_get_target_property(config_prefix ${cub_target} PREFIX) if (num_variants EQUAL 0) if (${CUB_FORCE_RDC}) set(launcher 1) else() set(launcher 0) endif() # Only one version of this test. cub_add_test(test_target ${test_name} "${test_src}" ${cub_target} ${launcher}) cub_configure_cuda_target(${test_target} RDC ${CUB_FORCE_RDC}) else() # has variants: # Meta target to build all parametrizations of the current test for the # current CUB_TARGET config set(variant_meta_target ${config_prefix}.test.${test_name}.all) if (NOT TARGET ${variant_meta_target}) add_custom_target(${variant_meta_target}) endif() # Meta target to build all parametrizations of the current test for all # CUB_TARGET configs set(cub_variant_meta_target cub.all.test.${test_name}.all) if (NOT TARGET ${cub_variant_meta_target}) add_custom_target(${cub_variant_meta_target}) endif() # Generate multiple tests, one per variant. # See `cub_get_test_params` for details. foreach(var_idx RANGE ${range_end}) list(GET variant_labels ${var_idx} label) list(GET variant_defs ${var_idx} defs) string(REPLACE ":" ";" defs "${defs}") # A unique index per variant: list(APPEND defs VAR_IDX=${var_idx}) # Check if the test explicitly specifies launcher id: _cub_has_lid_variant(explicit_launcher "${label}") _cub_launcher_id(explicit_launcher_id "${label}") if (${explicit_launcher}) set(launcher_id "${explicit_launcher_id}") else() if (${CUB_FORCE_RDC}) set(launcher_id 1) else() set(launcher_id 0) endif() endif() _cub_launcher_requires_rdc(cdp_val "${launcher_id}") if (cdp_val AND NOT CUB_ENABLE_RDC_TESTS) continue() endif() cub_add_test(test_target ${test_name}.${label} "${test_src}" ${cub_target} ${launcher_id}) # Enable RDC if the test either: # 1. Explicitly requests it (lid_1 label) # 2. Does not have an explicit CDP variant (no lid_0, lid_1, or lid_2) but # RDC testing is forced # # Tests that explicitly request no cdp (lid_0 label) should never enable # RDC. 
cub_configure_cuda_target(${test_target} RDC ${cdp_val}) add_dependencies(${variant_meta_target} ${test_target}) add_dependencies(${cub_variant_meta_target} ${test_target}) target_compile_definitions(${test_target} PRIVATE ${defs}) endforeach() # Variant endif() # Has variants endforeach() # CUB targets endforeach() # Source file add_subdirectory(cmake) cccl-2.5.0/cub/test/README.md000066400000000000000000000111751463375617100154530ustar00rootroot00000000000000# Test Parametrization Some of CUB's tests are very slow to build and are capable of exhausting RAM during compilation/linking. To avoid such issues, large tests are split into multiple executables to take advantage of parallel computation and reduce memory usage. CUB facilitates this by checking for special `%PARAM%` comments in each test's source code, and then uses this information to generate multiple executables with different configurations. ## Using `%PARAM%` The `%PARAM%` hint provides an automated method of generating multiple test executables from a single source file. To use it, add one or more special comments to the test source file: ```cpp // %PARAM% [definition] [label] [values] ``` CMake will parse the source file and extract these comments, using them to generate multiple test executables for the full cartesian product of values. - `definition` will be used as a preprocessor definition name. By convention, these begin with `TEST_`. - `label` is a short, human-readable label that will be used in the test executable's name to identify the test variant. - `values` is a colon-separated list of values used during test generation. Only numeric values have been tested. ## Special Labels ### Testing Different Launchers If a `label` is `lid`, it is assumed that the parameter is used to explicitly test variants built with different launchers. The `values` for such a parameter must be `0:1:2`, with `0` indicating host launch and CDP disabled (RDC off), `1` indicating device launch and CDP enabled (RDC on), `2` indicating graph capture launch and CDP disabled (RDC off). Tests that do not contain a variant labeled `lid` will only enable RDC if the CMake config enables them. ## Example For example, if `test_baz.cu` contains the following lines: ```cpp // %PARAM% TEST_FOO foo 0:1:2 // %PARAM% TEST_LAUNCH lid 0:1 ``` Six executables and CTest targets will be generated with unique definitions (only c++17 targets shown): | Executable Name | Preprocessor Definitions | Launcher | |----------------------------------|--------------------------------|-----------| | `cub.cpp17.test.baz.foo_0.lid_0` | `-DTEST_FOO=0 -DTEST_LAUNCH=0` | Host | | `cub.cpp17.test.baz.foo_0.lid_1` | `-DTEST_FOO=0 -DTEST_LAUNCH=1` | Device | | `cub.cpp17.test.baz.foo_1.lid_0` | `-DTEST_FOO=1 -DTEST_LAUNCH=0` | Host | | `cub.cpp17.test.baz.foo_1.lid_1` | `-DTEST_FOO=1 -DTEST_LAUNCH=1` | Device | | `cub.cpp17.test.baz.foo_2.lid_0` | `-DTEST_FOO=2 -DTEST_LAUNCH=0` | Host | | `cub.cpp17.test.baz.foo_2.lid_1` | `-DTEST_FOO=2 -DTEST_LAUNCH=1` | Device | ## Changing `%PARAM%` Hints Since CMake does not automatically reconfigure the build when source files are modified, CMake will need to be rerun manually whenever the `%PARAM%` comments change. ## Building and Running Split Tests CMake will generate individual build and test targets for each test variant, and also provides build "metatargets" that compile all variants of a given test. The variants follow the usual naming convention for CUB's tests, but include a suffix that differentiates them (e.g. 
`.foo_X.bar_Y` in the example above). ### Individual Test Variants Continuing with the `test_baz.cu` example, the test variant that uses `-DTEST_FOO=1 -DTEST_BAR=4` can be built and run alone: ```bash # Build a single variant: make cub.cpp17.test.baz.foo_1.bar_4 # Run a single variant bin/cub.cpp17.test.baz.foo_1.bar_4 # Run a single variant using CTest regex: ctest -R cub\.cpp17\.test\.baz\.foo_1\.bar_4 ``` ### All Variants of a Test Using a metatarget and the proper regex, all variants of a test can be built and executed without listing all variants explicitly: ```bash # Build all variants using the `.all` metatarget make cub.cpp17.test.baz.all # Run all variants: ctest -R cub\.cpp17\.test\.baz\. ``` ## Debugging Running CMake with `--log-level=VERBOSE` will print out extra information about all detected test variants. ## Additional Info Ideally, only parameters that directly influence kernel template instantiations should be split out in this way. If changing a parameter doesn't change the kernel template type, the same kernel will be compiled into multiple executables. This defeats the purpose of splitting up the test since the compiler will generate redundant code across the new split executables. The best candidate parameters for splitting are input value types, rather than integral parameters like BLOCK_THREADS, etc. Splitting by value type allows more infrastructure (data generation, validation) to be reused. Splitting other parameters can cause build times to increase since type-related infrastructure has to be rebuilt for each test variant. cccl-2.5.0/cub/test/bfloat16.h000066400000000000000000000166741463375617100157740ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once /** * \file * Utilities for interacting with the opaque CUDA __nv_bfloat16 type */ #include #include #include #include #include #ifdef __GNUC__ // There's a ton of type-punning going on in this file. # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif /****************************************************************************** * bfloat16_t ******************************************************************************/ /** * Host-based fp16 data type compatible and convertible with __nv_bfloat16 */ struct bfloat16_t { uint16_t __x; /// Constructor from __nv_bfloat16 __host__ __device__ __forceinline__ explicit bfloat16_t(const __nv_bfloat16& other) { __x = reinterpret_cast(other); } /// Constructor from integer __host__ __device__ __forceinline__ explicit bfloat16_t(int a) { *this = bfloat16_t(float(a)); } /// Constructor from std::size_t __host__ __device__ __forceinline__ explicit bfloat16_t(std::size_t a) { *this = bfloat16_t(float(a)); } /// Constructor from double __host__ __device__ __forceinline__ explicit bfloat16_t(double a) { *this = bfloat16_t(float(a)); } /// Constructor from unsigned long long int template ::value && (!::cuda::std::is_same::value)>::type> __host__ __device__ __forceinline__ explicit bfloat16_t(T a) { *this = bfloat16_t(float(a)); } /// Default constructor bfloat16_t() = default; /// Constructor from float __host__ __device__ __forceinline__ explicit bfloat16_t(float a) { // Refrence: // https://github.com/pytorch/pytorch/blob/44cc873fba5e5ffc4d4d4eef3bd370b653ce1ce1/c10/util/BFloat16.h#L51 uint16_t ir; if (a != a) { ir = UINT16_C(0x7FFF); } else { union { uint32_t U32; float F32; }; F32 = a; uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); ir = static_cast((U32 + rounding_bias) >> 16); } this->__x = ir; } /// Cast to __nv_bfloat16 __host__ __device__ __forceinline__ operator __nv_bfloat16() const { return reinterpret_cast(__x); } /// Cast to float __host__ __device__ __forceinline__ operator float() const { float f = 0; uint32_t* p = reinterpret_cast(&f); *p = uint32_t(__x) << 16; return f; } /// Get raw storage __host__ __device__ __forceinline__ uint16_t raw() const { return this->__x; } /// Equality __host__ __device__ __forceinline__ friend bool operator==(const bfloat16_t& a, const bfloat16_t& b) { return (a.__x == b.__x); } /// Inequality __host__ __device__ __forceinline__ friend bool operator!=(const bfloat16_t& a, const bfloat16_t& b) { return (a.__x != b.__x); } /// Assignment by sum __host__ __device__ __forceinline__ bfloat16_t& operator+=(const bfloat16_t& rhs) { *this = bfloat16_t(float(*this) + float(rhs)); return *this; } /// Multiply __host__ __device__ __forceinline__ bfloat16_t operator*(const bfloat16_t& other) { return bfloat16_t(float(*this) * float(other)); } /// Add __host__ __device__ __forceinline__ bfloat16_t operator+(const bfloat16_t& other) { return bfloat16_t(float(*this) + float(other)); } /// Less-than __host__ __device__ __forceinline__ bool operator<(const bfloat16_t& other) const { return float(*this) < float(other); } /// Less-than-equal __host__ __device__ __forceinline__ bool operator<=(const bfloat16_t& other) const { return float(*this) <= float(other); } /// Greater-than __host__ __device__ __forceinline__ bool operator>(const bfloat16_t& other) const { return float(*this) > float(other); } /// Greater-than-equal __host__ __device__ __forceinline__ bool operator>=(const 
bfloat16_t& other) const { return float(*this) >= float(other); } /// numeric_traits::max __host__ __device__ __forceinline__ static bfloat16_t(max)() { uint16_t max_word = 0x7F7F; return reinterpret_cast(max_word); } /// numeric_traits::lowest __host__ __device__ __forceinline__ static bfloat16_t lowest() { uint16_t lowest_word = 0xFF7F; return reinterpret_cast(lowest_word); } }; /****************************************************************************** * I/O stream overloads ******************************************************************************/ /// Insert formatted \p bfloat16_t into the output stream inline std::ostream& operator<<(std::ostream& out, const bfloat16_t& x) { out << (float) x; return out; } /// Insert formatted \p __nv_bfloat16 into the output stream inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x) { return out << bfloat16_t(x); } /****************************************************************************** * Traits overloads ******************************************************************************/ template <> struct CUB_NS_QUALIFIER::FpLimits { static __host__ __device__ __forceinline__ bfloat16_t Max() { return bfloat16_t::max(); } static __host__ __device__ __forceinline__ bfloat16_t Lowest() { return bfloat16_t::lowest(); } }; template <> struct CUB_NS_QUALIFIER::NumericTraits : CUB_NS_QUALIFIER::BaseTraits {}; #ifdef __GNUC__ # pragma GCC diagnostic pop #endif cccl-2.5.0/cub/test/c2h/000077500000000000000000000000001463375617100146435ustar00rootroot00000000000000cccl-2.5.0/cub/test/c2h/checked_allocator.cuh000066400000000000000000000130301463375617100207670ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ #pragma once #include #include #include #include #include #include #include // #define DEBUG_CHECKED_ALLOC_FAILURE #ifdef DEBUG_CHECKED_ALLOC_FAILURE # include #endif namespace c2h { namespace detail { inline cudaError_t check_free_device_memory(std::size_t bytes) { std::size_t free_bytes{}; std::size_t total_bytes{}; cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes); if (status != cudaSuccess) { return status; } // Avoid allocating all available memory: constexpr std::size_t padding = 16 * 1024 * 1024; // 16 MiB if (free_bytes < (bytes + padding)) { #ifdef DEBUG_CHECKED_ALLOC_FAILURE const double total_GiB = static_cast(total_bytes) / (1024 * 1024 * 1024); const double free_GiB = static_cast(free_bytes) / (1024 * 1024 * 1024); const double requested_GiB = static_cast(bytes) / (1024 * 1024 * 1024); const double padded_GiB = static_cast(bytes + padding) / (1024 * 1024 * 1024); std::cerr << "Total device mem: " << total_GiB << " GiB\n" // << "Free device mem: " << free_GiB << " GiB\n" // << "Requested device mem: " << requested_GiB << " GiB\n" // << "Padded device mem: " << padded_GiB << " GiB\n"; #endif return cudaErrorMemoryAllocation; } return cudaSuccess; } // Check available memory prior to calling cudaMalloc. // This avoids hangups and slowdowns from allocating swap / non-device memory // on some platforms, namely tegra. inline cudaError_t checked_cuda_malloc(void** ptr, std::size_t bytes) { auto status = check_free_device_memory(bytes); if (status != cudaSuccess) { return status; } return cudaMalloc(ptr, bytes); } } // namespace detail using checked_cuda_memory_resource = thrust::system::cuda::detail::cuda_memory_resource>; template class checked_cuda_allocator : public thrust::mr::stateless_resource_allocator> { using base = thrust::mr::stateless_resource_allocator>; public: template struct rebind { typedef checked_cuda_allocator other; }; _CCCL_HOST_DEVICE checked_cuda_allocator() {} _CCCL_HOST_DEVICE checked_cuda_allocator(const checked_cuda_allocator& other) : base(other) {} template _CCCL_HOST_DEVICE checked_cuda_allocator(const checked_cuda_allocator& other) : base(other) {} checked_cuda_allocator& operator=(const checked_cuda_allocator&) = default; _CCCL_HOST_DEVICE ~checked_cuda_allocator() {} }; struct checked_host_memory_resource final : public thrust::mr::new_delete_resource_base { void* do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) final { // Some systems with integrated host/device memory have issues with allocating more memory // than is available. Check the amount of free memory before attempting to allocate on // integrated systems. int device = 0; CubDebugExit(cudaGetDevice(&device)); cudaDeviceProp prop; CubDebugExit(cudaGetDeviceProperties(&prop, device)); if (prop.integrated) { auto status = detail::check_free_device_memory(bytes + alignment + sizeof(std::size_t)); if (status != cudaSuccess) { throw std::bad_alloc{}; } } return this->new_delete_resource_base::do_allocate(bytes, alignment); } }; template using checked_host_allocator = thrust::mr::stateless_resource_allocator; } // namespace c2h cccl-2.5.0/cub/test/c2h/cpu_timer.cuh000066400000000000000000000073471463375617100173460ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include // #define C2H_DEBUG_TIMING #ifdef C2H_DEBUG_TIMING # define C2H_TIME_SECTION_INIT() \ c2h::cpu_timer _c2h_timer_; \ (void) _c2h_timer_ # define C2H_TIME_SECTION_RESET() _c2h_timer_.reset() # define C2H_TIME_SECTION(label) _c2h_timer_.print_elapsed_seconds_and_reset(label) # define C2H_TIME_SCOPE(label) \ c2h::scoped_cpu_timer _c2h_scoped_cpu_timer_(label); \ (void) _c2h_scoped_cpu_timer_ #else # define C2H_TIME_SECTION_INIT() /* no-op */ []() {}() # define C2H_TIME_SECTION_RESET() /* no-op */ []() {}() # define C2H_TIME_SECTION(label) /* no-op */ []() {}() # define C2H_TIME_SCOPE(label) /* no-op */ []() {}() #endif namespace c2h { class cpu_timer { std::chrono::high_resolution_clock::time_point m_start; public: cpu_timer() : m_start(std::chrono::high_resolution_clock::now()) {} void reset() { m_start = std::chrono::high_resolution_clock::now(); } int elapsed_ms() const { auto duration = std::chrono::high_resolution_clock::now() - m_start; auto ms = std::chrono::duration_cast(duration); return static_cast(ms.count()); } std::uint64_t elapsed_us() const { auto duration = std::chrono::high_resolution_clock::now() - m_start; auto us = std::chrono::duration_cast(duration); return static_cast(us.count()); } void print_elapsed_seconds(const std::string& label) { printf("%0.6f s: %s\n", this->elapsed_us() / 1000000.f, label.c_str()); } void print_elapsed_seconds_and_reset(const std::string& label) { this->print_elapsed_seconds(label); this->reset(); } }; class scoped_cpu_timer { cpu_timer m_timer; std::string m_label; public: explicit scoped_cpu_timer(std::string label) : m_label(std::move(label)) {} ~scoped_cpu_timer() { m_timer.print_elapsed_seconds(m_label); } }; } // namespace c2h cccl-2.5.0/cub/test/c2h/custom_type.cuh000066400000000000000000000135161463375617100177250ustar00rootroot00000000000000/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include namespace c2h { struct custom_type_state_t { std::size_t key{}; std::size_t val{}; }; template