pax_global_header00006660000000000000000000000064144002337710014513gustar00rootroot0000000000000052 comment=bd5c2117f62c73a9e922d5e93353a39ab3ac269b triton-2.0.0/000077500000000000000000000000001440023377100130315ustar00rootroot00000000000000triton-2.0.0/.clang-format000066400000000000000000000000231440023377100153770ustar00rootroot00000000000000BasedOnStyle: LLVM triton-2.0.0/.github/000077500000000000000000000000001440023377100143715ustar00rootroot00000000000000triton-2.0.0/.github/CODEOWNERS000066400000000000000000000031211440023377100157610ustar00rootroot00000000000000# These owners will be the default owners for everything in # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. * @ptillet # -------- # Analyses # -------- # Alias analysis include/triton/Analysis/Alias.h @Jokeren lib/Analysis/Alias.cpp @Jokeren # Allocation analysis include/triton/Analysis/Allocation.h @Jokeren lib/Analysis/Allocation.cpp @Jokeren # Membar analysis include/triton/Analysis/Membar.h @Jokeren lib/Analysis/Membar.cpp @Jokeren # AxisInfo analysis include/triton/Analysis/AxisInfo.h @ptillet lib/Analysis/AxisInfo.cpp @ptillet # Utilities include/triton/Analysis/Utility.h @Jokeren lib/Analysis/Utility.cpp @Jokeren # ---------- # Dialects # ---------- # Pipeline pass lib/Dialect/TritonGPU/Transforms/Pipeline.cpp @daadaada # Prefetch pass lib/Dialect/TritonGPU/Transforms/Prefetch.cpp @daadaada # Coalesce pass lib/Dialect/TritonGPU/Transforms/Coalesce.cpp @ptillet # Layout simplification pass lib/Dialect/TritonGPU/Transforms/Combine.cpp @ptillet # ----------- # Conversions # ----------- # TritonGPUToLLVM include/triton/Conversion/TritonGPUToLLVM/ @goostavz @Superjomn lib/Conversions/TritonGPUToLLVM @goostavz @Superjomn # TritonToTritonGPU include/triton/Conversion/TritonToTritonGPU/ @daadaada lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp @daadaada # ------- # Targets # ------- # LLVMIR include/triton/Target/LLVMIR/ @goostavz @Superjomn lib/Target/LLVMIR @goostavz @Superjomn # PTX include/triton/Target/PTX/ @goostavz @Superjomn lib/Target/PTX @goostavz @Superjomn triton-2.0.0/.github/workflows/000077500000000000000000000000001440023377100164265ustar00rootroot00000000000000triton-2.0.0/.github/workflows/integration-tests.yml000066400000000000000000000060451440023377100226410ustar00rootroot00000000000000name: Integration Tests on: workflow_dispatch: pull_request: branches: - main - triton-mlir concurrency: group: ${{ github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} jobs: Runner-Preparation: runs-on: ubuntu-latest outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - name: Prepare runner matrix id: set-matrix run: | if [ x"${{ github.repository }}" == x"openai/triton" ]; then echo '::set-output name=matrix::[["self-hosted", "A10"], ["self-hosted", "V100"], "macos-10.15"]' else echo '::set-output name=matrix::["ubuntu-latest", "macos-10.15"]' fi Integration-Tests: needs: Runner-Preparation runs-on: ${{ matrix.runner }} strategy: matrix: runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix)}} steps: - name: Checkout uses: actions/checkout@v2 - name: Clear cache run: | rm -rf ~/.triton/cache/ - name: Check imports if: ${{ matrix.runner != 'macos-10.15' }} run: | pip install isort isort -c ./python || ( echo '::error title=Imports not sorted::Please run \"isort ./python\"' ; exit 1 ) - name: Check python style if: ${{ matrix.runner != 'macos-10.15' }} run: | pip install 
autopep8 autopep8 -a -r -d --exit-code ./python || ( echo '::error title=Style issues::Please run \"autopep8 -a -r -i ./python\"' ; exit 1 ) - name: Check cpp style if: ${{ matrix.runner != 'macos-10.15' }} run: | pip install clang-format find . -regex '.*\.\(cpp\|hpp\|h\|cc\)' -not -path "./python/triton/*" -not -path "./python/build/*" -not -path "./include/triton/external/*" -print0 | xargs -0 -n1 clang-format -style=file --dry-run -Werror -i || (echo '::error title=Style issues:: Please run `find . -regex ".*\.\(cpp\|hpp\|h\|cc\)" -not -path "./python/triton/*" -not -path "./python/build/*" -not -path "./include/triton/external/*" -print0 | xargs -0 -n1 clang-format -style=file -i`' ; exit 1) - name: Flake8 if: ${{ matrix.runner != 'macos-10.15' }} run: | pip install flake8 flake8 --config ./python/setup.cfg ./python || ( echo '::error::Flake8 failed; see logs for errors.' ; exit 1 ) - name: Install Triton run: | cd python TRITON_USE_ASSERT_ENABLED_LLVM=TRUE pip3 install -e '.[tests]' - name: Run lit tests run: | cd python LIT_TEST_DIR="build/$(ls build)/test" if [ ! -d "$LIT_TEST_DIR" ]; then echo "Not found `$LIT_TEST_DIR`. Did you change an installation method?" ; exit -1 fi lit -v "$LIT_TEST_DIR" - name: Run python tests if: ${{matrix.runner[0] == 'self-hosted'}} run: | cd python/test/unit/ pytest - name: Run CXX unittests run: | cd python/ cd "build/$(ls build)" ctest triton-2.0.0/.github/workflows/wheels.yml000066400000000000000000000023431440023377100204420ustar00rootroot00000000000000name: Wheels on: workflow_dispatch: #schedule: # - cron: "0 0 * * *" jobs: Build-Wheels: runs-on: [self-hosted, V100] steps: - name: Checkout uses: actions/checkout@v2 - name: Patch setup.py run: | #sed -i 's/name\=\"triton\"/name="triton-nightly"/g' python/setup.py export LATEST_DATE=$(TZ=UTC0 git show --quiet --date='format-local:%Y%m%d' --format="%cd") #sed -i -r "s/version\=\"(.*)\"/version=\"\1-dev"$LATEST_DATE"\"/g" python/setup.py echo "" >> python/setup.cfg echo "[build_ext]" >> python/setup.cfg echo "base-dir=/project" >> python/setup.cfg - name: Build wheels run: | export CIBW_MANYLINUX_X86_64_IMAGE="quay.io/pypa/manylinux2014_x86_64:latest" #export CIBW_MANYLINUX_PYPY_X86_64_IMAGE="quay.io/pypa/manylinux2014_x86_64:latest" export CIBW_BEFORE_BUILD="pip install cmake;" export CIBW_SKIP="{cp,pp}35-*" export CIBW_BUILD="{cp,pp}3*-manylinux_x86_64" python3 -m cibuildwheel python --output-dir wheelhouse - name: Upload wheels to PyPI run: | python3 -m twine upload wheelhouse/* -u __token__ -p ${{ secrets.PYPY_API_TOKEN }} triton-2.0.0/.gitignore000066400000000000000000000004301440023377100150160ustar00rootroot00000000000000# Triton builds build/ # Triton Python module builds python/build/ python/triton.egg-info/ python/triton/_C/libtriton.pyd python/triton/_C/libtriton.so # Python caches __pycache__ .pytest_cache # VS Code project files .vscode .vs # JetBrains project files .idea cmake-build-* triton-2.0.0/.gitmodules000066400000000000000000000001561440023377100152100ustar00rootroot00000000000000[submodule "deps/dlfcn-win32"] path = deps/dlfcn-win32 url = https://github.com/dlfcn-win32/dlfcn-win32.git triton-2.0.0/.isort.cfg000066400000000000000000000001021440023377100147210ustar00rootroot00000000000000[settings] known_local_folder=triton line_length=88 py_version=36 triton-2.0.0/CMakeLists.txt000066400000000000000000000154721440023377100156020ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.6) include(ExternalProject) set(CMAKE_CXX_STANDARD 17) set(CMAKE_INCLUDE_CURRENT_DIR ON) 
project(triton) include(CTest) if(NOT WIN32) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") endif() # Options option(TRITON_BUILD_TUTORIALS "Build C++ Triton tutorials" ON) option(TRITON_BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF) # Ensure Python3 vars are set correctly # used conditionally in this file and by lit tests # Customized release build type with assertions: TritonRelBuildWithAsserts set(CMAKE_C_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g") set(CMAKE_CXX_FLAGS_TRITONRELBUILDWITHASSERTS "-O2 -g") # Default build type if(NOT CMAKE_BUILD_TYPE) message(STATUS "Default build type: Release") set(CMAKE_BUILD_TYPE "Release") endif() if(NOT WIN32) find_library(TERMINFO_LIBRARY tinfo) endif() # Compiler flags include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) # Third-party include_directories(${PYBIND11_INCLUDE_DIR}) if(WIN32) SET(BUILD_SHARED_LIBS OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/deps/dlfcn-win32/src) add_subdirectory(deps/dlfcn-win32/src ${CMAKE_BINARY_DIR}/dlfcn-win32) endif() set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -D__STDC_FORMAT_MACROS -fPIC -std=gnu++17 -fvisibility=hidden -fvisibility-inlines-hidden") if(APPLE) set(CMAKE_OSX_DEPLOYMENT_TARGET 11.6) endif() ########## # LLVM ########## if (NOT MLIR_DIR) if(NOT LLVM_LIBRARY_DIR) if(WIN32) find_package(LLVM 13 REQUIRED COMPONENTS nvptx amdgpu) include_directories(${LLVM_INCLUDE_DIRS}) separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) add_definitions(${LLVM_DEFINITIONS_LIST}) llvm_map_components_to_libnames(LLVM_LIBRARIES support core NVPTXInfo nvptxcodegen AMDGPUInfo AMDGPUcodegen ) else() find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu") endif() message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") # FindLLVM outputs LLVM_LIBRARY_DIRS but we expect LLVM_LIBRARY_DIR here set(LLVM_LIBRARY_DIR ${LLVM_LIBRARY_DIRS}) if(APPLE) set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14") endif() # sometimes we don't want to use llvm-config, since it may have been downloaded for some specific linux distros else() set(LLVM_LDFLAGS "-L${LLVM_LIBRARY_DIR}") set(LLVM_LIBRARIES libLLVMNVPTXCodeGen.a libLLVMNVPTXDesc.a libLLVMNVPTXInfo.a libLLVMAMDGPUDisassembler.a libLLVMMCDisassembler.a libLLVMAMDGPUCodeGen.a libLLVMMIRParser.a libLLVMGlobalISel.a libLLVMSelectionDAG.a libLLVMipo.a libLLVMInstrumentation.a libLLVMVectorize.a libLLVMLinker.a libLLVMIRReader.a libLLVMAsmParser.a libLLVMFrontendOpenMP.a libLLVMAsmPrinter.a libLLVMDebugInfoDWARF.a libLLVMCodeGen.a libLLVMTarget.a libLLVMScalarOpts.a libLLVMInstCombine.a libLLVMAggressiveInstCombine.a libLLVMTransformUtils.a libLLVMBitWriter.a libLLVMAnalysis.a libLLVMProfileData.a libLLVMObject.a libLLVMTextAPI.a libLLVMBitReader.a libLLVMAMDGPUAsmParser.a libLLVMMCParser.a libLLVMAMDGPUDesc.a libLLVMAMDGPUUtils.a libLLVMMC.a libLLVMDebugInfoCodeView.a libLLVMDebugInfoMSF.a libLLVMCore.a libLLVMRemarks.a libLLVMBitstreamReader.a libLLVMBinaryFormat.a libLLVMAMDGPUInfo.a libLLVMSupport.a libLLVMDemangle.a libLLVMPasses.a libLLVMAnalysis.a libLLVMTransformUtils.a libLLVMScalarOpts.a libLLVMTransformUtils.a libLLVMipo.a libLLVMObjCARCOpts.a libLLVMCoroutines.a libLLVMAnalysis.a ) endif() set (MLIR_DIR ${LLVM_LIBRARY_DIR}/cmake/mlir) endif() # Python module if(TRITON_BUILD_PYTHON_MODULE) message(STATUS "Adding Python module") set(PYTHON_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}/python/src) set(PYTHON_SRC ${PYTHON_SRC_PATH}/main.cc ${PYTHON_SRC_PATH}/triton.cc) include_directories("." 
${PYTHON_SRC_PATH}) if (PYTHON_INCLUDE_DIRS) include_directories(${PYTHON_INCLUDE_DIRS}) else() find_package(Python3 REQUIRED COMPONENTS Development Interpreter) include_directories(${Python3_INCLUDE_DIRS}) link_directories(${Python3_LIBRARY_DIRS}) link_libraries(${Python3_LIBRARIES}) add_link_options(${Python3_LINK_OPTIONS}) endif() endif() # # Triton # file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc) # if (WIN32 AND TRITON_BUILD_PYTHON_MODULE) # Python3_add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) # set_target_properties(triton PROPERTIES SUFFIX ".pyd") # set_target_properties(triton PROPERTIES PREFIX "lib") # else() # add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) # endif() # MLIR find_package(MLIR REQUIRED CONFIG PATHS ${MLIR_DIR}) list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") include(TableGen) # required by AddMLIR include(AddLLVM) include(AddMLIR) # Disable warnings that show up in external code (gtest;pybind11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-covered-switch-default") include_directories(${MLIR_INCLUDE_DIRS}) include_directories(${LLVM_INCLUDE_DIRS}) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_BINARY_DIR}/include) # Tablegen'd files # link_directories(${LLVM_LIBRARY_DIR}) add_subdirectory(include) add_subdirectory(lib) add_subdirectory(bin) # find_package(PythonLibs REQUIRED) set(TRITON_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(TRITON_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}") get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) if(TRITON_BUILD_PYTHON_MODULE) add_library(triton SHARED ${PYTHON_SRC}) target_link_libraries(triton TritonAnalysis TritonTransforms TritonGPUTransforms TritonLLVMIR TritonPTX ${dialect_libs} ${conversion_libs} # optimizations MLIRPass MLIRTransforms MLIRLLVMIR MLIRSupport MLIRTargetLLVMIRExport MLIRExecutionEngine MLIRMathToLLVM MLIRNVVMToLLVMIRTranslation MLIRIR ) target_link_options(triton PRIVATE ${LLVM_LDFLAGS}) if(WIN32) target_link_libraries(triton PRIVATE ${LLVM_LIBRARIES} dl) # dl is from dlfcn-win32 elseif(APPLE) target_link_libraries(triton ${LLVM_LIBRARIES} z) else() target_link_libraries(triton ${LLVM_LIBRARIES} z stdc++fs) endif() endif() if(TRITON_BUILD_PYTHON_MODULE AND NOT WIN32) set(CMAKE_SHARED_LIBRARY_SUFFIX ".so") # Check if the platform is MacOS if(APPLE) set(PYTHON_LDFLAGS "-undefined dynamic_lookup -flto") endif() target_link_libraries(triton ${CUTLASS_LIBRARIES} ${PYTHON_LDFLAGS}) endif() add_subdirectory(test) add_subdirectory(unittest) triton-2.0.0/LICENSE000077500000000000000000000021731440023377100140440ustar00rootroot00000000000000/* * Copyright 2018-2020 Philippe Tillet * Copyright 2020-2022 OpenAI * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files * (the "Software"), to deal in the Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ triton-2.0.0/README.md000066400000000000000000000045011440023377100143100ustar00rootroot00000000000000
Triton logo
[![Wheels](https://github.com/openai/triton/actions/workflows/wheels.yml/badge.svg)](https://github.com/openai/triton/actions/workflows/wheels.yml)

**`Documentation`** |
------------------- |
[![Documentation](https://github.com/openai/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/)

# Triton

This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code with higher productivity than CUDA, but also with higher flexibility than other existing DSLs.

The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton!

The [official documentation](https://triton-lang.org) contains installation instructions and tutorials.

# Quick Installation

You can install the latest stable release of Triton from pip:

```bash
pip install triton
```

Binary wheels are available for CPython 3.6-3.11 and PyPy 3.7-3.9.

And the latest nightly release:

```bash
pip install -U --pre triton
```

# Install from source

```
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build time dependency
pip install -e .
```

# Changelog

Version 2.0 is out! New features include:
- Many, many bugfixes
- Performance improvements
- Backend rewritten to use MLIR
- Support for kernels that contain back-to-back matmuls (e.g., flash attention)

# Contributing

Community contributions are more than welcome, whether it be to fix bugs or to add new features. Feel free to open GitHub issues about your contribution ideas, and we will review them. A contributor's guide containing general guidelines is coming soon!

If you’re interested in joining our team and working on Triton & GPU kernels, [we’re hiring](https://openai.com/jobs/#acceleration)!

# Compatibility

Supported Platforms:
* Linux

Supported Hardware:
* NVIDIA GPUs (Compute Capability 7.0+)
* Under development: AMD GPUs, CPUstriton-2.0.0/bin/000077500000000000000000000000001440023377100136015ustar00rootroot00000000000000triton-2.0.0/bin/CMakeLists.txt000066400000000000000000000030041440023377100163400ustar00rootroot00000000000000add_subdirectory(FileCheck)
# add_llvm_executable(FileCheck FileCheck/FileCheck.cpp)
# target_link_libraries(FileCheck PRIVATE LLVMFileCheck LLVMSupport)
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)

add_llvm_executable(triton-opt triton-opt.cpp PARTIAL_SOURCES_INTENDED)

# TODO: what's this?
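# Regarding the TODO above: llvm_update_compile_flags comes from LLVM's
# AddLLVM.cmake module (pulled in via include(AddLLVM) in the top-level
# CMakeLists.txt). It propagates LLVM's own compile options and preprocessor
# definitions (for example -fno-rtti) onto the target, so that triton-opt is
# compiled consistently with the LLVM/MLIR libraries it links against.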
llvm_update_compile_flags(triton-opt) target_link_libraries(triton-opt PRIVATE TritonAnalysis TritonTransforms TritonGPUTransforms ${dialect_libs} ${conversion_libs} # tests TritonTestAnalysis # MLIR core MLIROptLib MLIRPass MLIRTransforms ) mlir_check_all_link_libraries(triton-opt) add_llvm_executable(triton-translate triton-translate.cpp PARTIAL_SOURCES_INTENDED) llvm_update_compile_flags(triton-translate) target_link_libraries(triton-translate PRIVATE TritonAnalysis TritonTransforms TritonGPUTransforms TritonLLVMIR TritonPTX ${dialect_libs} ${conversion_libs} # tests TritonTestAnalysis LLVMCore LLVMSupport LLVMOption LLVMCodeGen LLVMAsmParser # MLIR core MLIROptLib MLIRIR MLIRLLVMIR MLIRPass MLIRSupport MLIRTransforms MLIRExecutionEngine MLIRMathToLLVM MLIRTransformUtils MLIRLLVMToLLVMIRTranslation MLIRNVVMToLLVMIRTranslation ) mlir_check_all_link_libraries(triton-translate) triton-2.0.0/bin/FileCheck/000077500000000000000000000000001440023377100154165ustar00rootroot00000000000000triton-2.0.0/bin/FileCheck/CMakeLists.txt000066400000000000000000000001571440023377100201610ustar00rootroot00000000000000add_llvm_executable(FileCheck FileCheck.cpp) target_link_libraries(FileCheck PRIVATE LLVMFileCheck LLVMSupport)triton-2.0.0/bin/FileCheck/FileCheck.cpp000066400000000000000000001077711440023377100177540ustar00rootroot00000000000000//===- FileCheck.cpp - Check that File's Contents match what is expected --===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // FileCheck does a line-by line check of a file that validates whether it // contains the expected content. This is useful for regression tests etc. // // This program exits with an exit status of 2 on error, exit status of 0 if // the file matched the expected contents, and exit status of 1 if it did not // contain the expected contents. // //===----------------------------------------------------------------------===// #include "llvm/FileCheck/FileCheck.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/Process.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include #include using namespace llvm; static cl::extrahelp FileCheckOptsEnv( "\nOptions are parsed from the environment variable FILECHECK_OPTS and\n" "from the command line.\n"); static cl::opt CheckFilename(cl::Positional, cl::desc(""), cl::Optional); static cl::opt InputFilename("input-file", cl::desc("File to check (defaults to stdin)"), cl::init("-"), cl::value_desc("filename")); static cl::list CheckPrefixes( "check-prefix", cl::desc("Prefix to use from check file (defaults to 'CHECK')")); static cl::alias CheckPrefixesAlias( "check-prefixes", cl::aliasopt(CheckPrefixes), cl::CommaSeparated, cl::NotHidden, cl::desc( "Alias for -check-prefix permitting multiple comma separated values")); static cl::list CommentPrefixes( "comment-prefixes", cl::CommaSeparated, cl::Hidden, cl::desc("Comma-separated list of comment prefixes to use from check file\n" "(defaults to 'COM,RUN'). Please avoid using this feature in\n" "LLVM's LIT-based test suites, which should be easier to\n" "maintain if they all follow a consistent comment style. 
This\n" "feature is meant for non-LIT test suites using FileCheck.")); static cl::opt NoCanonicalizeWhiteSpace( "strict-whitespace", cl::desc("Do not treat all horizontal whitespace as equivalent")); static cl::opt IgnoreCase("ignore-case", cl::desc("Use case-insensitive matching")); static cl::list ImplicitCheckNot( "implicit-check-not", cl::desc("Add an implicit negative check with this pattern to every\n" "positive check. This can be used to ensure that no instances of\n" "this pattern occur which are not matched by a positive pattern"), cl::value_desc("pattern")); static cl::list GlobalDefines("D", cl::AlwaysPrefix, cl::desc("Define a variable to be used in capture patterns."), cl::value_desc("VAR=VALUE")); static cl::opt AllowEmptyInput( "allow-empty", cl::init(false), cl::desc("Allow the input file to be empty. This is useful when making\n" "checks that some error message does not occur, for example.")); static cl::opt AllowUnusedPrefixes( "allow-unused-prefixes", cl::init(false), cl::ZeroOrMore, cl::desc("Allow prefixes to be specified but not appear in the test.")); static cl::opt MatchFullLines( "match-full-lines", cl::init(false), cl::desc("Require all positive matches to cover an entire input line.\n" "Allows leading and trailing whitespace if --strict-whitespace\n" "is not also passed.")); static cl::opt EnableVarScope( "enable-var-scope", cl::init(false), cl::desc("Enables scope for regex variables. Variables with names that\n" "do not start with '$' will be reset at the beginning of\n" "each CHECK-LABEL block.")); static cl::opt AllowDeprecatedDagOverlap( "allow-deprecated-dag-overlap", cl::init(false), cl::desc("Enable overlapping among matches in a group of consecutive\n" "CHECK-DAG directives. This option is deprecated and is only\n" "provided for convenience as old tests are migrated to the new\n" "non-overlapping CHECK-DAG implementation.\n")); static cl::opt Verbose( "v", cl::init(false), cl::ZeroOrMore, cl::desc("Print directive pattern matches, or add them to the input dump\n" "if enabled.\n")); static cl::opt VerboseVerbose( "vv", cl::init(false), cl::ZeroOrMore, cl::desc("Print information helpful in diagnosing internal FileCheck\n" "issues, or add it to the input dump if enabled. Implies\n" "-v.\n")); // The order of DumpInputValue members affects their precedence, as documented // for -dump-input below. enum DumpInputValue { DumpInputNever, DumpInputFail, DumpInputAlways, DumpInputHelp }; static cl::list DumpInputs( "dump-input", cl::desc("Dump input to stderr, adding annotations representing\n" "currently enabled diagnostics. When there are multiple\n" "occurrences of this option, the that appears earliest\n" "in the list below has precedence. The default is 'fail'.\n"), cl::value_desc("mode"), cl::values(clEnumValN(DumpInputHelp, "help", "Explain input dump and quit"), clEnumValN(DumpInputAlways, "always", "Always dump input"), clEnumValN(DumpInputFail, "fail", "Dump input on failure"), clEnumValN(DumpInputNever, "never", "Never dump input"))); // The order of DumpInputFilterValue members affects their precedence, as // documented for -dump-input-filter below. 
enum DumpInputFilterValue { DumpInputFilterError, DumpInputFilterAnnotation, DumpInputFilterAnnotationFull, DumpInputFilterAll }; static cl::list DumpInputFilters( "dump-input-filter", cl::desc("In the dump requested by -dump-input, print only input lines of\n" "kind plus any context specified by -dump-input-context.\n" "When there are multiple occurrences of this option, the \n" "that appears earliest in the list below has precedence. The\n" "default is 'error' when -dump-input=fail, and it's 'all' when\n" "-dump-input=always.\n"), cl::values(clEnumValN(DumpInputFilterAll, "all", "All input lines"), clEnumValN(DumpInputFilterAnnotationFull, "annotation-full", "Input lines with annotations"), clEnumValN(DumpInputFilterAnnotation, "annotation", "Input lines with starting points of annotations"), clEnumValN(DumpInputFilterError, "error", "Input lines with starting points of error " "annotations"))); static cl::list DumpInputContexts( "dump-input-context", cl::value_desc("N"), cl::desc("In the dump requested by -dump-input, print input lines\n" "before and input lines after any lines specified by\n" "-dump-input-filter. When there are multiple occurrences of\n" "this option, the largest specified has precedence. The\n" "default is 5.\n")); typedef cl::list::const_iterator prefix_iterator; static void DumpCommandLine(int argc, char **argv) { errs() << "FileCheck command line: "; for (int I = 0; I < argc; I++) errs() << " " << argv[I]; errs() << "\n"; } struct MarkerStyle { /// The starting char (before tildes) for marking the line. char Lead; /// What color to use for this annotation. raw_ostream::Colors Color; /// A note to follow the marker, or empty string if none. std::string Note; /// Does this marker indicate inclusion by -dump-input-filter=error? bool FiltersAsError; MarkerStyle() {} MarkerStyle(char Lead, raw_ostream::Colors Color, const std::string &Note = "", bool FiltersAsError = false) : Lead(Lead), Color(Color), Note(Note), FiltersAsError(FiltersAsError) { assert((!FiltersAsError || !Note.empty()) && "expected error diagnostic to have note"); } }; static MarkerStyle GetMarker(FileCheckDiag::MatchType MatchTy) { switch (MatchTy) { case FileCheckDiag::MatchFoundAndExpected: return MarkerStyle('^', raw_ostream::GREEN); case FileCheckDiag::MatchFoundButExcluded: return MarkerStyle('!', raw_ostream::RED, "error: no match expected", /*FiltersAsError=*/true); case FileCheckDiag::MatchFoundButWrongLine: return MarkerStyle('!', raw_ostream::RED, "error: match on wrong line", /*FiltersAsError=*/true); case FileCheckDiag::MatchFoundButDiscarded: return MarkerStyle('!', raw_ostream::CYAN, "discard: overlaps earlier match"); case FileCheckDiag::MatchFoundErrorNote: // Note should always be overridden within the FileCheckDiag. 
return MarkerStyle('!', raw_ostream::RED, "error: unknown error after match", /*FiltersAsError=*/true); case FileCheckDiag::MatchNoneAndExcluded: return MarkerStyle('X', raw_ostream::GREEN); case FileCheckDiag::MatchNoneButExpected: return MarkerStyle('X', raw_ostream::RED, "error: no match found", /*FiltersAsError=*/true); case FileCheckDiag::MatchNoneForInvalidPattern: return MarkerStyle('X', raw_ostream::RED, "error: match failed for invalid pattern", /*FiltersAsError=*/true); case FileCheckDiag::MatchFuzzy: return MarkerStyle('?', raw_ostream::MAGENTA, "possible intended match", /*FiltersAsError=*/true); } llvm_unreachable_internal("unexpected match type"); } static void DumpInputAnnotationHelp(raw_ostream &OS) { OS << "The following description was requested by -dump-input=help to\n" << "explain the input dump printed by FileCheck.\n" << "\n" << "Related command-line options:\n" << "\n" << " - -dump-input= enables or disables the input dump\n" << " - -dump-input-filter= filters the input lines\n" << " - -dump-input-context= adjusts the context of filtered lines\n" << " - -v and -vv add more annotations\n" << " - -color forces colors to be enabled both in the dump and below\n" << " - -help documents the above options in more detail\n" << "\n" << "These options can also be set via FILECHECK_OPTS. For example, for\n" << "maximum debugging output on failures:\n" << "\n" << " $ FILECHECK_OPTS='-dump-input-filter=all -vv -color' ninja check\n" << "\n" << "Input dump annotation format:\n" << "\n"; // Labels for input lines. OS << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "L:"; OS << " labels line number L of the input file\n" << " An extra space is added after each input line to represent" << " the\n" << " newline character\n"; // Labels for annotation lines. OS << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L"; OS << " labels the only match result for either (1) a pattern of type T" << " from\n" << " line L of the check file if L is an integer or (2) the" << " I-th implicit\n" << " pattern if L is \"imp\" followed by an integer " << "I (index origin one)\n"; OS << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "T:L'N"; OS << " labels the Nth match result for such a pattern\n"; // Markers on annotation lines. OS << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "^~~"; OS << " marks good match (reported if -v)\n" << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "!~~"; OS << " marks bad match, such as:\n" << " - CHECK-NEXT on same line as previous match (error)\n" << " - CHECK-NOT found (error)\n" << " - CHECK-DAG overlapping match (discarded, reported if " << "-vv)\n" << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "X~~"; OS << " marks search range when no match is found, such as:\n" << " - CHECK-NEXT not found (error)\n" << " - CHECK-NOT not found (success, reported if -vv)\n" << " - CHECK-DAG not found after discarded matches (error)\n" << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "?"; OS << " marks fuzzy match when no match is found\n"; // Elided lines. OS << " - "; WithColor(OS, raw_ostream::SAVEDCOLOR, true) << "..."; OS << " indicates elided input lines and annotations, as specified by\n" << " -dump-input-filter and -dump-input-context\n"; // Colors. 
OS << " - colors "; WithColor(OS, raw_ostream::GREEN, true) << "success"; OS << ", "; WithColor(OS, raw_ostream::RED, true) << "error"; OS << ", "; WithColor(OS, raw_ostream::MAGENTA, true) << "fuzzy match"; OS << ", "; WithColor(OS, raw_ostream::CYAN, true, false) << "discarded match"; OS << ", "; WithColor(OS, raw_ostream::CYAN, true, true) << "unmatched input"; OS << "\n"; } /// An annotation for a single input line. struct InputAnnotation { /// The index of the match result across all checks unsigned DiagIndex; /// The label for this annotation. std::string Label; /// Is this the initial fragment of a diagnostic that has been broken across /// multiple lines? bool IsFirstLine; /// What input line (one-origin indexing) this annotation marks. This might /// be different from the starting line of the original diagnostic if /// !IsFirstLine. unsigned InputLine; /// The column range (one-origin indexing, open end) in which to mark the /// input line. If InputEndCol is UINT_MAX, treat it as the last column /// before the newline. unsigned InputStartCol, InputEndCol; /// The marker to use. MarkerStyle Marker; /// Whether this annotation represents a good match for an expected pattern. bool FoundAndExpectedMatch; }; /// Get an abbreviation for the check type. static std::string GetCheckTypeAbbreviation(Check::FileCheckType Ty) { switch (Ty) { case Check::CheckPlain: if (Ty.getCount() > 1) return "count"; return "check"; case Check::CheckNext: return "next"; case Check::CheckSame: return "same"; case Check::CheckNot: return "not"; case Check::CheckDAG: return "dag"; case Check::CheckLabel: return "label"; case Check::CheckEmpty: return "empty"; case Check::CheckComment: return "com"; case Check::CheckEOF: return "eof"; case Check::CheckBadNot: return "bad-not"; case Check::CheckBadCount: return "bad-count"; case Check::CheckNone: llvm_unreachable("invalid FileCheckType"); } llvm_unreachable("unknown FileCheckType"); } static void BuildInputAnnotations(const SourceMgr &SM, unsigned CheckFileBufferID, const std::pair &ImpPatBufferIDRange, const std::vector &Diags, std::vector &Annotations, unsigned &LabelWidth) { struct CompareSMLoc { bool operator()(const SMLoc &LHS, const SMLoc &RHS) const { return LHS.getPointer() < RHS.getPointer(); } }; // How many diagnostics does each pattern have? std::map DiagCountPerPattern; for (auto Diag : Diags) ++DiagCountPerPattern[Diag.CheckLoc]; // How many diagnostics have we seen so far per pattern? std::map DiagIndexPerPattern; // How many total diagnostics have we seen so far? unsigned DiagIndex = 0; // What's the widest label? LabelWidth = 0; for (auto DiagItr = Diags.begin(), DiagEnd = Diags.end(); DiagItr != DiagEnd; ++DiagItr) { InputAnnotation A; A.DiagIndex = DiagIndex++; // Build label, which uniquely identifies this check result. 
unsigned CheckBufferID = SM.FindBufferContainingLoc(DiagItr->CheckLoc); auto CheckLineAndCol = SM.getLineAndColumn(DiagItr->CheckLoc, CheckBufferID); llvm::raw_string_ostream Label(A.Label); Label << GetCheckTypeAbbreviation(DiagItr->CheckTy) << ":"; if (CheckBufferID == CheckFileBufferID) Label << CheckLineAndCol.first; else if (ImpPatBufferIDRange.first <= CheckBufferID && CheckBufferID < ImpPatBufferIDRange.second) Label << "imp" << (CheckBufferID - ImpPatBufferIDRange.first + 1); else llvm_unreachable("expected diagnostic's check location to be either in " "the check file or for an implicit pattern"); if (DiagCountPerPattern[DiagItr->CheckLoc] > 1) Label << "'" << DiagIndexPerPattern[DiagItr->CheckLoc]++; LabelWidth = std::max((std::string::size_type)LabelWidth, A.Label.size()); A.Marker = GetMarker(DiagItr->MatchTy); if (!DiagItr->Note.empty()) { A.Marker.Note = DiagItr->Note; // It's less confusing if notes that don't actually have ranges don't have // markers. For example, a marker for 'with "VAR" equal to "5"' would // seem to indicate where "VAR" matches, but the location we actually have // for the marker simply points to the start of the match/search range for // the full pattern of which the substitution is potentially just one // component. if (DiagItr->InputStartLine == DiagItr->InputEndLine && DiagItr->InputStartCol == DiagItr->InputEndCol) A.Marker.Lead = ' '; } if (DiagItr->MatchTy == FileCheckDiag::MatchFoundErrorNote) { assert(!DiagItr->Note.empty() && "expected custom note for MatchFoundErrorNote"); A.Marker.Note = "error: " + A.Marker.Note; } A.FoundAndExpectedMatch = DiagItr->MatchTy == FileCheckDiag::MatchFoundAndExpected; // Compute the mark location, and break annotation into multiple // annotations if it spans multiple lines. A.IsFirstLine = true; A.InputLine = DiagItr->InputStartLine; A.InputStartCol = DiagItr->InputStartCol; if (DiagItr->InputStartLine == DiagItr->InputEndLine) { // Sometimes ranges are empty in order to indicate a specific point, but // that would mean nothing would be marked, so adjust the range to // include the following character. A.InputEndCol = std::max(DiagItr->InputStartCol + 1, DiagItr->InputEndCol); Annotations.push_back(A); } else { assert(DiagItr->InputStartLine < DiagItr->InputEndLine && "expected input range not to be inverted"); A.InputEndCol = UINT_MAX; Annotations.push_back(A); for (unsigned L = DiagItr->InputStartLine + 1, E = DiagItr->InputEndLine; L <= E; ++L) { // If a range ends before the first column on a line, then it has no // characters on that line, so there's nothing to render. 
if (DiagItr->InputEndCol == 1 && L == E) break; InputAnnotation B; B.DiagIndex = A.DiagIndex; B.Label = A.Label; B.IsFirstLine = false; B.InputLine = L; B.Marker = A.Marker; B.Marker.Lead = '~'; B.Marker.Note = ""; B.InputStartCol = 1; if (L != E) B.InputEndCol = UINT_MAX; else B.InputEndCol = DiagItr->InputEndCol; B.FoundAndExpectedMatch = A.FoundAndExpectedMatch; Annotations.push_back(B); } } } } static unsigned FindInputLineInFilter( DumpInputFilterValue DumpInputFilter, unsigned CurInputLine, const std::vector::iterator &AnnotationBeg, const std::vector::iterator &AnnotationEnd) { if (DumpInputFilter == DumpInputFilterAll) return CurInputLine; for (auto AnnotationItr = AnnotationBeg; AnnotationItr != AnnotationEnd; ++AnnotationItr) { switch (DumpInputFilter) { case DumpInputFilterAll: llvm_unreachable("unexpected DumpInputFilterAll"); break; case DumpInputFilterAnnotationFull: return AnnotationItr->InputLine; case DumpInputFilterAnnotation: if (AnnotationItr->IsFirstLine) return AnnotationItr->InputLine; break; case DumpInputFilterError: if (AnnotationItr->IsFirstLine && AnnotationItr->Marker.FiltersAsError) return AnnotationItr->InputLine; break; } } return UINT_MAX; } /// To OS, print a vertical ellipsis (right-justified at LabelWidth) if it would /// occupy less lines than ElidedLines, but print ElidedLines otherwise. Either /// way, clear ElidedLines. Thus, if ElidedLines is empty, do nothing. static void DumpEllipsisOrElidedLines(raw_ostream &OS, std::string &ElidedLines, unsigned LabelWidth) { if (ElidedLines.empty()) return; unsigned EllipsisLines = 3; if (EllipsisLines < StringRef(ElidedLines).count('\n')) { for (unsigned i = 0; i < EllipsisLines; ++i) { WithColor(OS, raw_ostream::BLACK, /*Bold=*/true) << right_justify(".", LabelWidth); OS << '\n'; } } else OS << ElidedLines; ElidedLines.clear(); } static void DumpAnnotatedInput(raw_ostream &OS, const FileCheckRequest &Req, DumpInputFilterValue DumpInputFilter, unsigned DumpInputContext, StringRef InputFileText, std::vector &Annotations, unsigned LabelWidth) { OS << "Input was:\n<<<<<<\n"; // Sort annotations. llvm::sort(Annotations, [](const InputAnnotation &A, const InputAnnotation &B) { // 1. Sort annotations in the order of the input lines. // // This makes it easier to find relevant annotations while // iterating input lines in the implementation below. FileCheck // does not always produce diagnostics in the order of input // lines due to, for example, CHECK-DAG and CHECK-NOT. if (A.InputLine != B.InputLine) return A.InputLine < B.InputLine; // 2. Sort annotations in the temporal order FileCheck produced // their associated diagnostics. // // This sort offers several benefits: // // A. On a single input line, the order of annotations reflects // the FileCheck logic for processing directives/patterns. // This can be helpful in understanding cases in which the // order of the associated directives/patterns in the check // file or on the command line either (i) does not match the // temporal order in which FileCheck looks for matches for the // directives/patterns (due to, for example, CHECK-LABEL, // CHECK-NOT, or `--implicit-check-not`) or (ii) does match // that order but does not match the order of those // diagnostics along an input line (due to, for example, // CHECK-DAG). // // On the other hand, because our presentation format presents // input lines in order, there's no clear way to offer the // same benefit across input lines. 
For consistency, it might // then seem worthwhile to have annotations on a single line // also sorted in input order (that is, by input column). // However, in practice, this appears to be more confusing // than helpful. Perhaps it's intuitive to expect annotations // to be listed in the temporal order in which they were // produced except in cases the presentation format obviously // and inherently cannot support it (that is, across input // lines). // // B. When diagnostics' annotations are split among multiple // input lines, the user must track them from one input line // to the next. One property of the sort chosen here is that // it facilitates the user in this regard by ensuring the // following: when comparing any two input lines, a // diagnostic's annotations are sorted in the same position // relative to all other diagnostics' annotations. return A.DiagIndex < B.DiagIndex; }); // Compute the width of the label column. const unsigned char *InputFilePtr = InputFileText.bytes_begin(), *InputFileEnd = InputFileText.bytes_end(); unsigned LineCount = InputFileText.count('\n'); if (InputFileEnd[-1] != '\n') ++LineCount; unsigned LineNoWidth = std::log10(LineCount) + 1; // +3 below adds spaces (1) to the left of the (right-aligned) line numbers // on input lines and (2) to the right of the (left-aligned) labels on // annotation lines so that input lines and annotation lines are more // visually distinct. For example, the spaces on the annotation lines ensure // that input line numbers and check directive line numbers never align // horizontally. Those line numbers might not even be for the same file. // One space would be enough to achieve that, but more makes it even easier // to see. LabelWidth = std::max(LabelWidth, LineNoWidth) + 3; // Print annotated input lines. unsigned PrevLineInFilter = 0; // 0 means none so far unsigned NextLineInFilter = 0; // 0 means uncomputed, UINT_MAX means none std::string ElidedLines; raw_string_ostream ElidedLinesOS(ElidedLines); ColorMode TheColorMode = WithColor(OS).colorsEnabled() ? ColorMode::Enable : ColorMode::Disable; if (TheColorMode == ColorMode::Enable) ElidedLinesOS.enable_colors(true); auto AnnotationItr = Annotations.begin(), AnnotationEnd = Annotations.end(); for (unsigned Line = 1; InputFilePtr != InputFileEnd || AnnotationItr != AnnotationEnd; ++Line) { const unsigned char *InputFileLine = InputFilePtr; // Compute the previous and next line included by the filter. if (NextLineInFilter < Line) NextLineInFilter = FindInputLineInFilter(DumpInputFilter, Line, AnnotationItr, AnnotationEnd); assert(NextLineInFilter && "expected NextLineInFilter to be computed"); if (NextLineInFilter == Line) PrevLineInFilter = Line; // Elide this input line and its annotations if it's not within the // context specified by -dump-input-context of an input line included by // -dump-input-filter. However, in case the resulting ellipsis would occupy // more lines than the input lines and annotations it elides, buffer the // elided lines and annotations so we can print them instead. raw_ostream *LineOS = &OS; if ((!PrevLineInFilter || PrevLineInFilter + DumpInputContext < Line) && (NextLineInFilter == UINT_MAX || Line + DumpInputContext < NextLineInFilter)) LineOS = &ElidedLinesOS; else { LineOS = &OS; DumpEllipsisOrElidedLines(OS, ElidedLinesOS.str(), LabelWidth); } // Print right-aligned line number. 
WithColor(*LineOS, raw_ostream::BLACK, /*Bold=*/true, /*BF=*/false, TheColorMode) << format_decimal(Line, LabelWidth) << ": "; // For the case where -v and colors are enabled, find the annotations for // good matches for expected patterns in order to highlight everything // else in the line. There are no such annotations if -v is disabled. std::vector FoundAndExpectedMatches; if (Req.Verbose && TheColorMode == ColorMode::Enable) { for (auto I = AnnotationItr; I != AnnotationEnd && I->InputLine == Line; ++I) { if (I->FoundAndExpectedMatch) FoundAndExpectedMatches.push_back(*I); } } // Print numbered line with highlighting where there are no matches for // expected patterns. bool Newline = false; { WithColor COS(*LineOS, raw_ostream::SAVEDCOLOR, /*Bold=*/false, /*BG=*/false, TheColorMode); bool InMatch = false; if (Req.Verbose) COS.changeColor(raw_ostream::CYAN, true, true); for (unsigned Col = 1; InputFilePtr != InputFileEnd && !Newline; ++Col) { bool WasInMatch = InMatch; InMatch = false; for (auto M : FoundAndExpectedMatches) { if (M.InputStartCol <= Col && Col < M.InputEndCol) { InMatch = true; break; } } if (!WasInMatch && InMatch) COS.resetColor(); else if (WasInMatch && !InMatch) COS.changeColor(raw_ostream::CYAN, true, true); if (*InputFilePtr == '\n') { Newline = true; COS << ' '; } else COS << *InputFilePtr; ++InputFilePtr; } } *LineOS << '\n'; unsigned InputLineWidth = InputFilePtr - InputFileLine; // Print any annotations. while (AnnotationItr != AnnotationEnd && AnnotationItr->InputLine == Line) { WithColor COS(*LineOS, AnnotationItr->Marker.Color, /*Bold=*/true, /*BG=*/false, TheColorMode); // The two spaces below are where the ": " appears on input lines. COS << left_justify(AnnotationItr->Label, LabelWidth) << " "; unsigned Col; for (Col = 1; Col < AnnotationItr->InputStartCol; ++Col) COS << ' '; COS << AnnotationItr->Marker.Lead; // If InputEndCol=UINT_MAX, stop at InputLineWidth. for (++Col; Col < AnnotationItr->InputEndCol && Col <= InputLineWidth; ++Col) COS << '~'; const std::string &Note = AnnotationItr->Marker.Note; if (!Note.empty()) { // Put the note at the end of the input line. If we were to instead // put the note right after the marker, subsequent annotations for the // same input line might appear to mark this note instead of the input // line. for (; Col <= InputLineWidth; ++Col) COS << ' '; COS << ' ' << Note; } COS << '\n'; ++AnnotationItr; } } DumpEllipsisOrElidedLines(OS, ElidedLinesOS.str(), LabelWidth); OS << ">>>>>>\n"; } int main(int argc, char **argv) { // Enable use of ANSI color codes because FileCheck is using them to // highlight text. llvm::sys::Process::UseANSIEscapeCodes(true); InitLLVM X(argc, argv); cl::ParseCommandLineOptions(argc, argv, /*Overview*/ "", /*Errs*/ nullptr, "FILECHECK_OPTS"); // Select -dump-input* values. The -help documentation specifies the default // value and which value to choose if an option is specified multiple times. // In the latter case, the general rule of thumb is to choose the value that // provides the most information. DumpInputValue DumpInput = DumpInputs.empty() ? DumpInputFail : *std::max_element(DumpInputs.begin(), DumpInputs.end()); DumpInputFilterValue DumpInputFilter; if (DumpInputFilters.empty()) DumpInputFilter = DumpInput == DumpInputAlways ? DumpInputFilterAll : DumpInputFilterError; else DumpInputFilter = *std::max_element(DumpInputFilters.begin(), DumpInputFilters.end()); unsigned DumpInputContext = DumpInputContexts.empty() ? 
5 : *std::max_element(DumpInputContexts.begin(), DumpInputContexts.end()); if (DumpInput == DumpInputHelp) { DumpInputAnnotationHelp(outs()); return 0; } if (CheckFilename.empty()) { errs() << " not specified\n"; return 2; } FileCheckRequest Req; append_range(Req.CheckPrefixes, CheckPrefixes); append_range(Req.CommentPrefixes, CommentPrefixes); append_range(Req.ImplicitCheckNot, ImplicitCheckNot); bool GlobalDefineError = false; for (StringRef G : GlobalDefines) { size_t EqIdx = G.find('='); if (EqIdx == std::string::npos) { errs() << "Missing equal sign in command-line definition '-D" << G << "'\n"; GlobalDefineError = true; continue; } if (EqIdx == 0) { errs() << "Missing variable name in command-line definition '-D" << G << "'\n"; GlobalDefineError = true; continue; } Req.GlobalDefines.push_back(G); } if (GlobalDefineError) return 2; Req.AllowEmptyInput = AllowEmptyInput; Req.AllowUnusedPrefixes = AllowUnusedPrefixes; Req.EnableVarScope = EnableVarScope; Req.AllowDeprecatedDagOverlap = AllowDeprecatedDagOverlap; Req.Verbose = Verbose; Req.VerboseVerbose = VerboseVerbose; Req.NoCanonicalizeWhiteSpace = NoCanonicalizeWhiteSpace; Req.MatchFullLines = MatchFullLines; Req.IgnoreCase = IgnoreCase; if (VerboseVerbose) Req.Verbose = true; FileCheck FC(Req); if (!FC.ValidateCheckPrefixes()) return 2; Regex PrefixRE = FC.buildCheckPrefixRegex(); std::string REError; if (!PrefixRE.isValid(REError)) { errs() << "Unable to combine check-prefix strings into a prefix regular " "expression! This is likely a bug in FileCheck's verification of " "the check-prefix strings. Regular expression parsing failed " "with the following error: " << REError << "\n"; return 2; } SourceMgr SM; // Read the expected strings from the check file. ErrorOr> CheckFileOrErr = MemoryBuffer::getFileOrSTDIN(CheckFilename, /*IsText=*/true); if (std::error_code EC = CheckFileOrErr.getError()) { errs() << "Could not open check file '" << CheckFilename << "': " << EC.message() << '\n'; return 2; } MemoryBuffer &CheckFile = *CheckFileOrErr.get(); SmallString<4096> CheckFileBuffer; StringRef CheckFileText = FC.CanonicalizeFile(CheckFile, CheckFileBuffer); unsigned CheckFileBufferID = SM.AddNewSourceBuffer(MemoryBuffer::getMemBuffer( CheckFileText, CheckFile.getBufferIdentifier()), SMLoc()); std::pair ImpPatBufferIDRange; if (FC.readCheckFile(SM, CheckFileText, PrefixRE, &ImpPatBufferIDRange)) return 2; // Open the file to check and add it to SourceMgr. ErrorOr> InputFileOrErr = MemoryBuffer::getFileOrSTDIN(InputFilename, /*IsText=*/true); if (InputFilename == "-") InputFilename = ""; // Overwrite for improved diagnostic messages if (std::error_code EC = InputFileOrErr.getError()) { errs() << "Could not open input file '" << InputFilename << "': " << EC.message() << '\n'; return 2; } MemoryBuffer &InputFile = *InputFileOrErr.get(); if (InputFile.getBufferSize() == 0 && !AllowEmptyInput) { errs() << "FileCheck error: '" << InputFilename << "' is empty.\n"; DumpCommandLine(argc, argv); return 2; } SmallString<4096> InputFileBuffer; StringRef InputFileText = FC.CanonicalizeFile(InputFile, InputFileBuffer); SM.AddNewSourceBuffer(MemoryBuffer::getMemBuffer( InputFileText, InputFile.getBufferIdentifier()), SMLoc()); std::vector Diags; int ExitCode = FC.checkInput(SM, InputFileText, DumpInput == DumpInputNever ? nullptr : &Diags) ? 
EXIT_SUCCESS : 1; if (DumpInput == DumpInputAlways || (ExitCode == 1 && DumpInput == DumpInputFail)) { errs() << "\n" << "Input file: " << InputFilename << "\n" << "Check file: " << CheckFilename << "\n" << "\n" << "-dump-input=help explains the following input dump.\n" << "\n"; std::vector Annotations; unsigned LabelWidth; BuildInputAnnotations(SM, CheckFileBufferID, ImpPatBufferIDRange, Diags, Annotations, LabelWidth); DumpAnnotatedInput(errs(), Req, DumpInputFilter, DumpInputContext, InputFileText, Annotations, LabelWidth); } return ExitCode; } triton-2.0.0/bin/triton-opt.cpp000066400000000000000000000026221440023377100164260ustar00rootroot00000000000000#include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/Triton/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Conversion/Passes.h" #include "mlir/IR/Dialect.h" #include "mlir/InitAllPasses.h" #include "mlir/Support/MlirOptMain.h" namespace mlir { namespace test { void registerTestAliasPass(); void registerTestAlignmentPass(); void registerTestAllocationPass(); void registerTestMembarPass(); } // namespace test } // namespace mlir int main(int argc, char **argv) { mlir::registerAllPasses(); mlir::registerTritonPasses(); mlir::registerTritonGPUPasses(); mlir::test::registerTestAliasPass(); mlir::test::registerTestAlignmentPass(); mlir::test::registerTestAllocationPass(); mlir::test::registerTestMembarPass(); mlir::triton::registerConvertTritonToTritonGPUPass(); mlir::triton::registerConvertTritonGPUToLLVMPass(); // TODO: register Triton & TritonGPU passes mlir::DialectRegistry registry; registry.insert(); return mlir::asMainReturnCode(mlir::MlirOptMain( argc, argv, "Triton (GPU) optimizer driver\n", registry)); } triton-2.0.0/bin/triton-translate.cpp000066400000000000000000000101211440023377100176120ustar00rootroot00000000000000#include "mlir/ExecutionEngine/ExecutionEngine.h" #include "mlir/ExecutionEngine/OptUtils.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dialect.h" #include "mlir/Parser.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Export.h" #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h" #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Target/LLVMIR/LLVMIRTranslation.h" #include "triton/Target/PTX/PTXTranslation.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" #include namespace mlir { namespace triton { OwningOpRef loadMLIRModule(llvm::StringRef inputFilename, MLIRContext &context) { std::string errorMessage; auto input = openInputFile(inputFilename, &errorMessage); if (!input) { llvm::errs() << errorMessage << "\n"; return nullptr; } mlir::DialectRegistry registry; registry.insert(); context.appendDialectRegistry(registry); auto processBuffer = [&](std::unique_ptr ownedBuffer) -> OwningOpRef { llvm::SourceMgr sourceMgr; sourceMgr.AddNewSourceBuffer(std::move(ownedBuffer), SMLoc()); context.loadAllAvailableDialects(); context.allowUnregisteredDialects(); OwningOpRef module(parseSourceFile(sourceMgr, 
&context)); if (!module) { llvm::errs() << "Parse MLIR file failed."; return nullptr; } return module; }; auto module = processBuffer(std::move(input)); if (!module) { return nullptr; } return module; } LogicalResult tritonTranslateMain(int argc, char **argv, llvm::StringRef toolName) { static llvm::cl::opt inputFilename( llvm::cl::Positional, llvm::cl::desc(""), llvm::cl::init("-")); static llvm::cl::opt outputFilename( "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), llvm::cl::init("-")); static llvm::cl::opt targetKind( "target", llvm::cl::desc(""), llvm::cl::value_desc("target"), llvm::cl::init("llvmir")); static llvm::cl::opt SMArch("sm", llvm::cl::desc("sm arch"), llvm::cl::init(80)); static llvm::cl::opt ptxVersion( "ptx-version", llvm::cl::desc("PTX version"), llvm::cl::init(10000)); llvm::InitLLVM y(argc, argv); registerAsmPrinterCLOptions(); registerMLIRContextCLOptions(); llvm::cl::ParseCommandLineOptions(argc, argv, toolName); mlir::MLIRContext context; auto module = loadMLIRModule(inputFilename, context); if (!module) { return failure(); } std::string errorMessage; auto output = openOutputFile(outputFilename, &errorMessage); if (!output) { llvm::errs() << errorMessage << "\n"; return failure(); } llvm::LLVMContext llvmContext; auto llvmir = translateTritonGPUToLLVMIR(&llvmContext, *module, SMArch.getValue()); if (!llvmir) { llvm::errs() << "Translate to LLVM IR failed"; } if (targetKind == "llvmir") llvm::outs() << *llvmir << '\n'; else if (targetKind == "ptx") llvm::outs() << ::triton::translateLLVMIRToPTX(*llvmir, SMArch.getValue(), ptxVersion.getValue()); return success(); } } // namespace triton } // namespace mlir int main(int argc, char **argv) { return failed(mlir::triton::tritonTranslateMain( argc, argv, "Triton Translate Testing Tool.")); } triton-2.0.0/cmake/000077500000000000000000000000001440023377100141115ustar00rootroot00000000000000triton-2.0.0/cmake/FindLLVM.cmake000066400000000000000000000210321440023377100164640ustar00rootroot00000000000000 # - Find LLVM headers and libraries. # This module locates LLVM and adapts the llvm-config output for use with # CMake. # # A given list of COMPONENTS is passed to llvm-config. # # The following variables are defined: # LLVM_FOUND - true if LLVM was found # LLVM_CXXFLAGS - C++ compiler flags for files that include LLVM headers. # LLVM_ENABLE_ASSERTIONS - Whether LLVM was built with enabled assertions (ON/OFF). # LLVM_INCLUDE_DIRS - Directory containing LLVM include files. # LLVM_IS_SHARED - Whether LLVM is going to be linked dynamically (ON) or statically (OFF). # LLVM_LDFLAGS - Linker flags to add when linking against LLVM # (includes -LLLVM_LIBRARY_DIRS). # LLVM_LIBRARIES - Full paths to the library files to link against. # LLVM_LIBRARY_DIRS - Directory containing LLVM libraries. # LLVM_NATIVE_ARCH - Backend corresponding to LLVM_HOST_TARGET, e.g., # X86 for x86_64 and i686 hosts. # LLVM_ROOT_DIR - The root directory of the LLVM installation. # llvm-config is searched for in ${LLVM_ROOT_DIR}/bin. # LLVM_TARGETS_TO_BUILD - List of built LLVM targets. # LLVM_VERSION_MAJOR - Major version of LLVM. # LLVM_VERSION_MINOR - Minor version of LLVM. # LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn). # LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0). # # Note: The variable names were chosen in conformance with the official CMake # guidelines, see ${CMAKE_ROOT}/Modules/readme.txt. 
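# Example usage, as a minimal sketch based on how this repository's top-level
# CMakeLists.txt calls the module (only variables documented above are used):
#
#   list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
#   find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu")
#   include_directories(${LLVM_INCLUDE_DIRS})
#   message(STATUS "Found LLVM ${LLVM_VERSION_STRING}")
#   # then link against ${LLVM_LIBRARIES}, adding ${LLVM_LDFLAGS} at link time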
# Try suffixed versions to pick up the newest LLVM install available on Debian # derivatives. # We also want an user-specified LLVM_ROOT_DIR to take precedence over the # system default locations such as /usr/local/bin. Executing find_program() # multiples times is the approach recommended in the docs. set(llvm_config_names llvm-config-6.0 llvm-config60 llvm-config) foreach(v RANGE 7 17) # names like llvm-config-7.0 llvm-config70 llvm-config-7 llvm-config-7-64 list(PREPEND llvm_config_names llvm-config-${v}.0 llvm-config${v}0 llvm-config-${v} llvm-config-${v}-64) endforeach() find_program(LLVM_CONFIG NAMES ${llvm_config_names} PATHS ${LLVM_ROOT_DIR}/bin NO_DEFAULT_PATH DOC "Path to llvm-config tool.") find_program(LLVM_CONFIG NAMES ${llvm_config_names}) if(APPLE) # extra fallbacks for MacPorts & Homebrew find_program(LLVM_CONFIG NAMES ${llvm_config_names} PATHS /opt/local/libexec/llvm-11/bin /opt/local/libexec/llvm-10/bin /opt/local/libexec/llvm-9.0/bin /opt/local/libexec/llvm-8.0/bin /opt/local/libexec/llvm-7.0/bin /opt/local/libexec/llvm-6.0/bin /opt/local/libexec/llvm/bin /usr/local/opt/llvm@11/bin /usr/local/opt/llvm@10/bin /usr/local/opt/llvm@9/bin /usr/local/opt/llvm@8/bin /usr/local/opt/llvm@7/bin /usr/local/opt/llvm@6/bin /usr/local/opt/llvm/bin NO_DEFAULT_PATH) endif() # Prints a warning/failure message depending on the required/quiet flags. Copied # from FindPackageHandleStandardArgs.cmake because it doesn't seem to be exposed. macro(_LLVM_FAIL _msg) if(LLVM_FIND_REQUIRED) message(FATAL_ERROR "${_msg}") else() if(NOT LLVM_FIND_QUIETLY) message(WARNING "${_msg}") endif() endif() endmacro() if(NOT LLVM_CONFIG) if(NOT LLVM_FIND_QUIETLY) _LLVM_FAIL("No LLVM installation (>= ${LLVM_FIND_VERSION}) found. Try manually setting the 'LLVM_ROOT_DIR' or 'LLVM_CONFIG' variables.") endif() else() macro(llvm_set var flag) if(LLVM_FIND_QUIETLY) set(_quiet_arg ERROR_QUIET) endif() set(result_code) execute_process( COMMAND ${LLVM_CONFIG} --link-static --${flag} RESULT_VARIABLE result_code OUTPUT_VARIABLE LLVM_${var} OUTPUT_STRIP_TRAILING_WHITESPACE ${_quiet_arg} ) if(result_code) _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") else() if(${ARGV2}) file(TO_CMAKE_PATH "${LLVM_${var}}" LLVM_${var}) endif() endif() endmacro() macro(llvm_set_libs var flag components) if(LLVM_FIND_QUIETLY) set(_quiet_arg ERROR_QUIET) endif() set(result_code) execute_process( COMMAND ${LLVM_CONFIG} --link-static --${flag} ${components} RESULT_VARIABLE result_code OUTPUT_VARIABLE tmplibs OUTPUT_STRIP_TRAILING_WHITESPACE ${_quiet_arg} ) if(result_code) _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") else() file(TO_CMAKE_PATH "${tmplibs}" tmplibs) string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_${var} ${tmplibs}) endif() endmacro() llvm_set(VERSION_STRING version) llvm_set(CXXFLAGS cxxflags) llvm_set(INCLUDE_DIRS includedir true) llvm_set(ROOT_DIR prefix true) llvm_set(ENABLE_ASSERTIONS assertion-mode) # The LLVM version string _may_ contain a git/svn suffix, so match only the x.y.z part string(REGEX MATCH "^[0-9]+[.][0-9]+[.][0-9]+" LLVM_VERSION_BASE_STRING "${LLVM_VERSION_STRING}") llvm_set(SHARED_MODE shared-mode) if(LLVM_SHARED_MODE STREQUAL "shared") set(LLVM_IS_SHARED ON) else() set(LLVM_IS_SHARED OFF) endif() llvm_set(LDFLAGS ldflags) llvm_set(SYSTEM_LIBS system-libs) string(REPLACE "\n" " " LLVM_LDFLAGS "${LLVM_LDFLAGS} ${LLVM_SYSTEM_LIBS}") if(APPLE) # unclear why/how this happens string(REPLACE "-llibxml2.tbd" "-lxml2" 
                   LLVM_LDFLAGS ${LLVM_LDFLAGS})
  endif()
  llvm_set(LIBRARY_DIRS libdir true)
  llvm_set_libs(LIBRARIES libfiles "${LLVM_FIND_COMPONENTS}")
  # LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0
  # but code for it is not in shared library
  if("${LLVM_FIND_COMPONENTS}" MATCHES "tablegen")
    if (NOT "${LLVM_LIBRARIES}" MATCHES "LLVMTableGen")
      set(LLVM_LIBRARIES "${LLVM_LIBRARIES};-lLLVMTableGen")
    endif()
  endif()
  llvm_set(CMAKEDIR cmakedir)
  llvm_set(TARGETS_TO_BUILD targets-built)
  string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD})

  # Parse LLVM_NATIVE_ARCH manually from LLVMConfig.cmake; including it leads to issues like
  # https://github.com/ldc-developers/ldc/issues/3079.
  file(STRINGS "${LLVM_CMAKEDIR}/LLVMConfig.cmake" LLVM_NATIVE_ARCH LIMIT_COUNT 1 REGEX "^set\\(LLVM_NATIVE_ARCH (.+)\\)$")
  string(REGEX MATCH "set\\(LLVM_NATIVE_ARCH (.+)\\)" LLVM_NATIVE_ARCH "${LLVM_NATIVE_ARCH}")
  set(LLVM_NATIVE_ARCH ${CMAKE_MATCH_1})
  message(STATUS "LLVM_NATIVE_ARCH: ${LLVM_NATIVE_ARCH}")

  # On CMake builds of LLVM, the output of llvm-config --cxxflags does not
  # include -fno-rtti, leading to linker errors. Be sure to add it.
  if(NOT MSVC AND (CMAKE_COMPILER_IS_GNUCXX OR (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")))
    if(NOT ${LLVM_CXXFLAGS} MATCHES "-fno-rtti")
      set(LLVM_CXXFLAGS "${LLVM_CXXFLAGS} -fno-rtti")
    endif()
  endif()

  # Remove some clang-specific flags for gcc.
  if(CMAKE_COMPILER_IS_GNUCXX)
    string(REPLACE "-Wcovered-switch-default " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
    string(REPLACE "-Wstring-conversion " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
    string(REPLACE "-fcolor-diagnostics " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
    # this requires more recent gcc versions (not supported by 4.9)
    string(REPLACE "-Werror=unguarded-availability-new " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
  endif()

  # Remove gcc-specific flags for clang.
  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
    string(REPLACE "-Wno-maybe-uninitialized " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
  endif()

  string(REGEX REPLACE "([0-9]+).*" "\\1" LLVM_VERSION_MAJOR "${LLVM_VERSION_STRING}" )
  string(REGEX REPLACE "[0-9]+\\.([0-9]+).*[A-Za-z]*" "\\1" LLVM_VERSION_MINOR "${LLVM_VERSION_STRING}" )

  if (${LLVM_VERSION_STRING} VERSION_LESS ${LLVM_FIND_VERSION})
    _LLVM_FAIL("Unsupported LLVM version ${LLVM_VERSION_STRING} found (${LLVM_CONFIG}). At least version ${LLVM_FIND_VERSION} is required. You can also set variables 'LLVM_ROOT_DIR' or 'LLVM_CONFIG' to use a different LLVM installation.")
  endif()
endif()

# Use the default CMake facilities for handling QUIET/REQUIRED.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LLVM
    REQUIRED_VARS LLVM_ROOT_DIR
    VERSION_VAR LLVM_VERSION_STRING)

triton-2.0.0/docs/Makefile

# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = Triton
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

triton-2.0.0/docs/_templates/versions.html

{%- if current_version %}
Other Versions v: {{ current_version.name }}
{%- if versions.tags %}
Tags
{%- for item in versions.tags %}
{{ item.name }}
{%- endfor %}
{%- endif %} {%- if versions.branches %}
Branches
{%- for item in versions.branches %}
{{ item.name }}
{%- endfor %}
{%- endif %}
{%- endif %}

triton-2.0.0/docs/conf.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Triton documentation build configuration file, created by
# sphinx-quickstart on Mon Feb 10 01:19:09 2020.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

# -- General configuration ------------------------------------------------


def process_sig(app, what, name, obj, options, signature, return_annotation):
    if signature and '_builder' in signature:
        signature = signature.split('_builder')[0] + ")"
    return (signature, return_annotation)


def setup(app):
    """Customize function args retrieving to get args under decorator."""
    import sphinx
    import os
    app.connect("autodoc-process-signature", process_sig)
    os.system("pip install -e ../python")

    def forward_jit_fn(func):
        old = func

        def wrapped(obj, **kwargs):
            import triton
            if isinstance(obj, triton.code_gen.JITFunction):
                obj = obj.fn
            return old(obj)

        return wrapped

    old_documenter = sphinx.ext.autosummary.get_documenter

    def documenter(app, obj, parent):
        import triton
        if isinstance(obj, triton.code_gen.JITFunction):
            obj = obj.fn
        return old_documenter(app, obj, parent)

    sphinx.ext.autosummary.get_documenter = documenter
    sphinx.util.inspect.unwrap_all = forward_jit_fn(sphinx.util.inspect.unwrap_all)
    sphinx.util.inspect.signature = forward_jit_fn(sphinx.util.inspect.signature)
    sphinx.util.inspect.object_description = forward_jit_fn(sphinx.util.inspect.object_description)


# Auto Doc
import sys
import os
sys.path.insert(0, os.path.abspath('../python/'))
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.autosummary',
              'sphinx.ext.coverage', 'sphinx.ext.napoleon', 'sphinx_multiversion']
autosummary_generate = True

# versioning config
smv_tag_whitelist = r'^(v1.1.2)$'
smv_branch_whitelist = r'^master$'
smv_remote_whitelist = None
smv_released_pattern = r'^tags/.*$'
smv_outputdir_format = '{ref.name}'
smv_prefer_remote_refs = False

# Sphinx gallery
extensions += ['sphinx_gallery.gen_gallery']
from sphinx_gallery.sorting import FileNameSortKey
sphinx_gallery_conf = {
    'examples_dirs': '../python/tutorials/',
    'gallery_dirs': 'getting-started/tutorials',
    'filename_pattern': '',
    'ignore_pattern': r'__init__\.py',
    'within_subsection_order': FileNameSortKey,
    'reference_url': {
        'sphinx_gallery': None,
    }
}

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
html_sidebars = {
    '**': [
        '_templates/versions.html',
    ],
}

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'Triton'
copyright = '2020, Philippe Tillet'
author = 'Philippe Tillet'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = ''
# The full version, including alpha/beta/rc tags.
release = ''

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False

# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
import sphinx_rtd_theme

html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = [
    'css/custom.css',
]

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
html_sidebars = {
    '**': [
        'relations.html',  # needs 'show_related': True theme option to display
        'searchbox.html',
    ]
}

# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'Tritondoc'

# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'Triton.tex', 'Triton Documentation',
     'Philippe Tillet', 'manual'),
]

# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, 'triton', 'Triton Documentation', [author], 1)]

# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files.
# List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'Triton', 'Triton Documentation', author, 'Triton',
     'One line description of project.', 'Miscellaneous'),
]

triton-2.0.0/docs/getting-started/installation.rst

==============
Installation
==============

---------------------
Binary Distributions
---------------------

You can install the latest stable release of Triton from pip:

.. code-block:: bash

      pip install triton

Binary wheels are available for CPython 3.6-3.9 and PyPy 3.6-3.7.

And the latest nightly release:

.. code-block:: bash

      pip install -U --pre triton

--------------
From Source
--------------

+++++++++++++++
Python Package
+++++++++++++++

You can install the Python package from source by running the following commands:

.. code-block:: bash

      git clone https://github.com/openai/triton.git;
      cd triton/python;
      pip install cmake; # build time dependency
      pip install -e .

Note that, if llvm-11 is not present on your system, the setup.py script will download the official LLVM11 static libraries and link against them.

You can then test your installation by running the unit tests:

.. code-block:: bash

      pip install -e '.[tests]'
      pytest -vs test/unit/

and the benchmarks:

.. code-block:: bash

      cd bench/
      python -m run --with-plots --result-dir /tmp/triton-bench

triton-2.0.0/docs/getting-started/tutorials/grouped_vs_row_major_ordering.png

[Binary PNG image, a 958x1346 screenshot according to its embedded XMP metadata, titled "grouped vs. row-major ordering"; raw image data omitted.]
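The figure above belongs to the tutorials rendered into this directory and contrasts plain row-major program ordering with grouped ("super-grouped") ordering, in which blocks are launched a few rows at a time so that data already resident in L2 is reused before being evicted. A minimal sketch of that re-mapping, assuming the tutorial-style names `num_pid_m`, `num_pid_n` and `GROUP_SIZE_M` (none of which are defined in this file):

.. code-block:: python

      def grouped_program_order(pid, num_pid_m, num_pid_n, GROUP_SIZE_M):
          # Map a linear program id onto a (pid_m, pid_n) block coordinate so that
          # up to GROUP_SIZE_M row-blocks are visited before advancing to the next
          # column-block, instead of sweeping an entire row of blocks first.
          num_pid_in_group = GROUP_SIZE_M * num_pid_n
          group_id = pid // num_pid_in_group
          first_pid_m = group_id * GROUP_SIZE_M
          # The last group may contain fewer than GROUP_SIZE_M row-blocks.
          group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
          pid_m = first_pid_m + (pid % group_size_m)
          pid_n = (pid % num_pid_in_group) // group_size_m
          return pid_m, pid_n

For example, with a 4x4 grid of blocks and ``GROUP_SIZE_M=2``, program ids 0-3 map to (0,0), (1,0), (0,1), (1,1): the launch sweeps down a two-row group before moving right.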
DX$*RG KZLOmt2˦P6'ʏt6,f1gJs2dʵ4L,5nYK]ňּꡔ3N +BO Zړ&>`+4r-z-O*I +'ȶbj^ @fXisV?O+85t{[X{gJ&6_/\ 2g?P&Chmi>/B[)d*X͂1TTaٳ-~-o=w+/)UUB_P _=/=DЈ.?ZCv*kAt}?6@S4(r EzRCAxʙd%,wيּQW2RMJ82=~E޾W[~hͫN9;j W$oΨ}SC}')'ek9tv781{/D_7z zlx.B00V/ J:#Lc템3>EӔ~ĎK;78z"ק0\:2{ís\/&4N߫-$XH~MkfK;hgL &M~и=oQІjվsw%22_m3@d:"aj+ξυPV3B8#ZA.K"ٹ|Vf?"t>oȟL4b[Kϟ~.uCwؿ^Ԧ]-@(2{acOE4PT|7L<5a>m6l55kQpCѶ-OU(Sgb˟"f{KXEM՗0,z!/ W08AMqF0t.^xX~j~j<݋#\EP^< zT>L+t-ߋ| lo?_ hY}W2E `{?~CKC=  C!#@p^}N,9҅bnп`ZrF(ϐ2Px(UeK_2X8?hKdRɥ*sBFm<5LaG_B2:.kB2?Xۑ_/{S9Ë?D ?=@}FO(|VI42̰DZ4^v,ZWx/ed>#j?HSi|ڭd>· 3gg Z:keGpt؀x3- OtŶ)߷P9B:)BL0?O(CxiU?/H՞=r uxp4- jN#0jx3DfdLLšOXb) z]Sor?&Ḙ[!AG~MJLXHB#$` ^)MJLAq_b,Kg ܘUbb?2s- Grs-yk<ny9hwf-֪&8J^;qYZ)eMm Ja3?a!<5Z/+?w B??%?oG.C$1k?ia=Dw?,*b;?_kҭM~gYR`y]ZڤKsYțMJi_r?~H_r?&X jĐĴC,+ڤ+K?-ɗni|b衾g۳›s6:>D=("-ncˌSLL(=eF0AMM2hpBh,3I00Yhh` TFÏdM&F@-BZ5:rs;27êįF* W3"r-v$!2ڂ?%ȗMwkZ2vl2CT;[E$Qeͧ+.aĒ?w$JYvO Bo%C2^on~oȟxN'{b*CȔϥߧ/-ڢL+A&?Q~2PGly[u5!tTo1"` @$ՔG,y3IQz,A'Br9zLL(=eF0O W2diL!3li/t/Pzʌ`f鯹ƛ풢+(Ɏ`N?+4# sO%+k`_ȳ3ea }wsf6(S*͗@:Lmet.B̬DzXQ:\-a~a4i(nzƥr%=63D;vyM{mS?CCn}K{}[H3A|9|\^7TCؓmCZuIv?s _},/ _9sΙ\ W5l/+?-u%=g_[("ym=ͪ;/~J5R|S[UҘke ,^RէHU[<9/~1C8\k!)6j?JJg:rG(kZ{=bY/8z7}'Bs,$+'7WsZ32D;<~ϥgXyBL]!Ω?Bh^lxȧTؒ"%L;| "(JvR奀K@0ocX\W“t&<dTZ[aUp$E`'[<#m} ׺5P8w\v=@CnWGyȵ4T‘IOt@é?/a=(\eb@}_Oew>T2|h>"En +2j-r"<{C'mGqdO@~[Hve0ś՞ۑ/ kߢ{Y]>}7 [\PIb3EJoھop=5JBVN'dγsO-@&R%GCޜۣl f6o| j5nl¯ZK\uҦk67%WsX+v$i[sX8k.KbU>7"c%#9Gmeg/ۂ~.R˿tV/փX|)b.Þ ? OFV$^n-`G̀*}6*DCcZL_gB$+)8{*]c!4^>Y Tc9CÓ L(7$%gjBpI6[lZkUtc`-urGɕUQ$[o+iDJ`ajYx !>'>1%awaIAqmw3է2-Y)"ϵ_# 9 &10nC.7*<~A+pbӨ/+t%:\c.2]6ز9˒@Ʊ_V7 ;yvk7o_}E.05~l[\!1c8DpsG(2DZ'HdDX 9\ f[:~k ێ<~kzr<7h<NjCÚCg\"ώ#P͋"3IgYxpd9˒ʝ"C;2?0skh9(.rgcq **mw)J2ŧRL C#YS)ϨJ+ziD(n( +)0<Gbyk)ruMEzƅPn=G\Yh2O#-O$CyqlW[mG# pL?.5U[eIx1(`w!m]}s"ʘzHG/cp[ϥ/.uGm\=(^ j|GQҿ4))XA1bQ dnY7hlo97Joc;> C)݁| 77>?Yfѳa细q.]Jav cG(;>4k 3B"D)NI >GhShU*=fw0 Q6-XX C٭gOV4 MQ DCWJZb 8f uK"zP=-i-s+_ΨIalbG4lt-FX~M]auUoeId &Ir sLJ,Gf>:mԗ?Q3ǎK\F5g;&k:fn{_g[aⷅn7'kB/(?.4v !yc(1%nAjVwT?P_LG< ',<S?i&\|);K}/*{Zn7C{y3sv_åRUe z 6/aG-:1S9aIfƟ0yY,ng4\Jë/Q.Wd.chQ'ttBRQ7;ӾZʡސ? '?O6cW(2\n,w|0 ̓@)~`>z'x?pebeL寉 pU+aA g$^a/; CpWdkA]a:jC+P 9'[q3-Яk m ;=X&PEn5k;΀(fI#W`I{Cn ;Em!ek!1K= [ zzd1_o*g0Ojwx2S?$g k3S1lI,vu~Tl=׫xMA$I̙c/|̑ ڒ ùP:2#3.l ˀ-ّ[p樁,NsА?p}ߐ*.S)C4?:3ڛ߰"w4Ӻ`?|B(XT/#fO;Xj7 (.kfYm=Nh>?Kjf>Mf~*v5k7}m(~+w8`ȟ9Ƈ˿g kt[4.W34ףM@;=v>#o ˀy\O{ܟI 8S8g# dpӧx#4` ˀ#k"gXs>A)#V9jtF(_|Vҧ+"ђT}θ89QWXʧT%Xy}yqHkM\"fςw~k<΃pmj,MJm[j\"<>vs+99gndF,yn[Z'Ĩ!`Fye;vyfR1O?SwVdr9MrR~,h$b 'Θp+ nHb:#faKo^+CGW,_zĐs9n1\gį s%OBRN5߳}7op1\ۖ_.RKjC6C?˒PCw?uBf&3iH3BK]V :jqePO79pYP_ aZSouM6^ryq:}_ vϖyK63rXZG7OmLgz][ D,y}yqHkRWgL%0_=W(Lgs^_瓉*sXpg:㎿ڂf%ɬTVNR1 u\!ŋ\l4+64we`}_8X ` ;z*>|P@L4Z 1|9ϵ?ק b |YqGcq.w SQ耀|*-!xs-i)egfՊ]MWbu o;`'l` &ÇrP4 ytK7Ǣ ?[tG m;PkOq@vC6/\)FlF*\!9`_[7]߿h/\wmq s=sSݯVvt/NJM}0r&ԅ_dz&|P}*:-GI|r A5UL·Ώx!C!w4am"ğae:ގFBh#x&搎o_$JH}m.ak3?/AWHw{(> @31tiyޙ3edG$ jCfk߼UcD"P-r*/o(FCVQ;ϘUr)%")tƳ_8 _ʤ Lb\e,KdoTcpʷB(^(%ڀAsZBxBh32b%DWIdjytvD!< az@S/[ɿ!=?dqSRC)u4[ƋΚ$ ?YY hXocT-/NvӏTQ?ĺ܏C zM qIL׭h aj5d 1?oegwb!nWukM&s٫ ,-/#r_>N آ1Z@`X]tI3fA~hvY/#cӿ8c,B2ݮ?Z!ҌȟʮJ]?*BF俾@s=NuT"E01THآOύ_5\#K^C .Pm9@=)洱Zh` +$,6#{_@۟Czcܥ+SZJt$gPCVՌ kP~!Xq'b/C!j4PfmQW TQARV</(s!-Kٖ<-w3L-C+>S8'p|Z[[nئZ9)j$ogA!R)esx_/Ɲ̣"/]pS3f1Be]XUⲝ #tSpٟk3B.//E-<"hm Psn?%HB]zЧB9;ij'кy'鷅cѿDzPus? 
>o_ߥ˟.K]%;ӹbƏn19>盝kavFHN.h>Z9)m,w3LT  h/kM: J $O]sOޖ9#t&e5߭3ZK?'Euؿj;M4AźS~ S.+Ta 3]o,Ҁ䩄í[nnck XzjQ.gf =f%IDAT֕5L=_9P +ᄓz[Ϻ_uf\b[Ծпtm~nn% W_?/!KWvC4 m|1.]>CvӺ81koIs7^6zUE<-ټdq&wk }F~oտC*Ùg鄰J|VzO&ԥ9B/ 3ŒjTsu^2'lmkQ3ߥmk!`&BWu<~* P/[=a-4ŖvLҲ6E @ ~.tC7iZ<f\bצOwk9tww$D>䅿/SȹH;%'q%-rFP +$?/BPQ  _;P<3G}z3.xb0ǣ?aǿ;,$]3Zno@y;.]P $ %"K3?\sChΉ40}S2ф}oSduef;BϏ~j]AK'p-۝~Vk[i#P ǥ ?ՐXyS{ Y(vkKB=ZVi<2\t>#FTxI+Yg,1fE&1m6QCj\[$dQDO\Z%L'PLբmJ*l " N\lʂ;Aߡ,b p[$dƵ?Nϣ5 )'3~g uڨ;1~+;B=u|-+ȰZX RYj?j% 60j(4B=~e>/EΉfeD(>kZi7o7Cz%u, ~C6ߧ69!"V.QM6[H9UY~eI(27 YP/~-#6923e̞RKWێ?K]>j]$Q+؎l/ŧoG%)}ͻfiz,`XݬD& 2OFm. SUݼí(!~.B  ?jŎURG y ;gCsR)A!gCȠks/ui 'av~aOG`*l8$mI?X$W. gR[|c.#j4A_߰ s g.;iRiD 미\0'&c➞^M0fҔjn񩥴&>.f d|Hyĝ͍-a O@rB_{d΢HLRlP1 ?CbV=?¸Ҽ)4wJ|"F$'L% Qg|(uAdb}hvD 񐿮&, SsíY-*jt+hG/weIiA#M Gjd_ݾH q\1^Ʌ-uZ57BH87~6ecfABPfǢ , `joYh;2ſq.?,Cy:#t aLsP~sQ-6 &Tp* zs.PߴPo -٩ r>č?_P)告XJNX Gƙ~,!LVC&,d(rr·g.y7]KŎ;ⷉǤ<,B҈5c]|FPMp6:G8ڔTҀY`ySB'R,;~X"T:XC+pOm?xUο87cԓOۮ?5 YDz>q?M]KM6~*Hs3eeM7*Є_g ¸@Swm($_)X|.>6 IX0 ,5g־!&<-y;>Og q?U_WJV`e $71Cf+ES ;);jH⿽TԎ(]18nC]gY&mG(dXw8k[vݝwY-@S3!KO6S&B1@$fiykEJn6o=k*+u0[[hT=U:i]hXَ2̇q)WIlp!4"P^>^BR*fȟ ~#s!`X$ Z?C{[K}w( sNta!P9LSE?9)+5"3ɮ v&I29-C)gV菅PһZK LS W-`eN R'SrO$ ay_bg/1/5%pwSL>Pkx//%sGeIeLt 5Q\IJ=du$X͂<^!ȣB[ZE ^VT`q51I%)58nG~TԐ}!-v.XFc5xճ}y :/z[1X纈X$(%|N;Q^-MsVi4%k@=c,4i1M Pf"W>M$E4GU 4N߫-$ֿ)iJf3hgL &MKPOF7!ZaG(?J/_&13GwI 4]T|ٟ3*xFS_[G?}B-ΠeSBv_9 ogCg}ߥO?D; {/1;{|1+Qb)i72kۧ߿JO> @}oS_0诗ǣӆMS~; n_?ڶ? e LtӝTl{vi LE//vaB}YzW4z0By[~j~j<݋J=,"4x ,Q|0 毤{6ӵP|/g6Z( ͭ RXd.W~H ۼx*GXQXq{ vpb1.;wE\U?ׂ3By֐C/[z0N"gѯA[*N.}mPU=P6hmf 38I\H&Yǥ \H_gk;Pkeo2?uxSh>O_~VH*\?RF8P u̔~YM8 e09z-5[|Afou,ˎh؀v7D,Q|} #/{Cً?䯊gV%kQ>qOƓz, z1YR#ͽc-*_3x|hp,סшcf80:7OB;ű1?p6_=c9ˣ D4> M&sJO)r瘝K& +?zRTŲt=Z1f=~jx4?ow؟ax̝K?FI*3`g 3x0ofh~_2޿A1)>3Gğ5޿WUk/H+aJ8g58fׁc$Sd9n1:T>`v_GY W[C6:;΁\aj̴g%`cȿ7˜{ZTFBQ&yR)Bi\B(5a@TD^uw @"I4^}w͆5/t]&־]qkg? =Hjh^$N?_OCE>}BK<t{4SXP:?= B9Y]3!8waꚃdr[̙sc:_3*c`-'YӐf R:QQFWs\T6Gs%Eɚ$Ε*J6 "ޡ4%s ʑ?sF⸧yS R_1kX Hg? C5wrA\0LBC!BQ V2u=?Y_?X]RJ'd[~[F^抽mڭvVO5-N42R_en~S`V?G5OfHJ{u/I\N%S$/[]tMGfo_?GsG(`At/y@@;3=Ǿ-t<]kc=_tq?Τ%F+XMVdKm^k1?_Wrlm$_*<áq[7&BXI0FQ+РR+wN!Q{a!0IŕǴ~Q7Vs ϯwנ7.m+"){Њm?_kd?ÊkЖ? #믵gpoG"__T#⟻-u>/1_;aXM2Jo|@\ #7ףiXϭ9'U5ɿ{l[R?Xd?cIrS&p{ñxPW73dO}wS|^egݩ=7$츪g?\*C%'wP?Ę%y2HIC-2}GAנ1R5ؒfuw<,Ʀu}&73\u ɡڿЦMЭ2wO zқ)6]"גN9sA[P?vȒ[ݚ#w{_"wf6%y`1m(M~~VXpB,d N?^4_3pQ5\ytgxFw[.Q)eéVKʿ/OUcbqhƤyz+Xٞ|>R#`q,9ƙڿƴcߙXvXh_3cȿd>΢PUPn_?#߭6ܤ?4`XqwI#ެ=/'>#UA2M~eg9S_2pP3+6]_2%Cry?̼p(A.Xc%;$wɿbmS˿ЂoclJK_?G?PmC'!^m| C/;g}S~6Qmݡ! أx*N50Ӵ?;w?_à3IR N+䟘BB-V_/>FW Q=$9Vpf;%6@ l_ؔn=Mdcc6Yn3|G!;OQB?$W6aa2[V!Z9$+F6:T*)j0,FEo m%^q +L[+t&I=Mg`c"en䮒C݉6ۯP۲;|z诮k_α>[`lXzȿ&vj-X0Z*O "GͿO\+WL lY1̿Ot;jS)Yot Ͱ[:3Iftm]߳}8}xixcWL8P 8g?BwlN5KW!ʿ@$Oq厘CAL UK"_8k2}H!վI+~7FH~/bM7Az~\,~?MvY1#FM,Gn7αV?WsBP?՟L[Oٲ^r4G++$swƬP}eL㖓k<:dzkks#y~\ΣM8d޳dfcLu2IܕCGKkx#VqLLcO=튵6',?_K)+RDP?#'1ȏxjy4[,`;x4>RW 8 zj([?K4{6<ǿ' k_WX7{O%lx[nG9B`[!cDST7?d@IDAT[|_U$eLIvBr4`0t,y-捲$1#? os#GIOAN_7ŽEhdʍ ,2*$ILIT׸3z(=_NEDpA¥a%otb%,',HڐpW ?"? WXVKHY^?t*G(56Zc8Es2ae*RL*&C}xjUJu⑯9t;b>_8  ^E+A 6}Cgm'oȟ#Vi<ƍ477z۰qC,P8O^tUi!;v: 8nm44yl 3oȟ kt ;\ ijzt^9Ӛ5kV >p }Ow7GmǎMNNUnCҩ'@֗U~Yw5iae 'w]Y?BxE!-'k)dkU~#ykNUk*\>/c^2 *CD{΢EY~9Q01EYNf? 
SX@ k2>6w((|bdRDh.J)=R7w)zRBUq#f7}пDt ?GHp&OcR"Bx=/_?z߲}UݮFN6w>)K5$cd͗37׬+|GzN5j=O?CGبANߐ?![tltz=i۶mK===kҹ+]|"Cךo‰HGE>aDڟ_3sͦ?37nѱѤU?ۻz.cfF93ЖgM<3iSK/%4?Nݳ':r2щBX_hL?Yτrb -Gh迡SX_ /߰=-?-tSAx %PNRxyT)nN;rt;z~`7ն7 *!2~7O'G8aVΜ=lrsNZSHSSI.ݻ҉'"|L.MɉԴ_ya-3sJ`f܅s… +:|Io@tmv9|$MHO[o4읚>:WNhsbQɢb_;R`vvF]jzM7?~"B7Kg|}2]d[pv&z6%<=kmA&̥-[6[$*ǭovϝO7O__̬Ew 0|^.wJ;1!:S;;?ŐvCilݤ9h/ꛖ\w=|:rH:6>.]^g-By#o7ُ /Xn["MPwnC)YWҖoS;WϛX!7J?<9-G( 1bZS:ḱ|NꚀ{uW2J X?=i ?CL ?Cd4O5^UFSZ?33+Ǜ&RN2ix#S8j)mXAA4=|T+#Fp6f9KG~k߭HZfvJO4v+%gNgܽkWz\A{Ξ9V^)Jh:ςR<{ EH[#jpd{$Uz-̤I_/]שTp {rV oʑXϭwn]/l֦mTQgt,9fn3yKfE gz8v(ޕۻvPM cJTvAZ:ϳ8*8OG#~?/F࿦7 B:VחpRp@QT4ιB|BB86oyiBQCu:Ǐ;tÊܻw;N3=h)Y׿l1"ha9 h krΖwԖr,oڗ^[$--M1?^{i-3+#ʙTr2s4͛7qٳg55SyI)W#g[H;)b/禦}OO]nҿv> "BdZhL]')PCrٷW_=9ds[^iL}uF[e(Q"Ka$/#Q'DNg ?W@t4:Sؿa3b sdO2ňLfB>:sVߡȪw7MDl)6L OѮuxUJްϧ=yyj _|F.:}7s w)"BlSDSHǃr`N]yAF4j!*D%"d1z4$2[䪤͋rh&N<(־K9Eug#[4{p+8ģM:Ł5%)Us{WOHGsʳpdz|B+O&}*8JQ5"as4/oʹ5(8Nc1|ЦS9j^bx #,`gGyS,mJKr'*oE}Xsӎ٩-Q=~*O}1ѵ8B `݂rdB>u蟡ӏ(B ap7O[);d3a&G(4gѽn3#X=SM3^{̛ݸ2]UŞ3 U2iro'oB_x"?l?#TS9S=qM%G\-$,؜fڝTT!N~q~&߅êD\ZNM&%N3l9"U>No>yM 9:iYQ#>WN3eJ:U mѦ|/PSS'(G U;<0r 5j^M!:3fտЦ;h>.$hrKqrPoPMWǙ[Uī,|h,5sGnrg%o7o_zQw# q>p7B:8.*^yOv6Mi=D0)G2-M q7|E;|vy8Blk8L^_?CH"RCuԡCϛ9e ?n,dG"BqUQ| X&1ŽhLmP%RQ8p ;锜k%xI֔.1S@TTR2yh>3wh#r\c9#r}lA:||vNn>uKsGc˗/ߥ[pm&gGaXl蟧|6i/''^ 9ۯ+Wȥ$mPfCuҞ{jÌ-y;SsTcuw.rzxI|/[ c @-ퟙUD YL--Ҍ ?aoxz// 3avП!aڷsq4a[[_[a/GkPFM:54k}'Xe2_d bYʮ@$U*<[ ^U! &9/!5(ZbvϨ7XL5gB45KqY1^EΦgk!RZ#NjmkZpy~O-棕h$/wPKgѿ^ОT9A,df48"' 9d5ݻ'|j_=iٚgŐXvf.;Ntttt~Bj5v>4kp0QwJ.\ZוYٴI"B-GRS%뿣Z o)%jL/v#TDBs37,w+8ٶO5tmoS4p\1"aJZ/th_<7Ãʍ;">]i*(Za*w:CŏP6{ ˻u'oSg +ԠqּJ'Y- oS࿲"B::?kjtgZݻO9.ְШr*( 'hZ'UPDRۈw+B)CUo"ɧY!;tk<\I9BYX^9g+kI{նR"WQb6YD(oxu[K_G{;G5g_i 4vݺ"7>c?p`O*kyI߫JPBDD}wک\j3tpZt0w)jqN޴Zf;9Bǭ7o=8l'Ϝվ-Jz7~[QB A2qkZ)OuTS'Zx9B-LFMX윳5M_ F9rB]k|nqgdOqxgߐ?!]@N!Y)Е?Ν 'n WzsZ9t GvL)| Pk1SN9]֯E6y)"9vkѠ#ܝZxn Mha[v nyE 9@QS%AՆKz&u@)9=)y@r3ukv,Cw͚n[suWpsP+8Zq."Q'_;q|]\ТLHL=z1ܫrQm˂Q`ؒN@ڗfl_ݶ~޾5JO0)g0mojU>Vo,0-X PnoE^آo"'9S~(78}z"ݾ-|$9BCFBCS م8L*dleCu_?4ӁKQG9е]!9O]}FIѹ݈p;Y=_hC#A }"V񉐿˶RO}<$UA`ՊN{r.ÔxN~9SoH^ι3gH-WVs/ 9lhW3,j%^mbrINȁVY}rQw]֯t>~EN*5bDzWߩz2$p>V_.|P_sp.xH~ZM^,^eݘָx4GZU #lK%ћkl+jvsrmP_zw,D#~S zF W/^CvBwD* H/A(VV _ETQs6#J? &cP]:t^:oդ'Yj ͼ1CǧxDӠϝ?oJOm8(v4E.qb!R p0=9P;)]i3Sϭw8KN3 }H_s$ ix{^>S4 Lga*ݬh q0R))Ss"1'GlGFTbڳ"6ϟ?gg='upݙنr1%pjșWW]v%XHhDRmY65^5 Y|(़nÇ:t>NVBiJ=g|>M++ZuZ>%5rTD訜%-o >,-|rv߻=C3i'g}Z(/w/za: O a Aqсns.rP]/ͥ:IH b 7ki\б:?8 CgQjKND3r c,"!ƐG>"YɽKJV{zMfa#s+Q_d?|rfZ}Nδa#+/5[UlJ8YQGν3%­.$G.w?KD٣hK۴41,D^z]0I_"j^4 ZjWr/'+rR\f_-''_pddؿX`6 -_C! TҟFcFJ72eYߗ-Jw*sNZ=^+~e20#8dOt_ x6Q,cC?!kGVZqEɔt>eېτqJj+@;Zi"k,E$k/[:}"?C|g{R6QFj|YPEʱ)C@[VU:ɡJu/h:*Qٞ+3@L Q 'ouj/MƏΡ&+Kr"BMi ?pݬE(,Wjsq\aR0Giߍ.W_y-f #쏰?CYӄ-zf4"j\ rQLH脭4: YT7X.X~"1 Wi䥿nW' +HAȟ"B&kG7?#*ǕK '~*3H:ag=M EǿpN۹g{?䟳c#ͦ#,C1=B%m׉*̈́_jUMy1 V (aGu4Jߦ.T!&O?צM4ё=ok3\ -uMymxO7ߘۭ4?!߳8=Eȅ=`F MnfWꈈ:u%[J3vEȿ,g!8]%R'?'Q 3)MSk>jQ?/_JP +G_+I' 3ﺝ$Qɶk +Ϧ>wa?7S.QR}.Ȉl #;dPHdpG%g-9TSvɋb>/O_dC5!BW蟡Gz'aY;x8^%5$73.kLKsV?Y.bp\yMjŬVmBVP<u+V?ĵ/\tA7]FRR( 7O G_.#B̂6yY8FzWߡCtUmBJ+jPRIY7蟍UgEhBIxGomlo^Tda?A 7п\mACa3?-¨ 3o7;h'/bQS*fQW"곉XG(QLVI\YlJ 7.žn BOEǜ a"O_e~S$ï?C# #"p01jyQD dG_uQZYtK18/gߡ-F+"ƵLNc)bFN:"9J~sUyFш{`0չzJRϨwQ~?sEsGeJw+ k4SuLo-.ADcdl%jnE SC #lw]@~aw-Œn*(t.cOA]J)Ւ;BCWgvYmmE>xzA'oȟz_) W؟aFm10 rǃWi5 stuW2J̋dJkNW? 
#deR_g5T$ k}YFߡcGڲwS C?:f#tLxL+p*~:ձU8\мV, \=:B *ߖvܙVw>k'_M3sә6ߘ !9-Rq%_j<|qoiv<77g[;(?~|IY4}XŽUantt|C:qtᗼVI?Dl;>7f2Ϲ3%|iϮr*Meɟ-6|un)/Юr?om/)Dx hj=bIdXǎ3tLwuo8@5r@Y~~ʕz[)|@@WK u8Gj#D58[?qz(rߏ#%j2D!-8r/[?/]Hg0۴EˠM7ɉ76_N.Z'E\RܧPwJ.cZ{ ?@A_bD?FPfߞ={,ktƍtty^קly /BoPѠ$[&or$"Fry\ND.IMp3í._yd"/~yJ2qMS(ݻkߗ>w{PoKi|wa[z-:ퟱU |=svRQ+ tT>vom @ɺQ:_Y[o]OωO[ ]{uZ[+<O= _NPtx`'ݼoȿfsF3 9سgܳ[h_r#SJN~VxQ F}??e9z%=Tʑu"GrVVuIin;l@\XpK)_p:vY^ ,0e9ψT$E]85YAE 2tzJa8^y[ Ԣ<%H_"\ero߿-0!ÎUp?aoQ?S ;M:g2t/#LO1k%Hi#&NOȸ2*% '^MQtt<+rZhz"$LhwZD93J.>-$өqxѻDXd?j˔?`a9[k?ܙ8x#ojM3hE* n˗ bI8 ~:~fl~(:,=tb/.2"p;lc>X 4VC z6q{@WP٦G_I,u(RڒR@IDATm/+ 7l`=)|Mp_Sz-H8q7d^JM^tPșY_x`gyLWtkm4EK%|;?uW>(tdʆj'*,sz 5F9&{V+v!c? O *7aa0O+O ͣXD2r#_Q4yX^6tY ǿ!# \pZ%"Gx鿬k? )="E6;/GABƗ{_O2-!E|5ߪ[A3-Ѧ#?259 4aݺ3Q$MMg`l]dyy멯oeNʏ 3ۖ{]Sݨ( /.U!yPNcޝ^"|ZH׉V42^_ Qƿ!|jvVs\S5c7^4!eylyfrX{i*8#S/sҦQߺԼ]"öV4PߤRM> hw%[WNCF96zG% |))حO mi9诱j|E9j4Rb\B9MB: B;VqJFoNkv\~tnAogozx ނz>Q$l"Rzwo+:tDH;d5RFnTiye勖ktMBiϻ1v/{[ # 1ʮWuW ϗ1' /KDhњ~| %\;g]2.rUd!ׯo]4UgyCNOYf@MZWS>kN$?@Md8,FXɔ l2S*~͚`#Kw~VYn)W휒i4yM'"iT,rH(E5:邏ÊV=ݭD/Gig굗ݻ/rZJG@!ǙJ)Ld&"G-8](ѝCn ;3 g[0 g4`0UQ۶n,f54x2d뉇^Lw(3 +^)[]|CX"W+W9Otūw+{AA/b]u+O\i6  ߼XJj!}a5@\.Z-Fغ]\߶( W" ~mPs2Ca_g޻ϦG؞ɓ/ }2vJX_8wAH3sgDPէ/EeM{SdiB:!Y1͞qep)<"\ZdIᏔ T!彙<#Gc0!ż.HRTǭTVdT[ѽT! j~;Wh}Zs{?*"<¹Ju{]9@ᕥT{3\eRV,2vXkg!'?w6lőe#a{!YGIQ:x/ӻefDi?5j-A!{*/gb % 6\SNƌO׿plnQJOˎgFdppRvþlg֠]>.c!'p2(y;({gSÀ- G 7G 5Ьɟq bhn4XذJ+| 9S y7}GDQ^S?&oS9cU!Bc2zС:vcӐ[Svij\? m&*Im NJ}0;ʰ*)˱AW?Fڪ}U4NuiYyV?9_mOIyA]6+ f,;R_FhCWWMy`5_v|_N =-hgcD c[*+7b<" Ⱥ~r`Qh򞑴|M$Gvv/N2ٽ ɹ>ȟ"a@U{q.ڣȴ]<"8xְ&dԬ_b:z dꒌBPXQpeQ2v Pi~EF"Y?W3p}V+88giSIM_z%AE&S2hIVE>sܜɿ͛?iѬ6]NI>=MQw 2< frܡL)ڗ5V5^b=>[? (v;2> j a?U&诉*S۱Y2K v'~K%#́3QxMM؋ )γ"-d 6jPhٓCt$.ϴ}] zSi?<ڑaO߿_,LH IoΏ,l>z-K-rsbK?I9k6f J\ ޛt6dο-0ۺǏC mh]rBGh BWd\b0rC7{gS4R-D+۟WE+׮IPTFdy:Ro'CLE`T*`A2kBR#u, -ytX1 zXWmXV}^[`=~*vʯ6yz M8M5؄|;o[uWO$HVE^1(~cT!#v`Z|wE"ēD{R:a< )/o͠hd~X\ѿnݪ>ߌ ʷmڿ[ M}M+yIP^|+oX1y=zVM|[5P5NI[pHH{wKT̘.aٷow[_;r~"iPD)modUr%T>_SggжJфâ?|oҭwn|Rׇ?~#Iχ ߞU??ck)h/S ٣ڴhuU2;z)8Q><O}Jѝz2E$d,E0x6+/{v? oF-f!)Kzoy [♔gj/ifѣG* G⡃mmҰeB#lEt Ӑ̙:( !Owh#=6&8j@O1=n3|7`"n-t\rH;<[bu+FNM4BuvwuuADrhHtM@Rm9쪇P=GuR<-ʋ|-,Dr0i״=S G4ήz'o(=/%;>~c''ByNW wT#"Oqd GJ *=4},If^7y Z5^5X#tJ`~ʂdD 0^ oCD\/C7o1$5 F.0<4]ߢi4h cTvpR0z{7Y];ڿc`-&eQ2'^Af׀ϷO~@/v,7m/h {wH,D׾xv9EY6YQ8̀a7ɈVSl" ݯhId3/[ڵ:S_Kf9y\t;u8X[v􇳓ZGr6K/ yɃͻoA! "Wsno?߭Nq>q@-!*9 LG٘ M@cHrz│_=fK/"AGB~폎7j~ =LfFyvr]FxTR\m|3R\ý; MO_=+?>Q2՛ kO|_WN81j$D>CZwzsۿnz1G:)cf?5EJ>-fL[v*#\p?c nIVD\A0x( 2rJ`UҸ8%żcRdNrΟK5)Ok\ =?0ɩ½G4Zh3BI^/T)_ hXyȡyF~[W͛.i0_/BZk$4SqY3߉PkA8B=#B"Νw;};ߵk?CxC({K_xj0[ff!±sTd[:)yN#D>Y&!DJtGAH9 /xX+Xz(#^ =%l@~gho 4h!qI\.OD?/`` l\3" |K%##pHG'?Y( S;6Ǭ3w*GeMeMoDM2mQm*t fbL޵?b }AU/'v?<;ﳧ{iUIWi>8Vda(yߪ+rpKguGm+l#˙%G5^??M OikVBsXQ.&&_WX6 N+vS9B@:#Ÿ>;5gV{t,={v)j]^),gysF6Fdb(g T"A#fKYؘ?1IPe_? bEC{->-olʥj>|9D⬝폼:wl9- ;UdfBo42Fa(t2Rݑ԰  VBA[~=_dr^oM!N?S?Gl_^e4Ve"|?ɉVF' 92Z)jRmoi[a0r(-c":J\yւ2x FNPeXNYavЙHQ"3#m迯|ٚ_rY@eVk/&۳{w׮iڅmY #|*0mD󌩇_|tң=[昗bHF K3-?@]yXG_~x6G&~>=/Ao?_ <.',Z80}~I׉*$e]]A7f9-a|N3e!8@U6l\;gv)G 6_^zɦܑ x>3 )F0ګpvka Vw]9ڰ6 87sRbDJݯWrv2F`L-8ɥ)ge;.Y zsU u|GzHӮR0y p~ @a ,^2U>qӢGH 9 `e/qV+'Nӗ*wGL??VwI (6ҷҪUүʯ}f^$F|u٩(IrNN)? 3,5d$4cx 3<,If0jG).[ż5xE$eLū߿8|:~wYϺ$bS-}4X?3W_IP [fWėJtm_tmNw ȟK,9ȟm}鋧H p4Q,)&-)Y >bQAJec?E7=ݪ^R[۵s~uv<{_OiiJE^F "L0D cS~!r2K|ݯ ?oo۳6n(gV/jZ>=wk4ѷ:]G |93"8XVCCkL.ȡM +/N?0D0? jӛ%+U ̔VS۵1GPrE)e/Q,h"YTE1-"eLaoGiMyfGv~nyi ^ojO!hrF(4~iiBg~nO_Ao~Y}Nݎfm#_|1ȲW2NMjV%`]&$&@"><銜^;WZU*k'm܃ᅬ#/grh@oN?y|,,DLe` 5S_"rUf}7dZ)8D#:hEd"Kf)=j`~޷Qn "Z/Ȑ+i6.(CZ(̉Yq?>NObb8, aQK:cezhm}]]! [[^Ї>dA5ٟw;JO?`A+@Lzgow !sIA%86XYE.3%+42Fmte܏ТLgFITsms! 
o 8)0@:q.zj%K?)8~^/y!ك#ԃm_ - 7;]:'ALsCv@z"}_ۍ͂3 \%>vۘEWkn?Oj.| R.-s'Dz/{ {8mo=$9/԰2ܗ0f"J*y2Q1bJ#uk55 E\/XwG.rwK![O*HmSEK_i?0^[E"4I]n |2(ߢLZ;ʍbq3i_R'•k2Z'G+QF`Գ Р`;f)ga${YFH1D)Ly2ˀo7Ǩ~ v܎V_xѧ@œ@RDWP:lo+߼fx'>oʹ?CV[V+ֽ/2gs]wͿs+]5E6MRyɑ^P"DMÈ#( hMt%~R`bϻ?NL ]P-ؔ?w{#~ߴ_%3AP ӷJ=GE{dRzeۆ**plN8C_͔k1Q翓D~dG;M^f׉DfUu#uX yO~/,e_"P%_?mG M*Z; }lL򬷿̊`~̮SoQz7Gy @ 9t a )setN5HCj>p,GKٿot;fxHOb!rJ v/ጣAq\;J{hJYa\S'H|G_8KP4>~gq%mS#nJk;`Z_WI!B1Z9N)GK^oS"O*:Y}lcZ,E4LRJJ]Jv2"ITr?ei&Fƾ0fa䥖h?ƒ#Tqbk}>`LvOoˈiNN->N"ϓI6yJqJ8[Ol!b%m۵{}{-}Fx&cBpUPR ijJ;wF-˔FxBjQlgV3(o?gDlٲ%@N,vg)Gߣg[Ů[?|/o;B#$29?}hO#e0t^J}[?vy"Y|ZЮ0SlOEI!Pe嚖5 ;qOcB=Ő6n֌?Lg= [h0VENbGB@m"I3z_OfYURN?yJڪ"|o"'qhϩ[rV4߿5E4iE:/FĜ[Ҋ:)m2oDBzte->+/䤦J0% Fڿ!C4ebv| 3_Wb#6 O HrPމ4YGS/QxS'^p$ Io9-g R.{I:SCOn>+nݶUlT_5jmL9{{,VGZ37µ߇mj<%9u=NJ'sƍyӦ?+CtcUOh O?/?oMZ-ߓ)ɕi' .l?C{3ldݬXW_U0]2sk3Xz"B#6 dCr;"Kb uCG'^3Mi,l%?69LO&ďd*ܩ٘󾸔!_E-$k%pTk_7w'Wïӟ] Bv)&Eؾo.f4@7wuNC0c(̏??NO8Y?W*؏X&H#M1_Z}ܕoJrN͟u[-E- ,WUTw)(|mU* `8ꆔdFR]VDh'qxQkRcǵVttm=8\oBJuM-A<$$ I19O]bIy P@$)([),-8F+_0 .# .FW86z1lTz9ucN+JVL Ѻ{Q'+EYOlt\G:E\?:#H^bgWL*;MLҧh^֋ k J@ N0=+AC?Eg)C yߥH~ޓKe1V"(RF9*/6oJ[?E*!eAZ=-Gel(l#OuZNG2gߏ/',%o;r ~K3^}7(),NT?Gm!;JAJj|IX b[P?rnE=X7yCt#=[0{!|f݆5#lLp> 6'(U9bNO18[n^K{q>MUi혇{ʭP;$oo~{:KOҟ?ժ:[EGx#<4&SyGrR[9$޾MF"(G[rɟ̾K~IW^Z_?Kۚ1erG>ma|y[wk?hA~J /^) SG'M{gV"k1ZEF7G:W<|0U&`6Xk<EUm[NA7@/-i۰) (߿l(X]KV/cL'M_>&G!Wp"̬&xq.1GlNEQ !sזH g?6[ZMmQaEVؤ]ؘ* '۴}t#8w)1SSo.)7G(g8xt wiSƜ}㛕 A|??, 1/lgZ}UۙĒp Q7G~vt6o\˿J_!:ƏI7xN`߯W?z>ƍ kxBh0R>3,CNXzojr 0Ub}zӍESU7ӫQLw" ї<*: ~࿢z3Gw~_7| >k9r@1-/LTA{%lnuLe*%7aٻOBV#~'K-ĖN&~~[kj<\a;-fWKM??dWlQhsNu:  h+yOgDA|I:?-l >9xvjY?S 5"OZ )IBYu[LZe#겛fN E}4G!gqr /ۭ4]_֟ euWrBD:t p6κ,>gnmƎYg~mc=hQw%bgf)Ga|6?ߛJƢ}`Ê~%ߵ

%A<`:p4Ε^e"Ć zRԨ2 bC~V۴ 羴|kZ 6?lVlPxwBh7C 'Q!F sTP%kdf!4<"=Zʤ [J%z|*/^1$o4+˦]BsQ^r'9|U/< pKѺ/w*VۥRBK,f< <>GK7ZF>Q&rA #iAP̽_9T԰ iB {XR}?xS?j^ǿ wTPå7hY#| 2 w42BٵsG^3%Q9]uQ-!$J8ug}5("e"ę_Xc_⩽a;Zқ$Tǿx*ԃ kZK[|*aOdĆWN/!$lIC/8p10lBf`Ϙy*iL2Zkdc,k&̘dn_&Óte4K1܀w%5ibSEhzL'L1ı=JfLV`sJg;pq^/1D *; VoXĎP'; Xf6UL^8 eRrtcA `&o[j|5\e<#1o gZp䳽( wj< M9kʆI 3&q -NkeӸ: _cRjVYڴS` ʟkW@'ttopF(&poy.dW|#|D 211&Na&̘-11x5K1܀{ߎwҬ/lA5K£i0>uYCX1?,M~ ,zx+`PAc8__zrhι+z.NF5!b- ,扤b^ z% K``A?yYUl,+,!UY{}Tַ{0 b^N׿(_AH"Ns5 JW+E{Ģ+n0եǟ"xɃl,˖ތ;N2쌴e"/[[#m $v\*dSt`Yw/x(~ ɤW-[1է&ĕT dٶ+(``A0K$bme"qSn!Wt5$򳮵"_e,`Q k l/>*H+}KGx.q^Q쇭YUz5uBu^C⽐ƚ>~IXPg>W [Rs/P>@QA|:I |=`V8D\Q%UŽd cepq*`a+S-#^ gn`^-f[c{HY+BƟ"_ǣ^$xr_k rG+fn.<Ԧl[z@!x?a!c9?c]ajZK>l~3l;|"2 U XOQJ -Hj #RnFb*:R;ˁ(+BQ%U~oz ZeZ (Ϻ=5| $CT F?z".*`a?TPQ%U =^MV%q`ȍQ%6'p?+*bUR>u ?_lT26sa{%_-U2vJY/Hm`+B+s,)?a,lu Ÿ`K~3!+ر וcM@UBߚՆm LU`bh|RǨ=)&'ޚe7"gRWzԧgGkok%F_ZleI3Tݫy_K>Z/ SW?"o;|5eg(b @aWaliABL00+(.4P |O0ڴhϳGιրXUIܿ=;-Ngo@B+q2(Od?Ԉ*,PP"|!&@j]^^CgƋX^ |!&uu@ej[ &pY]iBXo0Iƴ]XųKd`+]w?jR~!9Wiq93y))` ?8v#x2T AK_|AaWG(I;l_y6ņ&h+O[|3QZ.|rOm#OmG(_WA<#qVo[e+gUQ5NT(sXTsԌ8]Yon\˟Y2Sg}V٫;KMɘwZs={>nߩk{r>׺Ѓ5\K~Yo)9+kGY2CIlGHmC]XW3"pk|CKS6!O7[X5=!|O|׾O/`KG~`Zڤ醃l9Pi=oA"^"& jǡy\F^Z04Njb( Q_aXϭ|?`!NڿQ!l K"[R OX,@xkx3:s]kcЙcg+|,MNXVFx_X OTpݏHq 4H);BpzjY3]Kj k~՟U %_6W쯯ðz+Z& jcOoVxQP!!^B7>yCޚ-kh 9QJ-0ab7L;߃~+-ǹ1"tV?-+3͞ *$ s뱶wim$K ; ̈́3%K-$ƻ7x:g"L Oߜ(OpPL _ z"_j0DI@DJpWL )B;%?PkɈ#h3lK[T i0wu #vb$#.O //_ȍ'xH7O7 Wjx܈Ɉw{_up2ϼTrQV?N^?1_B(mu|Xx pE!JZL]50h/_ (gpr7GPZMsoOU 4&/ʍ؉xAv\B쫷#O5VW8s\^ȼW*jHF|糿B!m(?$oNd?]b_GSv" & b;c}x|_A jrs >+ĜEo75Nv{Q5Ynhn?OCKv ݟOx574ji%  ʝl#2^"fJG< e;3pyDC8A )yFh彖Β|֏mpӔk|$u_#ד_뼤:vd2\EۥYѠɌ>ce>bDŽ E)Gxs|W v5:l j ǠcLFN*Ak1F>$Kj6&_kY>$ CU PP_~Kë,6BEEKZom@ Ien ؎FykgjFw9G1$FqTل#קy4Π|-*XY~-?`'گFPg8fUe{"6 4}qOe# ^q @#F0oGm\VKI/|&wW֗58 |A"niIIp.O.lDQFv"Q^f_M$Ia>L)(38Jo3LY}, j[-#zgV}؅ڇE|dz yh$?2tϹgԟeߨP؟%>D~TUS?}'S{?9c7 e4lxg`?oVrUL7rF{.U~K~/Bxկ>#c02le ^п,O)$15`~q*r^`8?<ga5b&;辙f` ̂QC6gA9W8ac9oJxP:;s 3|1P֧ ZKCuE.K(LNd"hBێ_D7Xߕ%60Md"Wf 0VYي=f?nz[Md 32? `$_ay.PNPxLqJc* }LWo&E1J3!XՖ/45{Q!9~Xk__ϝ`J'?'x-4˲8ZLMD#>=|B(m[)$Ck%``f10~]L0\X |16cl%gaC۷xe>z*W- w/leѵ3TЗ1N6^ǎPTD {%G"QlrW*s]7!`͢& ]ŭWߏ]P|P|5\JW-sv-Mb|Mv[0To>K-jƅY+GϺy0_W# ,}5Kb` {'mHq9TE;YGt' |H.7jV㒁 QN]CWyFhs%_eu$D@IDATe[T,tz p+Zk?c0t~G/jR7;7eY%"ތ!%wmcF)ߩVc\쿥w~4CvPlQL+MLkw ÿ5Xˌ8P2sUl'ȏky>o5 o;4JDَZ@Wzw1!ZmfS/.\?PirnL2`(]xw<ŻDV2> c9f bY~{eQ݌@[ ?  Ab{?3>R~# Bk v[y\lѵiBq _Ikvaĸ@Be)#d5 6SiVzkE:ox0 Y'|5opK@*[TS,ߴղ "0U|<`~X4社?.ш}#v k+ mwbv2pS^{CkɌm,ԞY&_[z_ߏ}מ9NImҽcP ˌP&hqrqe8i X0Nm"G`G0K_C;Π-wZ?xlhBXJ-{{}'F5_ &_Z N=Yݹ!k`oK!{5Ȉh䈓CXU^XClt4B#>',&t1B#b;DI0x{q0gl 3 0FF*)-DAfO*zFj-ǟ"q!lU/:_h71Ii?EZ0fslit'_ܛ?be=[e?mL] h q0gl G<=Y!vZ* dQ/Fr!Jۣ∛M[qt(`(O2`FlA+[(F=S836B#&~.;MIL#(O2bc-W#9d%;?GMssv ǟo̧³"EW}^O2`ΘL7XCD@F!WZ"^mb7'ķ?ŵLE=#NzFrABjGy 'L8~̺_NV=S͢gnt?|,?qXk`SdDJdx o|̫L_#;B{I*B/f2bZ:|5o\l!W̖uĴtBD2 6I\3F̔  &g; z sN(6uƙ_@`zt;k"Să̚4uƙ2bZ:`awVnlYXлy>^`W"wXLjbǬ9ž((g_,a ȭێdC j<+~K8.kx~ߒeX뇯#dKqZeqN;G*fΉiN(R ٫9EN(bc,ˌv,-P0uƙ0g ,m9a}58܄# (l!6uFR>% @:L1#ƫq[eqN(׎Zcu%2)# Đ7"FLK'@)meF[tBx__`=)R J(Ot1F[}CA uGwɉ'UW//Ŏ:I3X֟c+V8IHYa2oK>qT ^&n3?P3a/ v2hJPdJB". 
/!gpk(_R望Ro߲ʷd%_Kn5N|֒Qb3,Ad8,z{=fu>R3* S1pӿ;" Q|Vdd"f搙l8 }YC3cGK2"3},?># 7]Uق|N)5Z?[r sG*Zfj/"39.#uc,}amcR=6,Yo&U@ [3fz5]+Q_O]q廟pF(&ܙN4aQdU,:ɯxγ?2K4 ~m!lBM~6JC D>oCq,w 0"az|+Cjr܈ H^"Xw\SR| %vC-zX"Ox1o9kNǒa+?E^l!-ӮHzZVJ MjRf]gDf$9Fc距&1/blY=, T;Bd :A-_ [c¦g\Y8rXVEKV^MFm,aE Z*$;q'TߗR+@6Mq͝ jD(F\PN tr[uRYJ('{]$ %fE ~~mT!zVߒeϬ|goʗIqYod,Ew;xms?,K DOjP'GkD_d#6 OV̆~r(!?-e ᙣu g"x^t%/1ȳ Tq^Ǧ.|srH^3MG>>qȿsjȊ)zqf} I,$}ѐqݬ ᔿxM 3[?@.-րkG{GcAb>Dn=\Ƿ?s3l_P?G ($Ĭ?яz )?4kO}c툳'yq{O^=;xFH^J# ]٤Q,3B}!.ˤf||iUpk!8#9PܢP(U7AZp`,m:[㝤-B(_B#m?TdП/B}+|#1~Adݼ+TBm|o߲?a?h޿?[:Z2hLb>VF=^Wu|X _5Osi}/D| ׅK|Jrgؽuw%p_ 4?{$>C\`6 Ծ}i5_{sp1'O~ƚ4o'ǎ?!Sj|㟝jo@^7}$R b"Wm,Wܗ<#4i'i.{Md lp9H/B afI@|5uNVRz+apMV,$ fx&7.$d tOt6kL>[9Fѕ)t)ȢeJfNeeOUf["KFnKf 4BMʏ*c+>:^֨-g2[ULGvF^Q.,0Oy`͗ 2T5/Ton Q^O_by.uF( P+#c5K-иơ<7XhliZqs-K-[g\("GYH}p[l<ʷY yώ[ z_ԒXQg%ୱ5nA-UdH#ymiaVz ֊pMy9[+wÝ`pX\яcYm[[Ԓ* uGIeVz!sEU8&`</ĠpH8uؿԈ_+)r#6k{Ӆ)rZ?Jk Gv߲͏} l#`dj1q[!T.Kq.,!>CD(ڟ?oɿŎǐҟ I[ɷdo H\q| BldNy]=O8Q ۝y篐*$8fѸk M/t@r?&ɁTsb_o>nX=v[)}<6fwrX?|W"HYxޚyTfl5;dšeM=\ZixnG=BQ~]sxվLL hlJ4To%wǍ{eڠ =<|"Qz*2nNd_i[ۗ⅀:=GtXNH^Z]c Q0YrHK=eg P[VvV"rYlK I#svji4j"J?.TPoa:hA~TOPX1o\/bf-od5Xi؈yCB=5wQq2rH\wC3[o:^zj\{_y5~I-^h6W2dzXDJzL+AB%I#:kf\moڠP<Ft=I5镫>K5#mqfeQ>v %-Dz%᎕^i`4W=eaD3B]n@\BD[nX˚X^vpgoIr, mfhe1sG(Iz5޳\S~(" ?cW~F(u0ϣ|vb7en؎ېlkyu3Zs>ncBm)?jԤ643u(<ё^k2\ڠ-"Gt[iլ=Im}䓧/wf ՚D;Ȋpg;F$-y#O㲂Z7щR>cBZK (:zWߎdqYQfxF?ȅ%fgIˆYTWol~mHGݟPCD{!N;EÄ;""3 0j y:솹 /x/vshdYQ!t-4Q%,ԟ[YV>kK\H-oq88SJ?L-Ă[`P/?WK&PsKj O| \cy.?#vFv+r<(,Cv$Z? Ϋ/+bqF1mZ6#͟QogE?;VΤ-3BP }QK@ٶ=U;~|3ejBdxFթ'-kbڿ(ѕKUzZJyߒ6ku~f!^?e9i1]3|a俾OG<@(4}/)hY!YBhZ*H(}3#nA~@A_`!A]0l px0tnF]vȣfR(w9eNkaVl3% c8WT{e(UO,7iLfWҞmZX*Wq@fƝh_GVnliɌ`'[g09טriGGK7ZF>Q&rA #iAP̽_9T{3_<^HASYz/wKϳyG3o`xG qW=t b-=8koQ&rAFX(|>{f}6>V8Ǣ]5%?Do_ĢL8w [|w+vX<7lGk\zs"8׾QzpxaRkiO%)>D$B-i.M 3O%UFr͚XC~1ë́ ӱdxθ̒fi3&p=&:^LP{HXCd 3Ƙۙ8־YYڌ6LyNi~_. uuo8Ae  o\`g,Ӧ> v 'A pLJ_|,cKmof} ,g>-?P.|34"_'_õ<9`Mِc:)3a$s4?A8 WIX-t&̘(k,BgL ߒEXXĘL0fLL[_|<͘dn=ooG;Wiՠ%щ4,m!{JBRU&?={uƕ0I 1R ȯ/=4ip \oGv'UsJDR1/Das| m=hCP 0 Iżwbg^EkUY{}Tַ{0 b^N׿(_ҽ ,D ݻnjV3!EEWz"ׯaK?E3B?:P]Y-'wd0iD&^Fڂ\I'$U֏VK˧(JL_>P-I?[cЫOM,+=g$CmWBQ`H*ڥE0Cr kHg]kE>Y:˧2,N8_|TV:#y=C 1DR1/DH `9WkoyAt\@ 3gK> sB־SRAH*慨X?%]tQja?hߵ_1#XkweVrJ}@]u $[q?ju%= R'4V+?E>GWϽIЏ6?V]yMٶC0&!B(r~1ǺU{;Դ4}ϥ|gvDeL0FY7; COZrEY/ lGb#.UR, uwZ%QVąޣJ wo;B͝@fg/V'@Qu{jH(*!H* ~`?eE\JQ%U~4,-$/٣JG 3DYz<JqݣJAmAO_10VU~=|?R%Vz5lel >\1p 'J|![|93.Le. _ &4LkW,VNX@S~Y@?,gtC`8stL &ZBvlֱtBK`2UW < ۫SHv~_Ҹ _u1x)ve) _,qsnV8(m+Ga6!M y%(Q:N}{PYȏR[\ebG[W&7^~o-#Vv(>;'w~纬<\.b8x#nm?\ŸW\#y qSAпY4ێj4Vr4jw; uOǿwwj_ D&7._j<įƳ9|҇T M8FVlQoX|3QZ.|rOm#OmG(_WA<#qV $[e+gUQ5NT(sXTsԌ8]Yon\˟Y2Sg}V٫;IKMɘ@wZs={>nߩk{r>׺Ѓ5\K~Yo)9+kGY2CIlGHmC]XW3"pk|CKS6!O7[X5=!|O|׾O/`+ QK3+I 9 DsҏF 040qQ;㊸0Ȭ҂ v8VF\ znӦ )԰vDڨ(' %f[X"Jxذb*[;ZcΜ=ӿ]?cIhbG$q7p? YcDuP%x~/ OTpݏHq4# 7om%0ӵƬˊ'|oɇ1__aJ'WY3Mƞ0ޒ#@Ե3B):BCυn|t5[Hr*_ ?/Z2,ao8vIǿ'&?:WZ,@ŏscD褭~Z:OWRg=yA0TH/뱶wim$K ; sߒQ›T|׍3&LoNN'8BlWQѯP=nyl$zW"s%T+& ٔ pD`ѵEdEuv4֙FW%-*o?4:;1]H}B@ė/F<$}ԧi+5wbE_< y%ghIb*v:ydED{ZhD8Iˏ: +ĜEo75Nv{Q5Ynhn?OCKv ݟOx574ji%  ʝl#2^"fJG< e;3pyDC8A )yFh彖Β|֏mpӔk|$u_#ד_뼤:vd2\EۥYѠɌ>ce>bDŽ E)Gxs|Wv5:l j ǠcLFN*Ak1F>$Kf6&_kY>$ CU PP_~Kë,6BEEKZom@ Ien ؎FykgjFw9G1$FqTل#קyˏh47Y TԱZH,i3BݶJJ-G*mA?iL>柯v#gDBEz? 
U'~y0Ǧi|J*̫@΂8~k;M !-*jMwwf6kM4f{ǿwiAfM&I941C>J~rA*q:C%.X ;*l$\YG֯90tg;UPʷ[vB ySB,O^5u88 g^[~WJ6s}wjPcn̯N~]@}^LNY/k/qD<Ғ1d?]h \$m$.12E| H|7SP/gpr!kҟgX8[y!?zuZkņ ;J #{e( *`N uGbo_ա#)Qn~h']dXN_@mWOs[B#.-o9e3~lNGo}UpUϬjZv}C~ YOFcJa|FB iyd8XlY~;l Ovg_t 3Z6…;m *g>RNbb*EٕFz5#Oz[-SKV c܌U$.1kKϯvWxʂy%[Vczm6tpZt6*H0vU&pesϨ?>$Qok;j?K|R0\S9x/_ W/J_2Q_},GhaeSYR$Ib>JkkvK[Qm+x5+O$Tpy?%jNMv}3J#0LOlYςrqr޸0D&tv;&%|~;fc7ޭOj].QD<ַ.n.!+KXl`Dj2y'sݢ~!K[y?V?hQyO?hf]d+_)KsNdUf f[~9+KW\aB>sBI&JPqltȵ?p0B(ƬWwOu 5uwD` G%3;v/:=%ZX LWfOd%{~(~FmW6O@IDATfhr˽GYwe",~h@^w[M6ΦHp2g2ާ5 gꇪ,CG97LF>؂rr:VL>"Rb%yyQlR^6MŎw./k-`/_+V@%Y-[(_+gw+ȯmtq?b{uo6-La-w 2;PT_S8ܾϸ\W㱆Wey4}n"TĴ28.JJ0 Fq\̝t?j[Su[_G]Vm \oɇhز_d[k7RP/۰Y6G 96gzs\Q?p=e=xsF,&Oe-f",XDAek!4jglE./DV,+K;v$S?_/tzG[n:4υ.z[ qSgp^@3d;~sx?T~V\,[g~lxs?-B $ ѧt5΋8(O~T*y)lmA҄,hK)5k9z+;xV#o)C Av}8"_u*͝sگ7X&1+HppT 3˷B)!N]Y~'XhNTt~{d8w;9t8SN)6xoDxz\4"ɦKʜ@AUv.Lg+WmǪ-$^K~\cD֠j/Ast@ bq:-t`Xk/5CPͯ|CrHS הT*;vNlNZheqKF}{υPڶSH.J 3 Fյc z4aH!cb`|b%scmOJXA7 Æf?ob|UB Zj_ʬkg/?Obm?lĉx!!"AKEa"$ZyUK(L,oCEM ![ -_1A k: [; Z ;a|o[2Ԍ >uWӏ>u?W`K,4G#G oY~jO\o<<ʭ?a%jໆ-?Kn#^htYUV곃3~`._ԤoovoʲK~kENCJZcSS27Ɗ5K1s+hdDW#E@S?jf+ps e؄Oy(}8k wh4M"bAnb&C6=^B]c!j Ҧ~kx%d6PN8y wVd||shLS6~:)7A~# g|G`A~P1r 0آk?SEڿ0q4/SGk4Al26T׊Pu&`AOj< NT`!Yiǫe#DapeyhST)\-GRV dc; D CY=!hMo =sdq?ڬ{/S?@l7MZ /OBTOq4FG!a\X3<Ex hKُ.aH!vwA-[-[0nр-"v[>?OkPiL"K,{Գ n_sC\m߲WrC 5/nk?7!'[ nǙ32h G|HNX2M%E?;^b Gv(`(O2`Af`0U Sj?ZĭF8.TR06&?EZs?EB-8_tHozec443@+` N$ˇ#7B#oWa'zn˒/g #/`Ax {Pr%C*T`=Ȉ^䈓CDG}7.-\7%QQ d؞Wb'Q"{q0gl GL`]vEXFQ dZFr!JSw'_?,.OgE|8x d1`o~[(FQ=S836B#d D/x?oNoNjkřzF䈃(o"o?X5N )lq2ubz=uÛEuX~ei5Hs'ϳҧd@ :5*WAbG( v 2Ur/^52,3eĴt@j<:NxB0F-ielg,+)# A~MKXw1-PWl,3v$ךE5i,3eĴt@VfwKLw}??3H[C1`Es? ŎYs=QPϾX6b3v# ڽÑ['ɆtҟxW*;+˗p\׫h3%$_eGȖ :L1-v ǏT,c͜L1-PO"^Ws1-P&X!H Xfy[:"Ni3`,3af|'-@ijمY sG' jL9q7' 6 GLK'@QBl,댤|JR 2uƙ2b|G( W%:L1-PKd+8SFLK'@s!ˡo&EN(S :#)Ujxj6zSAP3BcKLk/N"Ƴ_xn[FhH5Q(=LK_00=plYEu9{o6)«a03 Lck[>)$c#6hT+Ż4 ?T8ǘvyO6_9c._oGݹG53V;MONk#5[EQfR۟OEZXv=exJeuW쐩~[xQm4R Xp*@<E<#Iͦ7AU" ?V"` o-bFs;8 Y7*S<F$H_D(I76% PR/%c KD ?N4ku21L* c8N`;=?E9fuHM[2!.mbv{I^]Mo_#ekUq|0gv,B7TGӆ4bsa1Ucj<;lj?9:_6H7UL卑f Xa3!?/[hZ&f}=YxP,,x2G$TClhJX=}ȢD: T=\=g^l/ھAvل7a! QHzء ?kx49Y,aR#; ' 'In)UA%ɠSߴM Kߤ/ 6htz.\ly*h<lz.L#bj8lFKYt $u#J '!Mbkp"FxBke%^D(gJzN|/|pwݲf2P_>qesף(\KBNpA=CCYxXPCHکG;W/!unZWiw6矖bA1FmѺy*z uu \_iyģf#SC0w_]QD^ 'μH<.]sh.s8:ˏZ^i8qm*vM}c: áۃ@I?ZLď/d"FRldTf3$bH(Sa,G/>F^!tP,ޙ6!2;m}R8|3Y/ܬǟJ?ɯ_GK2P)3s͜wf3$b*>9QJ5&XĜr&19# [~Ȝ0!#03ȼ3'~;V.4FZrFIt&X.S%2`3-0y͐n >eqa?B£~l?;iefH $02oi 6Λa#-Q9# hsVzaIy ":Y_Q_RU"DP9~[FX~0%ڟ:nΪ|4m<Y8BU ԻXSH w9SVN44y.Nm~^i; 8kð%-`r枆G$d5$qڨkENr.;yp4K-qJ]uŽb<=vR,-S/bLjbO倍G|[ JY!͒#Q{ݚKWtG_϶F(%N\2ϔi]QQxwF}X?[Zg>R)|֊&.Ť{=/Ay҅Z>W?j8:n$߭Vn0 WYwԟ<ۻ` ܮ)e(#?b0O=C3R*[=c cLOxc?^cz"彷 ?.918e[w9\[6jQ>NE-wgm]'4 q#G*[l]_{?'5_^om6CԖWeج&;qx#|q9wK0фAbowƁ$¨+$0Rѐ@|X y mݾ:,| & ;E$Ikߵp`J & RΦ@NL@ʛW&3:k*ʧ(G^- 8~ * ?Fd F{>5`g>iQk6bW F+ېAs/"%ǒy"*d%>G5&u+`#_g)}Q>Y`ZcuS;έ5UǮ۟?XGQƴ5v GJט^j(K0Vc#j)| &p/= <iU;zȏ%R o= zI,/Wc[ߙ /?c^XrO0%jh=~5|l%Esz՟͎#hu]w4uO@N;_%h"c?K%f+d6B|Zf=w%>-r0"(rI(o9HEg)В[X ΍Шm廨HEY;0:$UdH ;{)މ˵ yy7Og'!nmCyUH'ZyВ[U5T1ˮQA=Pvk63P0^hۗNPLlq\B[q6Wjln-W_`2Bm^?9)6B{ /U#GrlkzQf|z7OOV|#66d n|[P&cEU }#/[>cnX+ڰq1?їՃ!f+zz4TT!h=%+)6͓5^^kͯ\W F|~^ʧPw==#UUQN;WPG^<_+/A|7IU[7 $-\(?C8y\d2C]F(ˢ]Gmg PhmM 1( &PÓ41.@9A ʏ(_~.u'gF$s`a4=jԴQcl_=uqWFr [P3/[ay?E̩zmu׼!=ht鿮 =W6}_Ϛ\9 Qap`neP.7BGwD(Jz?"FK Q'ڼF?DV)(bKtH-go߂m&"rG8X =vW4JjiҢI7 RQs1=k$IZ3=bu9avJ6'1]j@\dVK{&`-mWWIe`㑿V#wf:Q i_& K4b# s?sj[tBmkʱl+m5 ˑm6 8mkctr,D(Iz4%F-#u^Z#4G(_p"{''zo#_S&A\Qf5<`5&I%=/qb5TnxAXkBsK)Q>3$3[3 sKP<&j@9W#K}g̩lj^a4=^ֺ;Dw<0AV\ǒ돳v{Vhz7e8&|8e*ٸL\R;yD%'.+usGcq~U^? 
Xϫׯo:g?X@Ӥ}嫟Fk޼<^J> h{}#Kb LY#Dh 3_^|Bog/1|'?`yGǖk[/ǿO?z~%=ZFǹ]K״?7 \VD;h>il?vr >)#T a8#TˏCW8.^xk: ʏ@ܲk 5y! z, )<V|dBG:.kں(z"4ƁF)7xG(D 2^#a -/fbf"[ˎ<)Kyk)_`_g7Tبe^|6jY_ؙ~ɗpS@OCK/j )f}2xoIݒ~1\O0J'Y.mԲ|'_5 P̣CH;g3_;b5̧1w]z8pRc?0~2J~~pp?|,ߴ?/{Mc?vwtVX=zh4H n(@lwky׳gC_o- ]'CClƍla NK).NOPB7cPl_{K7rl:PY}d劗ǒ./]v)7/r5s 9\nzˢ{9Y)+ s7)" lZ; obB4%B,jGS՘dQp;7r^_B> vW?-}ǜX>]GY5Ď0;.[' (?*qg&w1PA͏C6,rՌی9VK?[$Ē'_̿?ER'20;7aDcOЧmZ$6Fp 6ÙoSWoJ\ b\8 (g1|SmE]3r|g[/w(i֙x!(;_|,2kǀ#o߾ӷViA ׸:J4)xdh-oJoC9:z)y}36 ls'oDY -K6yV8_8Y` ƭL K~Rc7LOb 0+Euh۷t[\\սYM߸cIX~]5 ͉]eʒWJVyD$䯝z<yF)ڤMRodm"?%䫨w>ħDШuɏ67^HbN_Bk6CNX Z/Ap7aK l` p`iZdsod,9Tn4ƻyus3iZ>o\p Y972qZLQ1-?T) QV䂰L9.ȊvZMPQ+J'l \uˏG j'OG1;l$Q yߨ~eX_jnB}bӹ3kic]QDc4OwvIj ;Gǽp^G/ަk|5~u@JfE@ֻp2߂Lj6YV0}!۵5o5&`G R T2VW9eٗpmTT} EVp 6ATGU1q`T ~<Z>L i2,XuT!oD5{KgXުף`z^*`A nGE_ފ8}&&+m_[ї7|i^A^G~F7u1oZiڟD/ "|'Xk_rנz9"dzk"/ |<ވ.h/⎓ & =#v{2KRȭȸ8qI,OQ̲3^_G]_F2闭#\dB+>kU2 QB/,Bdo+z 'MC>UͿ?`(򩅼6$ɴ?:j-Zf}1{럧j^ESA@Bވ*Ko8Cy+_P+28uL_' wɧ]dl_{y (bU3 QqWvv6<.]c1^c(IVnh<@AYvQxs!f,!uRH q߷EȐ8Oq˵6xK@0~j Qs$ms;/kDŻt vWLIqh!K# XرF/a+`a?XiEU֥ Ri26tUԶ >v{ Xvjra?~NP_H"veTG㹙B&YKT`ZbRӋ4b2MPZj0$6B_{cDpi#Y%^GGM/ wU%/hRWbes^dGa($a'3Jl*'O8/mDX A﴿.a8Z  }GZXC['.MI _x8zz+ܱ*k#>G0:/%xw`ZUIvhKUb hi_Q ?(foDXCVE~O5ZLTpbϱ:OpćKQyALTA@)`Y6hḼZ9]vb- &pLFƦc02ObPѵN]V J)/:f-a_m@UaurHUq;al'u2KC؀kY-oD>ʭWz< T}->O?361NIsn,Rog_}H~lwZÉU,W*xuш(Q/hCLWanlo/`)cG1*g+XCqzF.  \T~+M.mWW]L\យ Jb ,bؾ?$[5~^ƿCR[VnkN^ȳ~<1؊bMYa2Qa3|̝umsp+0Ro+G{?,^Mmalm9VsP^/+߹Ð-טZ w?lؒښa娰9l˵Icm\.Kȿy*W&הg^ntNJxFDH 8Fedw[QTK$_sbeC 9?taT[%H6(_[/kCtN:6(#l4m -~a娰Nno=2+|εU&xx xoֿ54EQc?~"Jm2ٻ?J4{݃YD(sOKˋiWq7мE"=q#UJ'cX/_br ^]K?NNmuZبwnuOwxاnt-fY '?JR#gHol< P`1*hODES@ρY$L` NTML $W׉P>˖ȜOo5d98'.^K~ b0\Oaq<aҶяPpTPҬO JdR~COj<#hMe&QUbSOۂ\ Pk]Unr㓧|'Bh<՟eh5$?'BيI5Ok[p-EM\G~3EA)#sU#Yh\3][ZjyO?>NdcAohn.x]Bo}ǕߢzJ~YaˏZ=( Ŕ_5]G2vRKߗiiڶ!_ + _ ^K 3+ktrxMG@ߴ," pZqQN㊸\02/'VGif肗vZqqM$-s#3ozɧNO 뤁 ~OԺK=ɟu M"2jR 8?h̠??kshm sr=>ā'B! Js0i4@ H<8>SW?aӢ?2!ݤH8}&6jJJ̲ԟc__BCw__bH'wj!H66n-) *_\Jr [~<ύnZ =GHer*_] 疴*L,ہS?_+<mUZ̠/1" Q>m)i0͑ *$ K˱wmݓe $HFٚ *0>#Va۟9Q>BRr^FA?/0QtE+Jv'2wBOP,ZM蠾$X/1Eva{zO4֕ƐwD.-B(+uXo!%X> px}c,a[!:ӗVc抭uX?a_5@c 35+_XV>sϸJ]5 _x 5hӛy'hIf*|"XݙFoFiRNXx}.^?%"$?bqENWס@F\+:lDGn׉K};Whxsi^Y 90.mB w*_“;l@ QOv-^qh\]nWQgEA}Cp AQ:EȍZڿ7BW##\AͿmͷΏxq^~rȼeu]GWO[7vC{_t?޽-9(vh<:45tԂlL#z {)_#ex%Vi2!1qUTLbL{M#vFF `#J~Ζ| l|+:Hm*GN~-V5p0o&+ō?oKb@+5.3)w,.0GRk(Sgj<+<ͣqa[ uBWz'uǐ\T:2f6I&_o|/P/!P1h<; ?oh<R]p`}U_*޼WbU28ш'OYZ|9OWM#v BE-廗TلǧzF5p:U~_3kKl'ꯢ;B)UfG_{?94^ ?_³)%pZQ|bg|`ܥiz`MPH>z4K \ʵ j@,ePQ{zw_6CYWn0VSwo1O+G޾}J:ƨ@IDAT MG l3;D}qB(qz$%J :+86P.spgYzmOԉ pluqaIq#/zRH{H=h{RrX\?i_1v$;=/'I/ OnJI1H%:}T c?AK IR8qd*W(tKNn"˷m[S_KBwGYVp@I)poO Zz#Ua{ύnY *5?8%?Ov÷*.;q"5\Q"U)(@% GsЅ j&N֓iC|Ok ?{l$ӗϢRD}ybq3w*9UMK`|>gZ0T~FYx,ֵAw40WKiUf9r"޷_j)_Gɯ-(׉Z |w5=`v vu-mTCn1 ǟnju9? (k(7;GYYd['4[|u5a*>"?a*2 $n$Cd 3|KmY*`>I$ˈ/~m[+ަ/!jkK?A 4<'BQ#17~5le`)uRQ3I1`psӬ[Wm8Or49D.+[`,ReiQ~r7Dܨپ żN,N_dCn 9e^A:!n+2lBd37z)Aeknt1VlYs[z ?s!!y [HWLJOcuOkVPmR4HYzZDBZxZ}Di_sQ1uVӤ !,ȏb${sImZ4z;_8.aˏrf\*[|Tw"" +X,ݥ|o QK WltO۷„yM;n@G'n,jQL1)&|x 0+^=g?j| ȧU?( OԪY@]G!il[/lJٮhL[ށ=[D\Úq-_ON\W+a{#4jjE~~u/EDaM䭍P[&lE][pV$._DaD2~Ϫm%fQaYmhXxǩ2WJ$i3%[BzM@j-CU-µ"r]G+2F(E%sAwhH/57xypp:3kG~5QyjzeF3%4U<3Z:W,X*hD4LpύP궎Sn^@f ӱwC'qL: n?8σIqؾNj#isW CFz 𴿪mx_kiUhGAQo?'͟ã8V'Np XщsFx4Σ ۦN b1X2$`!_ry  %6؆ }$AJ~8B}۱FEl5j}ǯyϲ_j`>KWS3S~jQUV3tDDN 'W[Vp7jn!WP_#oG Z)C.2kdًlvW| 2hzǓ[2S$&!ecSZepQ{!NԎ_bam[<6F‹NwQÿ80@6CyEaܨō/!? 
.ܔ(5OuE9iϨlO?Pb8Q F'~1rn\6ȴPʹW Fhp"46BUmش^<+0IΟ&036l"^4Q~ɽ9#h `E Q-fC0wdGږ3t eX1RWp$ޱ>0c0&3|'Dž^k>AЋpEh/o ?h͞9&gSd\7{Tz#/%ӏWr_ _@~l?=|i˿J}L]p0KTc0C3g.-Zy⒗OZ*-fC 09Dм#3^q?pm3i&π(wD͌0Am@H;ZHf0&;$9~.'ܦ$VFq3a{Lv D>v{.O~O\^%?_mnMٲ<wLbF:4sfB<DY3ҡ3 +YZƋb} 4Zp\NI7(wDo*Cz&CN+)u5Z cwx<^%Ïr,uUK8Y| >E&/V ?a飢A%R8'Bi}I*B7ok5eqCh<ޡ2 yKL'daO,l,;YSOȀÅ 4)LxBV 8X 1H%s" 5,kㆦAmV3k.R=vG8gr;p(0 vn PQŎv+>QP.:V{asqQ_۟lLW[!__e| mz4S{$H$-amϸ,1=P7,?R}k\1)KL'd*@xY,1=l,یX&[xB5,kJ 6/l'5cZ]ڼr)dS}mk݄KL'd3eA . 8ܐkT3)KL; _}80Xה%2½W8)KL'd:Đo&XbzU{v]ߝ{T3cѸzk *괿f>Ru]5h6/Dh]UlןSȋZHHYVw%_F#j/pw ?P3lzsT%(?Hxc%l`B-F>|n4>|ؐuCHQ?2ţ,!k=G5N)tIW)}siQr@)bښ.Z9\dJD3뱖_(csȴ+2Vc sQ#]dy)AlFXiPԴ+&v?hjEjnn*d[;B[^fqFYwg sFj"zNu C9+lq yh >vyOՃ _oUv`R}G0R/IfC^I1' 촿:fwNNN}=YT R?{%?XXdx!64{%ʢD:)c3C%5z6TڽV{T,v'|ۑ C#Wˏ|(iMNuHȉ#CI[*GAPih2ai[4oKCƠcap܆q\z>9k 2 SG㡧e+seVq}X15\uܟ[&c.4AGIF 'BMl4^"Jke_D(gJR$%=_k? J:6avV\I{z= buS]_s%'uXSW1ۑjSc'wP^"BpAݐ<اm߽{ $ۜZ!UfDV)+<:hr}q?|SCvJ4jNpIncQzwFtɦt+v RG9,סK^12m*vM}c: cdPc.(cQŤ,-rTL bp!]B&2nuL 2-fHLgQ$*Y!_x n4oY[!c:H.`"F` g$y#fNu`MK*# =(lZS;n+?W0sk⺧h||(B w$˥z'DW;m}ydѣ_B [?čPI~5?FXRlXXWF0e74 JF#mONR 1\fD@H2:̄fH $02oIUû ͰvRu9]oa <r j 0:L^3$b*_|jBOYLwYOq~l?;iefH $02oi 6Λa#-Q9# hsVzaIy ":Y_Q_RU"DP9~[FX~0%ڟ:nΪ|4?e]#TeЌcRk ,)L'BAcD6BI @I/4wS\oqۖ|MQx9sO#pWz 8mB5עc' [AV<{N?Ж8us%.:SGkiaG1@ij{g?yy ~CQ1cD1mݶ?8,y1bgحD{0NQ(?z(6B`/qyL ꯯CK4x|UҚ>H{V7q(&#y .ϓ.hAV]ʼnq#n%rInwտrHͼvM.sGi]G muRqDcRg4Kk|8,-Ivɸ%WwP'.ReZQrq*R um {H>n,>IxM?joEXWAbg:Oz 6=޳y=j{|xGh:*&fE5ى Ǜq /FU|Gݎ˹#] 0& &~(6\$F]$%/|/7+ZehuaK0)'IZSrO0ɗr6erPt`RTpP޼6q]sWP>mDq&'c7Y4[L@|5O a1%PK( QWǦ _ \TX>Ǻ,w^A;VQ*ex'1ZN AcZ䴿?]. L@\=vӻYbvPBa#ׯ,;aqg]r)#2j`ꉒ?*'hb[V#l=Yt-/ :VZT_m_u* s LRQNjp'[8\@lWwt_qO 6WQzR(k{ -H\Q>qZCԣeoSk>h}(Lj *%t 髹zpjpfߒ}|56!#זb#R?T(!7ʶ5Ya+wKn7k#jI rϰQ׹h2VT7bc_ԣZQN6c>dy#H03pOi#6PG7k+&kgeFB>e JZPzǴc&_*ۗ'.s봑g.2:/11[=67FIQԸչ1F?OExG( KyhmdRck0e77GK09J~-?$%NF^`G,ey|uWʲ|W|Ys/rl?Z{cSi ) G d3a$u PN4>ʗGDK] +mc5Q(ɜ@&X`8@4Gy5o׿7BO]x\`kcQe0$G+6B8_l#ִ{jO-sGZO-"w3'Җ#Zg&kgeF_;imRUhF$4=o1V4NLǺ ~ ;a]Fhzaa-pj1XE&mν'AΘKtaiV7 FvL^7d9&?Lik=r\W~U~ lDh%QfFύih( u5/]i׏e\72*d-2/[jy?.B+ymQ78ldMs_ 2g$};8)_ <_eVEG.VIXcJ@DFQnݚrP |49NZY0;8sX*qk mkHõc?a0]oƿ0\xc*8 {=1w湼FHQ Wf)p8M~lx?Q;4evֻ(xIP8Dn i֧`KW|_Ʒ1G㷪EҪjiҨRހH xGim<:D$ivOc?)T'D,G1/ɯa#$˱$,֫P!jzyhCt!^P%轍|MqE;>&IxI 8R{I*`g ,юGX5ץv(zL-юGXץvr ^ uӥv>3EgkZz&ȊXaqnJ=-\`xczwȴ $cV%+0]j:o֙D;ae}n#F#?xq+ wKgi,C1r(򴿪ǖ-O;Xۏ޷w$lqѱx#b 3zK)Oj<7rʟy9@^h*^Krl Yug^ c`ڥ__C]^yn1FyX䷖QC~l(˿.  >m BuS_x;hQ^XgC$860,D%1[ZZʎ^_G:[g-SO?D(yXe'dd9Ðp?G5J9` > Xr!lۂ^Uw?Jɘ֑h<+:">G%J7+qLW C8]*OQaӿNMO\ti\ "S֟_+& b56|? 'k:|ozDc4k팈m_+d+n>o-9O:^2zDc `H8yr}9CmA+np I}vJ߽+uL W^TDΦјW&IJ\mX*kWq@XDH~Q&XNDh,_~RO oSrk%ycD DW!oWOǒ8RW!u GڣnlN0HmBDbnZsfH||)o-? U{Hs?Po=m\Yv*?< -l,'_tƛxyN4"%_Y ?0ŗ~$ _>5GON}/dA D{r3_7x kS7<JCvTB%b/KEt[ߟ76*NxcqVqY~zrFq)&p1Gz&R^FBD. 
?ljw( ;Y Pq[m΋y#[ 0e8vpG-%7mffJ+S?YB^5??7jY5$ʡ|6ۖ|%_&\v| ePHaɬA\%oNU#;//gȡ9zJO8b l$cK`5凄R|#}YgaSA0A8<8/4U]_`g|S?5?ְazo;ۯ6!ƫɛS΋闏:C1%G]!qz2+2ʏm#ԅD!%wiR~Ruh x0g{ Ý4Z_($o߽ N/N)\UP [n3u-/UL߱0Wś@wG4PD \6%;]~a|s݊_iXh nJޭPy[@ %?N E9 6%vx?6ja1l޷{N;ϡ| =\8zFbnØ OQL+O[o4H۫?]Ui}S*~JDK1 A:ߐ= /Y>eng(ז8 ?s$imdU.-κ4JJ ZUfjǿ(I!>z8Y%q#[8tvG4ɺ^TyǪK:<@Yc̤!6Fp #9Թ QcgU`5oJE7S `#0iWo8Q?(T4Yu^lKj ?ȱNفYq#[8EN}fڟu7'X dx<۷QROz( ca|c@ScIq26c'JCK<1tDL`c]ptΝeE.Fh>*sP$ʊ\ 2kMbҸ-?Z}X@WگW`b؀n޾U5Q_E@I2EoH$Gb] Yy!njo(S¤0Ǐ:պ "ąO4F٭/5yw/\#A׿Onsu\ɷz:d>qaO#+2+s}Q\Ih gt\Hp 3dJZw5iV`C?8qg^kөףɰ$2sZK[cg#sb?$г 2k8ѾRkikL`ZʟsNO kkWK_ЇA 83Or˂BO*&I@&bXǖAnC>7B%U$ENi_'%Kyr.hʎ\Ɠ&10-'k1;;gI2_cRjW#=BkiBc]QDc'q҅Z>m3Bkq+")eY\gn _ܻA@WD҉1VNք5&;hc=91܁{껑i n06TnL⨁U2K%.`~wis`ٗpmTT=Ǣ{+q>$TiqU H/=5h_3ɛ8)ׂ d`A0 y# M& 0dغJU۾F_Ga+2q(DYrq&&+[+_ (w5+M 8!E/9kPz9"ǯk"w?RC ei_<\q$g嗉 Lmx\qm-pWY>F{-Teg#|Íd/[#Goyb5 Q@h@\,PI&)ms XLBޢ]:Y `pʗP-4 },kͿG!K?X8 y|j! :#w2#LwGݽpQU#l˥x W$ІM|*k҅޾SAA>GFeYeuU2iK9>dl֢cP{}DivZ~̮P.W_%)~rg7]xJUeP8?YSaݗq߷EР1I-)Έ9/ T[ +n%FR|MQy|l!KV# XرGAv~av^ mDX !8N+.n/giiȢ킏AI{938{HErT*x>zޚ3F[J.s^ڔm hÐӗWQ˨$Z#VvOTa(`a?t^dGa(yBK*p'BOg|:,Ub, u6OZ%WBQ%V~o;|ljmux 3eʶ| G ^h T.Ub,+rXrQ%V~2!/m$o9J' ]&p^ ݏ&_E[d밤Q%@H:(perX7J!"?vG㋚Ae -[&\}[*8XprŋOPC(+LU{emR 16mbqy#r6 %.ZL^?Ma8a 1&k# egaaMlMxg5n|NP{:)STȵ7޺Be7"e_>ֿׂ9/}21F%E )r^U9io%Rog_}H~CW㙇}F#u%`m*|ڍ~Z%L eu̾(Flk(n4QKŸ`JӃ|K՟zM姰$Ģ}o14~F` -Oc@B+qL*gnD$A:d +A@)`)cFml@=D^v!F^ 16ߩ~CA]}-,tYgdH*;0sɞvߥ\#^xW>OO/ 'g#s1_ps9s1_s9sA+&-nb1oX?P.4n3: po|^nsTH$_~@䣀)?lAv2o9w9=k_ȩ˜ל\ka9ןكZb5sa?D_'~/G4DA „G:.{Is(oLOcPo?s>֧@9u2']$jοr̈D[*%mx,mqb{οBu_K/}?۷+qS3fMtqkCuRZxj{NJl2OȯH?(;tt__}sB2whל\\\q?1xBcnh^ %v9 833mp1_8ED$& 羏viGcF̮6ecο4r?=<1{: {?c럹Ϲ־06ҍW_sP#^4* h?5Ő" }m^dhÇ3 ~fiIBK ;(Ro|nzP!%Z\it&cG96cοsο ۔P5Fl? ֲs5ןsK< IDATϹg^?Sy瞓:sd(<`z<xc˔$%҆'LT|dfA0mksG3:(99溆:돹OtϹ0yO9sbBWD}%= ``bоg O-^pqI?O˒96t7;s1_1ml>a9tr?;g'x4'B1XID} F(7D`LK8{zF 4@k"dS>tQuL?uML"x:iqW;9wtcοs砿K999bqgZ=J99@0ߏyQ24<qK6'99_g5k |9sLS~S}hsz;z9s=bοsο$khwT <ןs9ן =cYcwSzc\=FFa}d |i\}|'Mh$A~FF<ŗf)B 8OOo?ut?GלwucsOs>X;$ܜt*9,VK FJ)0"s=ίO 3?o?0ǟ9N3G5Ɯ\\;Fs)Di$43`]s-Po>?/P ̔CcjsGPAZ<d8 'Vdʟ78.wGaZKh)99U/Ĝ\\/sIu^Z94y&V 0::<ɣs dy x΅P 1S7igL;s5s 2?N1so6t_iu2_~sO?DBlCq?R MWGϑio M 7 F~ eoBa`sʁ<_1e}":Ogpy&s5s=}돹/k?~8-?B2P0$Z$}lp*,=†~ QjX(rʔx/6N;O~9pz 㯺G)v Jikp/R os1s#9+&_.kG =Nr9Ba#s~Qwq_*/y q_–F ?MFXd';_/3_ /81/(s!^1_-h?r1 8ٜ]7ks|:a4P&R%`"?O EH,%޹ @310$kDy,[nƈ4j"*)XO?zv3/4 JgmغGA_tok/j~t\Xe`Mǽ_ s_⏂*BZ= WEtKv@Y_S\/-j# 8.???hi;6|r+VsLo,a tJIAX gW5L\?oeIF[Ce?8F&#WR';an /kOo`s0soHl;U2̜cam]zMQ_J28r^f5"E . ]/jcA?џ 7&" K˿,~R~W87 V@O6)jvjJ\0?\8U -S?BNtlU q/UmG;~XY8Mɿ }Vb]&(SD ПuBtfWw?MoU 26 GMN8ψum m1g,dB8&sDxOa(|F1͂@/e AU<@ M!f$""ݼЙu;ILwLoU%'[.9zn£`2P `߁ShDeYU"%P#PNG `Fÿb1Qih_Wv`H  24PmQcM`WxcM6 Oi~St@'-_֜\u|pN>|!ڑZ??/<1#|?6+M@  maqm'.rr\/8xuuC!6q}j~^[ot'7^_y 혱E}b/q?9s*\j6#. 
sIPkLLMzQB W:1S!C`!"""rljqq3 wvTߟh#G~'- o3Mao%H#9/ 7TϵL\Nlaum *c/Gn+XHE8_O7WbBӉ'TͶֶU6`ˣ+SHKs$\]qqf?cĪE#T,ߒaU$Wkx+gRS!d*k qX;(ELR٣|90!vB 68\gdùՂNs!6xP,f8l,OW"'#rP1bqr t9 Qj"ǸL x$"Dŏd%y|"17F#UNy\B ;iG(=@{.$%yz!8E-J\<9.N?)͏KTʼnfFƩheI [::j$ d!pRkfH5?!༐Q!(/Z dO!Q +gI%'P#wl|o.l~Ӱ&ZQ xdi XÈHb87q<^as}p߁<Z  mse?D9 Apu-2n9=<Cf'aAг'rq+o|4vd2JB&8@dQb7gpG/}ԏBv;:ŽaE'k[|<9GҰLLJ2saG3pWp$,|1߉'C J ?0EB(&NNxO$D[7܋ltb^bM" IR,G'֒vI=fnie4j\xK&[ȱdyy)y|NPl)DJ6eRC9EGyi9ZSYYOC͏T]CGUPPwPoi4 -F˧-UNz 3Kg++WZd-k-B2Z:6m, 7u:::y:uvyKҵ ݪ{B1cX28 >cc]g+ۣעץ?UB~c0\R~ !&CCC reB > s 7FM1htʨsP%Cc;O7j|Ѹ$DjI)44tQ3Yl1,}*gduG+̷ZZ$Y̵kqߒbciʲٲj j;dkk3llmRl<5V޳MfOϱ`ut9T8\rDŎ[ vӉv*pvztv\jˇ%e]W]בs]\߸9*ܮg׻pzl󋗷̫ƫ;{M=8>g} !}}?y?##x`  Ld zl,mff q  4b%-aaIa[gWwExFLh$DFE.5Uܮ#g<EJZ(!Z0 5rQbc$1u 2~mFGǍ45~FFĄ] C&MKR$5'k%KJ"m13\H5J֧ҒӶu zl8qn?u Fr'57@:!=%}Wg^,םX_  ̀ϳVfuDeN1GN:;2{S؜9}){4Kt%9L'M*uK&M^=K%.GzNaA`AEAϔ)LL8aڢi OOoa>cΌ33Bfejm9{s(sr>eko2h"~.ˊo._i!PeJΗ~^_|gן[eҍˈ$n,ZsΊWZYdջW+(۴F<~ek?^RwE?ll1xc&M>mo%bKmMeVւO%o;/UۍnCmgΓUUUw-F'tO}S͖̽>žzc>jZ\qVU'kOo=(/ȟ|nٳn;;.eog:e_]Zⷆowx}%==;?|<)ӳ)I˿i^_^_' `-F(iNxĠASCIIScreenshot9 pHYs%%IR$iTXtXML:com.adobe.xmp 260 708 Screenshot <iDOT(KWg?T@IDATx}yŕ~]%,&`DeyDƈfQ38bbi`f"* .33Q̨1(Ab.HΩUݷo^N|ts #0#0#L( ne3#0#HXsC`F`Fhxo!n>]B p4n{T8[4^<8ڢBi0Cd^L~g}{S#0#0@A[op@&?.\(ZlGoܸQ 4ȗPUБ11mNEݫF'jUb9*m}'m۶UʍaF`(pRIO?][.>Lj*,^#AUPH$N˱|`phtS/u3<b>]![of~ T5eۯ-[[tr-Λ7O,^X.1EW?^0^t:SbСCڵk<ҟ?;Xru 6RCEI 5r<:ә<@FXv2C>TWٳKm p!禛nrZh߿m\ ײ~}[[ ~ h^U0q :9Orٲea;vZ٣~U'źҎԟ]~=ăa 1VZ̕5z*c<Q~tj# &."+қ2ej1-o 3fkVDQj^ ={‹>B//UyMH~]F _P >-1lcCcP ޗ8怄72{Ԗw+/Wg4"]ߴܻwXhq 2QC J9eƆ:aF ]'_ϴ[W3^DM6-se"~z{..Gka* O($O[7C,Zom9?p@iI#7i7!l)kZoKwtvu]ܟg5QqCWZ8e'.W'0 v gѢEYxyꩧ 8AQC9 >&%_̚=iݺ e8ۥ |Tm7iCZXm+ߦq^ZQ)okFV.8~}}%ӆKfK/T %r uqKf69ꫡU{V>Fg˖-ytxtjg.W86*Sq)Rӿ?m׊c-nP"bS^tSvC&J֧H{`|f :[oIjHyQwܸqx׿Y >}NFT'*;-nPˆ>#F?{ bTh*o(}w_l;ob"c|޾}{%QS/[_ M!&:_X]pUG`38o6?wn:?,NMU2$IA֒ߙ3ݠOVfp. .^{U ly{I 'b2'Rڂ“cWT@Q u ow߭nQ`.8ڕ^V|RIFJv5j;G7tkzI3; 讜lvLȷo͌ƒ^Gl~c>93=B_/Ef).wygիW$|CL>]oEkq֎c E<^A%IolssqT_yO ƤSi.%2ÇW|l yS6:J_|cOp͂)2%D~卩B6/گ%mF[8Ff|3J}7܇%OdA*^dZ |}iպ'8R]'򸖠N4푔yjgN~N} 2~9۶mkZuWܱ`@G(A N+1kyS.(Tbpn?XW̯klh37S/mR+mdýH_MyWOk1=k:_6nܨ.HEwi] 4 e? o~*}褳x#싪ItI^C`M6Nyvq=W>Â)/zR0'{a g1f7k-ႅ].qmjY=|e91iWC"a7MX-[cIBi?A,#`^G@T4f QX7\UL_38C2p 9#j'-vm4o;%vÖ=|WohCe)uF;NH!ӧSDs |EbcJӮ}N۶m Nj8sbB'p =F%-!yN /H~)?4TAI.q@-ZvN_.tӃ<3X /L~/[ tsMKk*W7xi;Қu?>T/_dAf^Hcy`Pk󲪰uV#"e)cǎⓒC@ b.nO`9 ߩX@1%/ d;ߨHf_u6O%uUJAq!.U~1>ݬz!2]xHSL)WsW8qȐ!&OêG;\JC_#g=2Z :t.ՀpZ'3rHǝXc܅xR0YGƅK9⪜xox‰q9/Zv8懋$j-;Va(ȳF:wuWRYi+w #IEjT-Ӛv[n D_8gm?׌DK,3f;V[T)T xq #TyO!{<>H~^c7$ ~9FF{R_W_YզM ѣG-.vzb_1j_ v0;?*Svş{nQ4ur-2/yM64$t8/B ns@WFOkh!m7{8fA5_:wرcc-O~~ɼjc/L=Z?ʺYu%qV cݺu ~7ŝyGs:?OL|i^?~0lu'T &iTF9V obg~3 (~[Z+V op\g%jD7U@_d:ݺuhXTgʏ)7N ^d+N݄)E]}n^$t[h]֓83]RU_ OZ-F0)]j܃-7(o,py W~6-7X:9Ho)tU&Vq<:.7Hr1T~Xgk%_Cqh DNKuj|Inp8{ҤIE]XrҦg}(=|j0FRDj]>obսGovP \܊/㟱qVe˖Db8.?V `< vmJP;̧~:x?"˓pp OY5k 0պyќq@͐*MJ g Kֽ'<=Iր ?W:C!ݎ WSI+[ˏ,Y&܋V5iԂAO>JTZA ly䑠Sx_F*ł)#RXlyj7g"e%QAA͈Xu_-UqZAW>= :j ; VL(0~;.EV^eMĚR};)y.|' QDt@jKfR毾jIHH_ PI3 1kd\sMIlDWZ_XBu8'ՂW!&5oUϒL0AGz%8޷UV2Qn߾fk߿/C" b+jP"҂%[$O7O_KC⚇LEPiP}.{^[@_D}e4n>CI+%-Ih|=80/?00`Θ׬YB~L}a5,t|A bA.IfS7 Ėt9+o:/wW(Y"Q +i}09,Ig63N_÷%Q0%so"\ ͛g i<D.ҥ *=@IK_S_2;!W9ǶՋ/]\~1RtX( __)0`\vۻ"-ؼys)}:;6o"@=L1qkrS҉_oHgz@Ɩ7[ntsd +#N7h >Y0|"ʴԐ&fSgժUAEVm;d"ȂXCL DL|:smvݴWO)xEjPS5 2l{E5?繳Nzyoh|T-P mݺ5QFYbV? 
L9[&km=?yUӐc btRC&WAx_SM6bCw^w>裢A;.e oxJ״FyT~M(a͙''J>4Gm9CR%?kOlWHRRq Fٟի&;w/zڇk׮FļJ A;8EPÆ ?3ΔkypB %#F$Nrت]zg\0MF i]/CעjѢEL_!,G9s$2cU~Y&k"XqR )T66k*vݦM[CXXxeۣg=poP '|<xWǛ6VC1-+sÃ[eM89sVT жmJ꫞={>qP~!$ G'  .B@ETɭ`hq_+*a/b˖-ȌU_~Y๸@nfcVcɑ_%[;vlWvJ_I_&_"AloFuC0db]@2dK]fi9<L-]-m{Q7ѽ{wk]\mۈ_t1n>a >}xJ$DZx2#&sSv Gq0F7sT8愈ק0uwċ/>CѭqA_bC[+x?8o_O#ÆLx`C  .0a4.K廘%Kz/&o.np@]\vebҥevÁ;#g|rZ>d70_i|B@1 cLQ( oY &w3ψ#;wC.9krScZd-=+Vg_ߡC2M2M/kgzÃ+ܥQ~9zmB~O@~2Su\pnd|KWס,_u^ꑫj2"Ua#W^S5:d뚓j^,ïi~,1geE췵_A's6Džo}5Gz '~Ə](ppCZ U  (t3c8I4!v}-ZTW'|${|{ae7\ֵ4 V 4repY &ůV 9?XU]h-b1:S)V=>vc~4+~q>TN0Om[nvuop)Zg5ꫯ.Wa52tbM'Iكjސe˖Ig;0ƸĪҬYӎOOԌjղoS:l2ĺ:I3;w mꄅ./N9 D*i?o6a,^`,Y?&"N7TZe֗zޡIfK}UЖxӗgƞ0ACi&1p TڑhV@\Y#?WEGv>Rļ&!a=ĹClovM^)@O3EH|ᇹi'ի"z,߮X$Q'\7saM,ڊ7S o77~P7<^았yrmZo !sT~óp¢ohW]4q$_T͍_  \#jpqIpJr?^ ꫦp{t?eNj*C:Wt}L{_swذR7Yꌓ3®T_ ivE=溄syAI_}vA?aU4dƒ4D+Zypǣ_`?pmIWFkTV k͒_F8F? [Mr#R__㧍+i%?-}ƀ!.b bMDͩVxZm󫮺ʁ/H!"s/z$HxvX]u~gySO;49~Ya.<,f~9i8l:Z_$L X9z#u):KVCIM  &=G-}38CZJt=~p=F >C^{5v8ꨣݻj  Fqc~u?>G~WuJw Av~0Y6GٳG^Zb6UI@b tK9aOyaBݻwowqU+.`gz7nzr].Q Ϝ9S;Fu)ݹKg1c :/Wg.G[? _VRp7 ;QK5cI\3fLjׯ8bԩ+\[ũ|W_}5*"V[߽{b.Zm:hiNFb[ͤ[3T媢,x7lq ߬H|߄IoNHoBy,`|`F`F`A\1#0##8>0#0#P sF`F``AeF`FsX9|y#0#0 LJ2#0#9,`Dm6BJN0yC?ڵ[>UBsi6I;tPvZyFAz519޴_-[Կ/k%<O?tn:ɀlN-?h ,~&1ˆ  [@r>;aU mmɓ'VcnɁʺpW"ܔXtӡcNJ_Ϟ0jT/< 9c8Pi1|~ ,[";&.xbI=)J\D\$Lد/=Y_Į?ەMњun . g! {g&Od3ԓCk*Кְaą^hgb>Z/4<냏IwNqSL . wbh/jˬ19m`o35\c^ h]vxGCTq!e}wԩbnUlQCMnK6ӦMcm*l?e_v򋏳F!Ξ=[N*X]7ڒ55}_WClݽC1YRQgu`;{p*KnsqV,5]y智H;Oׂۓ$=yt8E&`/~_0L(T(5꣢4d%vo$L7Ku5}% 2ŰtRjdY}l2=Uo~#,XV^R֭Z9c,ZH1NSO=@oc?f)?>%_qJG6G-;fA/MxxZG~F;mEb+ơ.on7^t|v{a~.^K/VMÛRK 4t7LCN~d< h2:4fvR3Lh U",f6' .k% Z1bOԋuSW[Pɀ Ds@i))$S77ns,0HP0S@(ep(_]_~>3a*Dio\uinFoLE^ R&NZ}>?"# ]hOb Gd 54f3xη-1p:w̜9S5(+rꆆ綾^qIn߾}PsO8mx5FݥKicV*gAKIy=a^z )@>)_*JT/s U2ۯ¦]+Cl97 f T7|pݤ hD8 >$b! qô"UoTa?N)n 旝|͛r0[y1oV`S_dlO: G+㗑y*Fh~|/k91'#q=~/ƶ Ն.`Fkvc]$e4|eJXf1 R >Xm6OAAl o50yǮ]9m_ #;T¾&ZF`Dz8zc'>';^m 5/|Ե_c\9B8㷥ݢpB!wr~9{AN>%ȭ{3AUs_sCL?ş=4d60 A=t1{:˗/M+j5Mp4yf: &/;5O)9ĆGsϕ] =%W^qm~tJ_מdQ7;h[zk|͐<߲!t95kŅco$N\_Ku1} /ٹwСWvw*{[k+ZTDn_1Zʱf=B:6}rsb"@' <oLLk1~W %Y_~}:g- 'yUG~]r"f~79% lg{Ixu>wϙZ tN۶mp9]n0Eʵ<hb]u7ycRJ*8TlY%lgm?^r7J{y ho(7}^Vlҋ/b]#Ȃx=db'|- ATuֽ5~#*uVMx#mӣX)!ΡKw(P_W~zMZV%n}A/}eCI;v(,>)7>_!-o8ImN4/mܸQ6ZQ=qyw{<j\4I?eu׆O;@köw!4󠔟^#ݻpPW{F |oh?bSLڔI ۯ!=z76k0oU IDATDb&:-?ľCrٽ{w݉aoL"&oN{m=y2c7T bl7o rСC踞R;w3 |cGt9iA)җHU|CFIh.&m|/ $eP{u9$7G[b[cPu1>ݬj[H{PXvm`F{bw}rvv׾i2ہaMbUW6_~qE[Nۜ-ŝ s9?=ۿ7lYt2ksa]>Z\k[o?OT>"z*!?j_aC\c7!`(WlOgd`eҥ߻wӵkW#^-Z8h2n|"Z g% yq^B햂 Tݷo/t-˖-Z/>9._ 4uQ㟋E]?&4/Pxt1? q[ؘZ ӮT}wpS4I2𡂩y'\  gҤIEy\*8!?|C/TkhK>{Oɚ\?Qg}kpƺ͛7  2K6 ?GE(ϳˢ )xCXgϓ~)6={%+;h"El~_$yS+,G 0~Q\ 8@ߴxuS&k*s| ~ k;&c}x;˄v:Aw$h=LSJSNW/ Zp.뀪Ңlժeڥos]o~"y_FګNW~3"*֝McYcsd8|"ʴZ~񩒷̇zȪoVzG#a/$)m-JJ!>b7XD ^tsK=rz4瀚=anpߵ??|`Ge&DH(0)Su{?c;CgH.ڵ_?֭[+xgi>M.0ԩS)~hU;_^r&8gݾ@L=dڱ\訁B1C.8q j'x205G8 /+j= W^G}O.4ݴiS93G?o6إ~>[R6Y~mS={ta#| >î/oC`.oqK/RZZv,PmbR70"6ft24Num~1Cܔq{q_ m W\ѥKAXpǓ6ȑC.m۶RF@}ؔ,׶%ɏ~{{ʕsv Yۯ/[lb@^ *~o*Y'm~GFgRO-*A?l ^@IDAT} E} ȭ|\0fPEqfDAqԍX1f7Fݍ{ubD E0Ct^fʬW_]y~m޼B8)[yGrcoIfkT󟁗o݃ĤI<\ _~2ٞe|Sή]rE%D?t֯:T’pVA%?F _)^tkWzҢ4hgϞFt= JL}_Q?zꩮ69f~=|RkwUm?|M6)=l vD NP`!.?e|A+:¹j OY?W9ڵso sХK'!)Ν;aJ>ВQ<_~[’pVAΰw ȯ_y۶mm۶%+-P87#f&~~I>i䣟~z09^{ϵF?CLGSN˖2#;@cS)Fb8i-qtz G6,]4R-X=R 1bD8pv?W^e*xX)E>;@c>JUPb+}/FW"Ç 뮻"eYk祎HP _Ua!V>Ev;lC<{CL釖VZ9(gϖ O?|{3.:5RZ8~C C~=rD=/~u[ã}_&:¹+DD@1vxc&|:b_< dC w 2A6Ruܲera-(sS2X0a;ތ&Ƹ{&M͛;v L2ET) FN#ċy9c8^æ.\WH?iڴg?4o;ap]z|@{ .pAP*Eg- jR%k!Vm˴~;JsmO!6&CafcLsΕΩjc+4CŊ{*mntz|0>49R|7>ܣƼ1!AL@HE㆖4˄eY&2wqnZmz>Wi-ZKߤ=qļH"=Ī-P0{>J~eY[05yFa+zXo0>33|Y ߼ l_CLg! 
ظqPR b: lPCK9DʀeP|1cƸA/օC,Nj2azPbI_}Zv2\?-mXb?C]c27 [֊5C&oY&҆L !4 ZCCK2qΒ+^0d7G᣺ȯ0 ފ݄QExةSg|}u/E׮]_Oië۝v֭[%'_߯X~B Kʖ[huIAK~8 }JԷh.vp6m$*+Õ~H#Eݐ.] [ԃ~9*׊!}`X6DLx q>Ɋ Bԣ=L[lqzg_ bjHOon/SYbE53gN] ql.gj|Ey˭ Erw3a"˯-3s-4|~6۔+:Noik/]]7t|?vWNo_ra ~BOe!2p?2K/AEr7dJS++WZ6PXﴃw{_WWn韲ůtqCQPs\IrʯkY~/_ y }#uM4r\ Ҳ3:O?,e8Lϗ= FZopUC,۳nsGu~T1|$TP.)Sl4&7|s6^Wƪ7_bz n>.1"X?E~ί%wq%qSTM/:(po̙3>g C~%[fquCZ_d]잏pXr <.J~dir˅Smƕֽ+{Rɡ!XʯƗAJ?q/K6mvNP g(7;nqޅŀd;1Mz HVؕzJRu*;vlU\||J9# &;t9녽U/iXkڜD @=S/ʯsSO=_a7x?HmPL52lڧghcȯߑӄ2kҡcP ܴ]w]T^ʯ7nƍ˙^ux);tY?K1i8&3_[7b!^+Ͱ06ZP9zKfPBε`.+IIq𐎝+Ҿ7p2TV*{݃$ݻc9FהӢy gճtk: pg r ǖFiS񏅔qdާװ_ űnPe+ʱ+؆K<bHNoS6R<#Gf]Y*(0H~m:բ~޵kgPLr~r%?g/dnm_$ >|xCTR "R(193^= #Wf0K.˿V}WaU9_~'D8\^xXw]\“}/9_o(v@[8/R>> Zd?@subJJ~?_hp ={PVEݻ%AZg1 mNyXe37ѣo6TNx8k*Ro0WB!ť\Wل *,^?0 GבmժU00^t nS]W Ce5m$7<6'YNMzC~_>]ok)F'=_,_2_ɯ7WggބWw27ѝ|ɀŷly[ϒ ڤIO.V1'Lhw!F.k'g~ږ_pQ~̱Aa«)w޽ȤI<:uN9^{1Sfsq쎧n4lm4WV#Oz 񦴥U.Kq!ҽ , kt\,cgXCJVtm{rO4crRx5s;w: 2e*V"Q^ImӾfaҏ Œ_CLauXQ')*^P?ÊrZy/ɟW/J/hoکbo#C1*[I|\6ıXD'\3{v%U+ֲb:53&>XQip)\ r78.Ø9\CCյZc))r_-9`_ʖo~ʺ@5~_%_9+f;ؒa?&/ Mm=nky"S `^ϔ7|,n*0wسw8#G-u&=XѱcD@s!o/`D *af!uJ e~`EOa Vu$o.+S$>p?v[X $y.u=HѥKWuI~fHeJv`jN˨K) McC cŐ^0}H|?&ٿ=#_ C`Ȑ!޴R[.1cTXm~:tظq#? &*:_ *>!wvuOEs0y?ZC=~ꫯ=BC@۶-]J׷`~GKr6 &wd~rB~ۀ~m-L*/J/J!1_ϒ6R1|_B 84Caa*+! ҽ>N +!~2+ťvH_Z}TįWۿq~J[#@ g! =F|,6Xn::.Jun`Ŋ+jYIX88 o7}6UWW~я,U2$suP:x S GuiSN!04)LbB%}D߾}_|a!+5X-&K>q#=I|s!hҤII +ZH'x7G+'mc=Qjm?‰Lv ~$dS07 w {H.U'N=Tž'4'«C/`]Z/LPS /v]}" ]F^`~+S6t׋aX~ JO˗/Fu atdw)7tFdOOͷ[d9Ę prk|i}9Nű㩴(""8wlK,u8Lzy@Pt`?[)W2KM|juTk!WjۿSjh~b:igL׿8S?ïT1%'8'h}&'-4L6-YP_|Q.CTǁbȕ,6lppR5#(UQ8pԫX~cIz+dĐ@)I7{xq}Kzg68mb3O1CR[tDŽ :Jӧ X(fU\kllJ`](֭['ڴix/&M~M>KU߿z{ꩧD[bD@*gϴL.x2yR+eKx$}?l0Fߪ#sn{τ1Gvh~M!~y3T7VC_=peyu/8\1myPvЩSG{Æ.}QѾ}jrMէSNb7VO=h׮]MQmVJF~Y~}T)k3V %N?}XzubVϫ0XS\"TA=M-/b?!8ĸ5kT,Ma@9=cyx~=;N`Oin:ô *T9ԭs)\"zJ:w,vڥ wz`[筲v2M.=.hyKX?$g[Y7GF"?~|Fq+ d *>`?ꬳČ^dStô~C'~gΜɽLv`#h1c 7ٔE*]Nž}{E#QĬYN$/d3ұC8 1RͼU8|РAc.>@*mbxiaK,˖-wl Mӧ2e-?Hyr!],^X}dL4r~ܧO~+Ѷms* f0 ֥oXO>k?9?T8O[qW %_*)ަ+VPaŭ*f͚N|+3ҥKe>A/Bxq5k*ΝzKٞ={ڋĚ5OzqTCCCʎw"P3,w) C(IlF˒{w|@ʻ+:D?'Jγ>>Ìḿ-[tÔ 9R?=dthكm2r?{RW! 
triton-2.0.0/docs/index.rst

Welcome to Triton's documentation!
==================================

Triton is a language and compiler for parallel programming. It aims to provide a
Python-based programming environment for productively writing custom DNN compute
kernels capable of running at maximal throughput on modern GPU hardware.

Getting Started
---------------

- Follow the :doc:`installation instructions <getting-started/installation>` for your platform of choice.
- Take a look at the :doc:`tutorials <getting-started/tutorials/index>` to learn how to write your first Triton program.

.. toctree::
   :maxdepth: 1
   :caption: Getting Started
   :hidden:

   getting-started/installation
   getting-started/tutorials/index

Python API
-------------------

- :doc:`triton <python-api/triton>`
- :doc:`triton.language <python-api/triton.language>`
- :doc:`triton.testing <python-api/triton.testing>`

.. toctree::
   :maxdepth: 1
   :caption: Python API
   :hidden:

   python-api/triton
   python-api/triton.language
   python-api/triton.testing

Going Further
------------------

Check out the following documents to learn more about Triton and how it compares
against other DSLs for DNNs:

- Chapter 1: :doc:`Introduction <programming-guide/chapter-1/introduction>`
- Chapter 2: :doc:`Related Work <programming-guide/chapter-2/related-work>`

.. toctree::
   :maxdepth: 1
   :caption: Programming Guide
   :hidden:

   programming-guide/chapter-1/introduction
   programming-guide/chapter-2/related-work

triton-2.0.0/docs/programming-guide/
triton-2.0.0/docs/programming-guide/chapter-1/
triton-2.0.0/docs/programming-guide/chapter-1/cuda-parallel-matmul.png
[binary PNG image data omitted]
triton-2.0.0/docs/programming-guide/chapter-1/introduction.rst

==============
Introduction
==============

--------------
Motivations
--------------

Over the past decade, Deep Neural Networks (DNNs) have emerged as an important class of
Machine Learning (ML) models, capable of achieving state-of-the-art performance across many
domains ranging from natural language processing [SUTSKEVER2014]_ to computer vision
[REDMON2016]_ to computational neuroscience [LEE2017]_. The strength of these models lies in
their hierarchical structure, composed of a sequence of parametric (e.g., convolutional) and
non-parametric (e.g., rectified linearity) *layers*. This pattern, though notoriously
computationally expensive, also generates a large amount of highly parallelizable work
particularly well suited for multi- and many-core processors.

As a consequence, Graphics Processing Units (GPUs) have become a cheap and accessible resource
for exploring and/or deploying novel research ideas in the field. This trend has been
accelerated by the release of several frameworks for General-Purpose GPU (GPGPU) computing,
such as CUDA and OpenCL, which have made the development of high-performance programs easier.
Yet, GPUs remain incredibly challenging to optimize for locality and parallelism, especially
for computations that cannot be efficiently implemented using a combination of pre-existing
optimized primitives. To make matters worse, GPU architectures are also rapidly evolving and
specializing, as evidenced by the addition of tensor cores to NVIDIA (and, more recently, AMD)
micro-architectures.

This tension between the computational opportunities offered by DNNs and the practical
difficulty of GPU programming has created substantial academic and industrial interest in
Domain-Specific Languages (DSLs) and compilers. Regrettably, these systems -- whether they be
based on polyhedral machinery (*e.g.*, Tiramisu [BAGHDADI2021]_, Tensor Comprehensions
[VASILACHE2018]_) or scheduling languages (*e.g.*, Halide [JRK2013]_, TVM [CHEN2018]_) --
remain less flexible and (for the same algorithm) markedly slower than the best handwritten
compute kernels available in libraries like `cuBLAS `_, `cuDNN `_ or `TensorRT `_.
The main premise of this project is the following: programming paradigms based on blocked
algorithms [LAM1991]_ can facilitate the construction of high-performance compute kernels for
neural networks. We specifically revisit traditional "Single Program, Multiple Data"
(SPMD [AUGUIN1983]_) execution models for GPUs, and propose a variant in which programs --
rather than threads -- are blocked. For example, in the case of matrix multiplication, CUDA
and Triton differ as follows (a concrete Triton sketch of this blocked style is given further
below, after the *Challenges* section):

.. table::
   :widths: 50 50

   +--------------------------------------+--------------------------------------+
   | CUDA Programming Model               | Triton Programming Model             |
   |                                      |                                      |
   | (Scalar Program, Blocked Threads)    | (Blocked Program, Scalar Threads)    |
   +======================================+======================================+
   |                                      |                                      |
   | .. code-block:: C                    | .. code-block:: C                    |
   |                                      |    :force:                           |
   |                                      |                                      |
   |    #pragma parallel                  |    #pragma parallel                  |
   |    for(int m = 0; m < M; m++)        |    for(int m = 0; m < M; m += MB)    |
   |    #pragma parallel                  |    #pragma parallel                  |
   |    for(int n = 0; n < N; n++){       |    for(int n = 0; n < N; n += NB){   |
   |      float acc = 0;                  |      float acc[MB, NB] = 0;          |
   |      for(int k = 0; k < K; k++)      |      for(int k = 0; k < K; k += KB)  |
   |        acc += A[m, k] * B[k, n];     |        acc += A[m:m+MB, k:k+KB]      |
   |                                      |               @ B[k:k+KB, n:n+NB];   |
   |      C[m, n] = acc;                  |      C[m:m+MB, n:n+NB] = acc;        |
   |    }                                 |    }                                 |
   +--------------------------------------+--------------------------------------+
   | |pic1|                               | |pic2|                               |
   +--------------------------------------+--------------------------------------+

.. |pic1| image:: cuda-parallel-matmul.png
.. |pic2| image:: triton-parallel-matmul.png

A key benefit of this approach is that it leads to block-structured iteration spaces that
offer programmers more flexibility than existing DSLs when implementing sparse operations,
all while allowing compilers to aggressively optimize programs for data locality and
parallelism.

--------------
Challenges
--------------

The main challenge posed by our proposed paradigm is that of work scheduling, i.e., how the
work done by each program instance should be partitioned for efficient execution on modern
GPUs. To address this issue, the Triton compiler makes heavy use of *block-level data-flow
analysis*, a technique for scheduling iteration blocks statically based on the control- and
data-flow structure of the target program. The resulting system actually works surprisingly
well: our compiler manages to apply a broad range of interesting optimizations automatically
(e.g., automatic coalescing, thread swizzling, pre-fetching, automatic vectorization, tensor
core-aware instruction selection, shared memory allocation/synchronization, asynchronous copy
scheduling). Of course, doing all of this is not trivial; one of the purposes of this guide is
to give you a sense of how it works.
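To make the blocked programming model introduced above concrete, the listing below sketches
what such a kernel looks like in Triton's Python DSL: each program instance loads, multiplies
and stores *blocks* of the input and output matrices, and the launch grid simply enumerates
output tiles. This is a minimal, illustrative sketch rather than Triton's reference
implementation -- the block sizes, problem sizes and the way strides are passed here are
assumptions made for the example; a complete, auto-tuned version can be found in the
matrix-multiplication tutorial under ``python/tutorials``.

.. code-block:: python

   import torch
   import triton
   import triton.language as tl


   @triton.jit
   def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                     stride_am, stride_ak, stride_bk, stride_bn,
                     stride_cm, stride_cn,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
       # Each program instance owns one [BLOCK_M, BLOCK_N] tile of C,
       # identified by its 2D program id.
       pid_m = tl.program_id(0)
       pid_n = tl.program_id(1)
       rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)  # row indices of this tile
       rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)  # column indices of this tile
       rk = tl.arange(0, BLOCK_K)
       # Pointers to the first [BLOCK_M, BLOCK_K] tile of A and [BLOCK_K, BLOCK_N] tile of B.
       a_ptrs = a_ptr + rm[:, None] * stride_am + rk[None, :] * stride_ak
       b_ptrs = b_ptr + rk[:, None] * stride_bk + rn[None, :] * stride_bn
       acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
       for k in range(0, K, BLOCK_K):
           # Blocked loads (with bounds-checking masks), then a block-level matrix product.
           a = tl.load(a_ptrs, mask=(rm[:, None] < M) & (rk[None, :] + k < K), other=0.0)
           b = tl.load(b_ptrs, mask=(rk[:, None] + k < K) & (rn[None, :] < N), other=0.0)
           acc += tl.dot(a, b)
           a_ptrs += BLOCK_K * stride_ak
           b_ptrs += BLOCK_K * stride_bk
       c_ptrs = c_ptr + rm[:, None] * stride_cm + rn[None, :] * stride_cn
       tl.store(c_ptrs, acc, mask=(rm[:, None] < M) & (rn[None, :] < N))


   # Launch: one program instance per [BLOCK_M, BLOCK_N] tile of the output.
   M, N, K = 512, 512, 512
   a = torch.randn((M, K), device="cuda", dtype=torch.float16)
   b = torch.randn((K, N), device="cuda", dtype=torch.float16)
   c = torch.empty((M, N), device="cuda", dtype=torch.float32)
   grid = (triton.cdiv(M, 64), triton.cdiv(N, 64))
   matmul_kernel[grid](a, b, c, M, N, K,
                       a.stride(0), a.stride(1), b.stride(0), b.stride(1),
                       c.stride(0), c.stride(1),
                       BLOCK_M=64, BLOCK_N=64, BLOCK_K=32)

Note how the loop nest mirrors the right-hand column of the table above: the ``for k`` loop walks
over ``BLOCK_K``-wide slices of the reduction dimension, and ``tl.dot`` performs the block-level
``acc += A[m:m+MB, k:k+KB] @ B[k:k+KB, n:n+NB]`` update, while the compiler decides how each block
is mapped onto the underlying threads.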
--------------
References
--------------

.. [SUTSKEVER2014] I. Sutskever et al., "Sequence to Sequence Learning with Neural Networks", NIPS 2014
.. [REDMON2016] J. Redmon et al., "You Only Look Once: Unified, Real-Time Object Detection", CVPR 2016
.. [LEE2017] K. Lee et al., "Superhuman Accuracy on the SNEMI3D Connectomics Challenge", arXiv 2017
.. [BAGHDADI2021] R. Baghdadi et al., "Tiramisu: A Polyhedral Compiler for Expressing Fast and Portable Code", CGO 2021
.. [VASILACHE2018] N. Vasilache et al., "Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions", arXiv 2018
.. [JRK2013] J. Ragan-Kelley et al., "Halide: A Language and Compiler for Optimizing Parallelism, Locality, and Recomputation in Image Processing Pipelines", PLDI 2013
.. [CHEN2018] T. Chen et al., "TVM: An Automated End-to-End Optimizing Compiler for Deep Learning", OSDI 2018
.. [LAM1991] M. Lam et al., "The Cache Performance and Optimizations of Blocked Algorithms", ASPLOS 1991
.. [AUGUIN1983] M. Auguin et al., "Opsila: an advanced SIMD for numerical analysis and signal processing", EUROMICRO 1983

triton-2.0.0/docs/programming-guide/chapter-1/triton-parallel-matmul.png
[binary PNG image data omitted]

triton-2.0.0/docs/programming-guide/chapter-2/
triton-2.0.0/docs/programming-guide/chapter-2/halide-iteration.png
[binary PNG image data omitted]

triton-2.0.0/docs/programming-guide/chapter-2/polyhedral-iteration.png
[binary PNG image data omitted]
f3ZZZ` 7nLU:$ ;eY+V^2bAū"?Gª̷>;'cƃWy 'h4uSOTGD"TUU-j~L~~>&m{YT*(//X àBXRƕM__>XEUUՔF0D>ʐc瓦W$yRYYUpgF#|E/-&Iz>"K_x=_4 "IMX᩾> u$bͫ?sʪ)(xY@cXx<U9DRRRI&irԉ&p%ID0ͶDp8biii |='pllLhUuN|l)'* k֬[ 6 "I, 8Ìxs zq7מ uCD<:R'GrW9y~ʒʉ&xjNՅD"lbI0ũBf2 #yrbA$?ߏ; ɷ87|.55`YvfD2g(q!Wnd2ٜn0Qw>Jr:x8B BCy:yk 6pi&l\.WH9t4I$͸P(H"}^6mtbxq܌jD"9A|!ǒJs\< -"BN>lFJJʼ-ݡ^\[N6۵s6-bC9ZxB:^[T/ !0P*Ƈ\]][XsB`b 07yv}{_k+gy53+ނ7+\a%Jx?\>rMOe.)))sޛvp'ǴJZ!~4͜שl~9٥A02LTD<Ϭdf rB>VFFƜ78o8 :҂w.]oCd\?dP*VCx,;="O@ooL&7ZiafX,[Y \s|@ %D*.\WYX -fbLt:rmDy+K,[Y\H`>9,g8lyb%<,\u,RW^8gT*V-reqϏ^sl0p8k0ҟཀj@z!RSS "n;ci9X"h{fqVp@ ds-fsa+55ut|||V2h4q Jb'j !$7g~K!HPcM DzJyWsP%9\0?mCabg,IsΒd'pS&d)%yo,kf]rT~|7OBPꐻŃڕ7tW wp?߄U%@˄By|H?Yf=R--x)))Xwb,k>|ǒµ!xBn Up˩ZeJq\@KK07B,cڵIUy|[bddD$ĉzP*VDv;&TJ%֭[C9)Jra*ٌab()+dlF[[۔%I0 ˆUFFdYwkz0 hii2A$iZeffb||\pnxxV bh<pQ܋D"@-g.x<DRuΛDoo/Z[[kR)֭[ЋMD*BVcdd;ŨG]8e p]w%UK|v;6>Xxkn@IJ};4SZ[SRRPWW b8m|JBjj*Rϊl6=##k֬I s1 hjj2aAP@*#q0 vyRkkQQ***.6DP,#--MX!vp`|||H*F1uALFLer`0111\d2ֆ@"r\8tд܁bvTըl6aAaVUIxKKq>~i뮋ҰDZv4o5- l9}+˲(..FqqqR ARƍӃn;l6 îf"Q^^ԕN͛7y^"Pp+j4Vioo*'gZEdz+VH0vt:\VU$%%%I_JX~=) U)tH$(--EAAADG* 6mBWW 󘘘^/D=_?999ַSN9%z'$DDɷ-Wm-ϟ06f6>aabbvvv?tV>)((FFF`2wtJn0L*chhB2)2ҳ~RT81:::)))DvvvDvN$FGG1:::m0h4DNNNRa 77WϴOH$Bzz/BȆ {7HnD"N<$ lN'L&xL&(JTf0@" sIKKCZZV\ abj:߇C =&''a!Hjn>B,;e˅qx<H$Yk1d2PTTrƲG]ĸr 7 _uXSwoLLH$S&Rd2(Jx^ qa`Syt eY@ab tN*"%%eQ$ak7z{Bg&\B!B!FA$I{)߾L lތm B!BYrDg>^ w;P!B!˂H_n"u.6B!($x7BgKv^B!BYVD~;^睇 zM. !B!dQIwߔœ>.ޱ }B!BHPIp{/~30|蹗)B! ܃_J!K.O;,K;/D" aH :aaFu{!me]A2<"* oaD[%kpޣC1e2}XIPDq/%%fb-eb5DB2$ˣ]b1U!b$=jK@ۿ ! wB!B$ 浟} @0I IDAT?>J!B! " 䕟'WB!BbG? !$kqB!BHL q }az|?R!B!1&ǹ0H)B!G="q뮃BHO~% !^9`Z<< M0<CXx[GX{!$;qXx/bZ9Jǩ]y/dy'6ۯ'pߠh*lhV]a8b׆Z8PTTnE4's,Foo/@"333%JNDЎ+sOS "B!qHo} D/pmF!B! "q.^|'B!w(߾5v!  !B!$.Qu\oŨPyuB!BHܣ : >Ī+S!B! Hz V~ !B!$aP!^Oy&_z!T|!Ŀ5 L&+}\.N0\svX,!]H$t-ڣeP(Q.M #ز>pR)j%_fLH$.F\d.BS(`ݮh#n,KER4E{ LbxXEFFF^n f>'Qt7b&B!IDpM46^KX(<B!(D㧞 !ky]B!B!$4 "Qq8)@ d2s9" !B!$Q͆6lQG^@P!B!I2sxrzBi;wBB!BHҠ xbzn[d5.Bq8-.-#BK!ryi0< ' pBDBb<^/z_pfg={NˡB!BH(,PoCvmI 㪃dS!B!$DD%H''C޽p(uB!BYmۋA>tك>*,*KK<&&&`t:z! ɠji\88N'L&B@ZZ˅ 8N8.xe, XhԪh/.l6X,8nH$d2j(h/.LNNjr´4Z#^wsHϓy<y|eGGJKKiEFw2F+ؿ?9Xy=S8sqYgSyyXWQ޿:aC$zzz0:: 30 t@/s ca4zgX A,s cE?N8)DBףV:FOO`fTdgg*'l\0 4 򐛛K '1L8g+B-s c߉'000z/{w}7T*q#///&$D  /7|n{r:;r^-[0!_M6M;f ."}`rBӉV /ߩjTVVRe2<^UVQe0`g~ L H$BAA˓2<5kCL$ PXXmmmB)))Xr?A2lhmmCjQUU%*Y0hmm?#Ad`"3 g-|]$OUW],n\8NxNr(((9%L1L)|MR+VBGJ ڦ .W\qPi#_$:q޽dR2LLLQ2BNZ T avaZ166,H$BeeeRĜ8qG @*y\.LNN`0LlKR!---ŏ:ގnk -- :rN`3 0::sVQWW񠩩iJO,C"==2 bnNF&iJKuvv6V^jVlfdd@VC.C$rn`0`bbbJ`)//Giii4 :4%@!55Ux\.X, )meQSSԽ===hkk+aVT*aTx'! (,~((([i.KxhZdffF@I*!dw wsJ;gqT3`-N{Y^p_Pqܸq#~w׾讨+Ob2p<(J۪gXpq@EE1봴4B{`@WWPaI9<'N^ggg2LﶥU chhB J8S.p{ ,"??s ۍ> 4_>Ɯ[VݻWd25+. (((@ee咗9֌СCµRPRRV;翛1_E^gdddJ}Ǧ p&dz*x+ rw}wR>CA$6s=.b8N'[QYY9:Ԅ7׿Պ娩AII > ۍ77!v8+:l68p@h [\*"++ b% T*URupЂ%%%RRDvv28^VVVRMDoo/_z(((qkU!X* YYYq0M4hll> ̜,+FpÑT-. 
triton-2.0.0/docs/programming-guide/chapter-2/related-work.rst000066400000000000000000000342131440023377100243200ustar00rootroot00000000000000============== Related Work ============== At first sight, Triton may seem like just yet another DSL for DNNs. The purpose of this section is to contextualize Triton and highlight its differences with the two leading approaches in this domain: polyhedral compilation and scheduling languages. ----------------------- Polyhedral Compilation ----------------------- Traditional compilers typically rely on intermediate representations, such as LLVM-IR [LATTNER2004]_, that encode control flow information using (un)conditional branches. This relatively low-level format makes it difficult to statically analyze the runtime behavior (e.g., cache misses) of input programs, and to automatically optimize loops accordingly through the use of tiling [WOLFE1989]_, fusion [DARTE1999]_ and interchange [ALLEN1984]_. To solve this issue, polyhedral compilers [ANCOURT1991]_ rely on program representations that have statically predictable control flow, thereby enabling aggressive compile-time program transformations for data locality and parallelism. 
Though this strategy has been adopted by many languages and compilers for DNNs such as Tiramisu [BAGHDADI2021]_, Tensor Comprehensions [VASILACHE2018]_, Diesel [ELANGO2018]_ and the Affine dialect in MLIR [LATTNER2019]_, it also comes with a number of limitations that will be described later in this section. +++++++++++++++++++++++ Program Representation +++++++++++++++++++++++ Polyhedral compilation is a vast area of research. In this section we only outline the most basic aspects of this topic, but readers interested in the solid mathematical foundations underneath may refer to the ample literature on linear and integer programming. .. table:: :widths: 50 50 +-----------------------------------------------------+-----------------------------------------------------+ | | | |.. code-block:: C | |pic1| | | | | | for(int i = 0; i < 3; i++) | | | for(int j = i; j < 5; j++) | | | A[i][j] = 0; | | +-----------------------------------------------------+-----------------------------------------------------+ .. |pic1| image:: polyhedral-iteration.png :width: 300 Polyhedral compilers focus on a class of programs commonly known as **Static Control Parts** (SCoP), *i.e.*, maximal sets of consecutive statements in which conditionals and loop bounds are affine functions of surrounding loop indices and global invariant parameters. As shown above, programs in this format always lead to iteration domains that are bounded by affine inequalities, i.e., polyhedral. These polyhedra can also be defined algebraically; for the above example: .. math:: \mathcal{P} = \{ i, j \in \mathbb{Z}^2 ~|~ \begin{pmatrix} 1 & 0 \\ -1 & 0 \\ -1 & 1 \\ 0 & -1 \\ \end{pmatrix} \begin{pmatrix} i \\ j \end{pmatrix} + \begin{pmatrix} 0 \\ 2 \\ 0 \\ 4 \end{pmatrix} \geq 0 \} Each point :math:`(i, j)` in :math:`\mathcal{P}` represents a *polyhedral statement*, that is a program statement which (1) does not induce control-flow side effects (e.g., :code:`for`, :code:`if`, :code:`break`) and (2) contains only affine functions of loop indices and global parameters in array accesses. To facilitate alias analysis, array accesses are also mathematically abstracted, using so-called *access function*. In other words, :code:`A[i][j]` is simply :code:`A[f(i,j)]` where the access function :math:`f` is defined by: .. math:: f(i, j) = \begin{pmatrix} 1 & 0\\ 0 & 1\\ \end{pmatrix} \begin{pmatrix} i\\ j \end{pmatrix} = (i, j) Note that the iteration domains of an SCoP does not specify the order in which its statements shall execute. In fact, this iteration domain may be traversed in many different possible legal orders, i.e. *schedules*. Formally, a schedule is defined as a p-dimensional affine transformation :math:`\Theta` of loop indices :math:`\mathbf{x}` and global invariant parameters :math:`\mathbf{g}`: .. math:: \Theta_S(\mathbf{x}) = T_S \begin{pmatrix} \vec{x}\\ \vec{g}\\ 1 \end{pmatrix} \qquad T_S \in \mathbb{Z} ^{p \times (\text{dim}(\mathbf{x}) + \text{dim}(\mathbf{g}) + 1)} Where :math:`\Theta_S(\mathbf{x})` is a p-dimensional vector representing the slowest to fastest growing indices (from left to right) when traversing the loop nest surrounding :math:`S`. For the code shown above, the original schedule defined by the loop nest in C can be retrieved by using: .. math:: \Theta_S(\mathbf{x}) = \begin{pmatrix} 1 & 0 \\ 0 & 1 \\ \end{pmatrix} \begin{pmatrix} i & j \end{pmatrix}^T = \begin{pmatrix} i & j \end{pmatrix}^T where :math:`i` and :math:`j` are respectively the slowest and fastest growing loop indices in the nest. 
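As an illustrative aside (not part of the original text), a different choice of :math:`T_S` encodes a loop interchange for the same nest:

.. math::

    \Theta_S(\mathbf{x}) = \begin{pmatrix} 0 & 1 \\ 1 & 0 \\ \end{pmatrix} \begin{pmatrix} i & j \end{pmatrix}^T = \begin{pmatrix} j & i \end{pmatrix}^T

which makes :math:`j` the slowest growing index and corresponds to traversing the same iteration domain as:

.. code-block:: C

    // same polyhedron as above, traversed j-outermost
    for(int j = 0; j < 5; j++)
      for(int i = 0; i < 3 && i <= j; i++)
        A[i][j] = 0;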
If :math:`T_S` is a vector (resp. tensor), then :math:`\Theta_S` is said to be one-dimensional (resp. multi-dimensional). +++++++++++ Advantages +++++++++++ Programs amenable to polyhedral compilation can be aggressively transformed and optimized. Most of these transformations actually boil down to the production of schedules and iteration domains that enable loop transformations promoting parallelism and spatial/temporal data locality (e.g., fusion, interchange, tiling, parallelization). Polyhedral compilers can also automatically go through complex verification processes to ensure that the semantics of their input program is preserved throughout this optimization phase. Note that polyhedral optimizers are not incompatible with more standard optimization techniques. In fact, it is not uncommon for these systems to be implemented as a set of LLVM passes that can be run ahead of more traditional compilation techniques [GROSSER2012]_. All in all, polyhedral machinery is extremely powerful, when applicable. It has been shown to support most common loop transformations, and has indeed achieved performance comparable to state-of-the-art GPU libraries for dense matrix multiplication [ELANGO2018]_. Additionally, it is fully automatic and doesn't require any hint from programmers apart from source code in a C-like format. ++++++++++++ Limitations ++++++++++++ Unfortunately, polyhedral compilers suffer from two major limitations that have prevented their adoption as a universal method for code generation in neural networks. First, the set of possible program transformations :math:`\Omega = \{ \Theta_S ~|~ S \in \text{program} \}` is large, and grows with the number of statements in the program as well as with the size of their iteration domain. Verifying the legality of each transformation can also require the resolution of complex integer linear programs, making polyhedral compilation very computationally expensive. To make matters worse, hardware properties (e.g., cache size, number of SMs) and contextual characteristics (e.g., input tensor shapes) also have to be taken into account by this framework, leading to expensive auto-tuning procedures [SATO2019]_. Second, the polyhedral framework is not very generally applicable; SCoPs are relatively common [GIRBAL2006]_ but require loop bounds and array subscripts to be affine functions of loop indices, which typically only occurs in regular, dense computations. For this reason, this framework has yet to be successfully applied to sparse -- or even structured-sparse -- neural networks, whose importance has been rapidly rising over the past few years. On the other hand, blocked program representations advocated by this dissertation are less restricted in scope and can achieve close to peak performance using standard dataflow analysis. ----------------------- Scheduling Languages ----------------------- Separation of concerns [DIJKSTRA82]_ is a well-known design principle in computer science: programs should be decomposed into modular layers of abstraction that separate the semantics of their algorithms from the details of their implementation. Systems like Halide and TVM push this philosophy one step further, and enforce this separation at the grammatical level through the use of a **scheduling language**.
The benefits of this methodology are particularly visible in the case of matrix multiplication, where, as one can see below, the definition of the algorithm (Line 1-7) is completely disjoint from its implementation (Line 8-16), meaning that both can be maintained, optimized and distributed independently. .. code-block:: python :linenos: // algorithm Var x("x"), y("y"); Func matmul("matmul"); RDom k(0, matrix_size); RVar ki; matmul(x, y) = 0.0f; matmul(x, y) += A(k, y) * B(x, k); // schedule Var xi("xi"), xo("xo"), yo("yo"), yi("yo"), yii("yii"), xii("xii"); matmul.vectorize(x, 8); matmul.update(0) .split(x, x, xi, block_size).split(xi, xi, xii, 8) .split(y, y, yi, block_size).split(yi, yi, yii, 4) .split(k, k, ki, block_size) .reorder(xii, yii, xi, ki, yi, k, x, y) .parallel(y).vectorize(xii).unroll(xi).unroll(yii); The resulting code may however not be completely portable, as schedules can sometimes rely on execution models (e.g., SPMD) or hardware intrinsics (e.g., matrix-multiply-accumulate) that are not widely available. This issue can be mitigated by auto-scheduling mechanisms [MULLAPUDI2016]_. +++++++++++ Advantages +++++++++++ The main advantage of this approach is that it allows programmers to write an algorithm *only once*, and focus on performance optimization separately. It makes it possible to manually specify optimizations that a polyhedral compiler wouldn't be able to figure out automatically using static data-flow analysis. Scheduling languages are, without a doubt, one of the most popular approaches for neural network code generation. The most popular system for this purpose is probably TVM, which provides good performance across a wide range of platforms as well as built-in automatic scheduling mechanisms. ++++++++++++ Limitations ++++++++++++ This ease-of-development comes at a cost. First of all, existing systems that follow this paradigm tend to be noticeably slower than Triton on modern hardware when applicable (e.g., V100/A100 tensor cores w/ equal tile sizes). I do believe that this is not a fundamental issue of scheduling languages -- in the sense that it could probably be solved with more efforts -- but it could mean that these systems are harder to engineer. More importantly, existing scheduling languages generate loops whose bounds and increments cannot depend on surrounding loop indices without at least imposing severe constraints on possible schedules -- if not breaking the system entirely. This is problematic for sparse computations, whose iteration spaces may be irregular. .. table:: :widths: 50 50 +-----------------------------------------------------+-----------------------------------------------------+ | | | |.. code-block:: C | |pic2| | | | | | for(int i = 0; i < 4; i++) | | | for(int j = 0; j < 4; j++) | | | float acc = 0; | | | for(int k = 0; k < K[i]; k++) | | | acc += A[i][col[i,k]]*B[k][j] | | | C[i][j] = acc; | | +-----------------------------------------------------+-----------------------------------------------------+ .. |pic2| image:: halide-iteration.png :width: 300 On the other hand, the block-based program representation that we advocate for through this work allows for block-structured iteration spaces and allows programmers to manually handle load-balancing as they wish. -------------- References -------------- .. [LATTNER2004] C. Lattner et al., "LLVM: a compilation framework for lifelong program analysis transformation", CGO 2004 .. [WOLFE1989] M. Wolfe, "More Iteration Space Tiling", SC 1989 .. [DARTE1999] A. 
Darte, "On the Complexity of Loop Fusion", PACT 1999 .. [ALLEN1984] J. Allen et al., "Automatic Loop Interchange", SIGPLAN Notices 1984 .. [ANCOURT1991] C. Ancourt et al., "Scanning Polyhedra with DO Loops", PPoPP 1991 .. [BAGHDADI2021] R. Baghdadi et al., "Tiramisu: A Polyhedral Compiler for Expressing Fast and Portable Code", CGO 2021 .. [VASILACHE2018] N. Vasilache et al., "Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions", ArXiV 2018 .. [ELANGO2018] V. Elango et al. "Diesel: DSL for Linear Algebra and Neural Net Computations on GPUs", MAPL 2018 .. [LATTNER2019] C. Lattner et al., "MLIR Primer: A Compiler Infrastructure for the End of Moore’s Law", Arxiv 2019 .. [GROSSER2012] T. Grosser et al., "Polly - Performing Polyhedral Optimizations on a Low-Level Intermediate Representation", Parallel Processing Letters 2012 .. [SATO2019] Y. Sato et al., "An Autotuning Framework for Scalable Execution of Tiled Code via Iterative Polyhedral Compilation", TACO 2019 .. [GIRBAL2006] S. Girbal et al., "Semi-Automatic Composition of Loop Transformations for Deep Parallelism and Memory Hierarchies", International Journal of Parallel Programming 2006 .. [DIJKSTRA82] E. W. Dijkstra et al., "On the role of scientific thought", Selected writings on computing: a personal perspective 1982 .. [MULLAPUDI2016] R. Mullapudi et al., "Automatically scheduling halide image processing pipelines", TOG 2016 triton-2.0.0/docs/python-api/000077500000000000000000000000001440023377100160515ustar00rootroot00000000000000triton-2.0.0/docs/python-api/triton.language.rst000066400000000000000000000032021440023377100217010ustar00rootroot00000000000000triton.language ================ .. currentmodule:: triton.language Programming Model ------------------- .. autosummary:: :toctree: generated :nosignatures: program_id num_programs Creation Ops ------------- .. autosummary:: :toctree: generated :nosignatures: arange zeros Shape Manipulation Ops ----------------------- .. autosummary:: :toctree: generated :nosignatures: broadcast_to reshape ravel Linear Algebra Ops ------------------- .. autosummary:: :toctree: generated :nosignatures: dot Memory Ops -------------------- .. autosummary:: :toctree: generated :nosignatures: load store atomic_cas atomic_xchg Indexing Ops -------------- .. autosummary:: :toctree: generated :nosignatures: where Math Ops ---------- .. autosummary:: :toctree: generated :nosignatures: exp log cos sin sqrt sigmoid softmax Reduction Ops --------------- .. autosummary:: :toctree: generated :nosignatures: max min sum Atomic Ops --------------- .. autosummary:: :toctree: generated :nosignatures: atomic_cas atomic_add atomic_max atomic_min Comparison ops --------------- .. autosummary:: :toctree: generated :nosignatures: minimum maximum .. _Random Number Generation: Random Number Generation ------------------------- .. autosummary:: :toctree: generated :nosignatures: randint4x randint rand randn Compiler Hint Ops ------------------- .. autosummary:: :toctree: generated :nosignatures: multiple_oftriton-2.0.0/docs/python-api/triton.rst000066400000000000000000000002271440023377100201230ustar00rootroot00000000000000triton ======== .. currentmodule:: triton .. autosummary:: :toctree: generated :nosignatures: jit autotune heuristics Configtriton-2.0.0/docs/python-api/triton.testing.rst000066400000000000000000000002531440023377100215760ustar00rootroot00000000000000triton.testing ================ .. currentmodule:: triton.testing .. 
autosummary:: :toctree: generated :nosignatures: do_bench Benchmark perf_reporttriton-2.0.0/include/000077500000000000000000000000001440023377100144545ustar00rootroot00000000000000triton-2.0.0/include/CMakeLists.txt000066400000000000000000000000311440023377100172060ustar00rootroot00000000000000add_subdirectory(triton) triton-2.0.0/include/triton/000077500000000000000000000000001440023377100157735ustar00rootroot00000000000000triton-2.0.0/include/triton/Analysis/000077500000000000000000000000001440023377100175565ustar00rootroot00000000000000triton-2.0.0/include/triton/Analysis/Alias.h000066400000000000000000000047061440023377100207670ustar00rootroot00000000000000#ifndef TRITON_ANALYSIS_ALIAS_H #define TRITON_ANALYSIS_ALIAS_H #include "mlir/Analysis/AliasAnalysis.h" #include "mlir/Analysis/DataFlowAnalysis.h" #include "llvm/ADT/DenseSet.h" namespace mlir { class AliasInfo { public: AliasInfo() = default; AliasInfo(Value value) { insert(value); } void insert(Value value) { allocs.insert(value); } const DenseSet &getAllocs() const { return allocs; } bool operator==(const AliasInfo &other) const { return allocs == other.allocs; } /// The pessimistic value state of a value without alias static AliasInfo getPessimisticValueState(MLIRContext *context) { return AliasInfo(); } static AliasInfo getPessimisticValueState(Value value) { return AliasInfo(); } /// The union of both arguments static AliasInfo join(const AliasInfo &lhs, const AliasInfo &rhs); private: /// The set of allocated values that are aliased by this lattice. /// For now, we only consider aliased value produced by the following /// situations: /// 1. values returned by scf.yield /// 2. block arguments in scf.for /// Example: /// alloc v1 alloc v2 /// | | /// |--------------| |------------| /// scf.for v3 scf.for v4 scf.for v5 /// | /// scf.yield v6 /// /// v1's alloc [v1] /// v2's alloc [v2] /// v3's alloc [v1] /// v4's alloc [v1, v2] /// v5's alloc [v2] /// v6's alloc [v1] /// /// Therefore, v1's liveness range is the union of v3, v4, and v6 /// v2's liveness range is the union of v4 and v5. DenseSet allocs; }; //===----------------------------------------------------------------------===// // Shared Memory Alias Analysis //===----------------------------------------------------------------------===// class SharedMemoryAliasAnalysis : public ForwardDataFlowAnalysis { public: using ForwardDataFlowAnalysis::ForwardDataFlowAnalysis; /// XXX(Keren): Compatible interface with MLIR AliasAnalysis for future use. /// Given two values, returns their aliasing behavior. AliasResult alias(Value lhs, Value rhs); /// Returns the modify-reference behavior of `op` on `location`. ModRefResult getModRef(Operation *op, Value location); /// Computes if the alloc set of the results are changed. 
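/// Illustrative sketch (added commentary, not part of the original header):
/// how two lattice values could be combined at a merge point using only the
/// AliasInfo API declared above; the value names are placeholders.
///
///   AliasInfo a(allocV1);                      // aliases {allocV1}
///   AliasInfo b(allocV2);                      // aliases {allocV2}
///   AliasInfo joined = AliasInfo::join(a, b);  // aliases {allocV1, allocV2}
///
/// This mirrors the scf.for example in the AliasInfo class comment, where a
/// loop-carried value may alias several allocations at once.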
ChangeResult visitOperation(Operation *op, ArrayRef *> operands) override; }; } // namespace mlir #endif // TRITON_ANALYSIS_ALIAS_H triton-2.0.0/include/triton/Analysis/Allocation.h000066400000000000000000000137661440023377100220310ustar00rootroot00000000000000#ifndef TRITON_ANALYSIS_ALLOCATION_H #define TRITON_ANALYSIS_ALLOCATION_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/Support/raw_ostream.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include #include namespace mlir { namespace triton { class AllocationAnalysis; SmallVector getScratchConfigForCvtLayout(triton::gpu::ConvertLayoutOp op, unsigned &inVec, unsigned &outVec); } // namespace triton /// Modified from llvm-15.0: llvm/ADT/AddressRanges.h /// A class that represents an interval, specified using a start and an end /// values: [Start, End). template class Interval { public: Interval() {} Interval(T S, T E) : Start(S), End(E) { assert(Start <= End); } T start() const { return Start; } T end() const { return End; } T size() const { return End - Start; } bool contains(T Addr) const { return Start <= Addr && Addr < End; } bool intersects(const Interval &R) const { return Start < R.End && R.Start < End; } bool operator==(const Interval &R) const { return Start == R.Start && End == R.End; } bool operator!=(const Interval &R) const { return !(*this == R); } bool operator<(const Interval &R) const { return std::make_pair(Start, End) < std::make_pair(R.Start, R.End); } private: T Start = std::numeric_limits::min(); T End = std::numeric_limits::max(); }; class Allocation { public: /// A unique identifier for shared memory buffers using BufferId = size_t; using BufferIdSetT = DenseSet; static constexpr BufferId InvalidBufferId = std::numeric_limits::max(); /// Creates a new Allocation analysis that computes the shared memory /// information for all associated shared memory values. Allocation(Operation *operation) : operation(operation) { run(); } /// Returns the operation this analysis was constructed from. Operation *getOperation() const { return operation; } /// Returns the offset of the given buffer in the shared memory. size_t getOffset(BufferId bufferId) const { return bufferSet.at(bufferId).offset; } /// Returns the size of the given buffer in the shared memory. size_t getAllocatedSize(BufferId bufferId) const { return bufferSet.at(bufferId).size; } /// Returns the buffer id of the given value. /// This interface only returns the allocated buffer id. /// If you want to get all the buffer ids that are associated with the given /// value, including alias buffers, use getBufferIds. BufferId getBufferId(Value value) const { if (valueBuffer.count(value)) { return valueBuffer.lookup(value)->id; } else { return InvalidBufferId; } } /// Returns all the buffer ids of the given value, including alias buffers. BufferIdSetT getBufferIds(Value value) const { BufferIdSetT bufferIds; auto allocBufferId = getBufferId(value); if (allocBufferId != InvalidBufferId) bufferIds.insert(allocBufferId); for (auto *buffer : aliasBuffer.lookup(value)) { if (buffer->id != InvalidBufferId) bufferIds.insert(buffer->id); } return bufferIds; } /// Returns the scratch buffer id of the given value. 
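/// Illustrative usage (added commentary, not part of the original header):
/// querying where an operation's scratch buffer lives in shared memory using
/// only the accessors declared in this class; `moduleOp` and `reduceOp` are
/// placeholder values.
///
///   Allocation allocation(moduleOp);
///   Allocation::BufferId id = allocation.getBufferId(reduceOp);
///   if (id != Allocation::InvalidBufferId) {
///     size_t offsetInBytes = allocation.getOffset(id);
///     size_t sizeInBytes = allocation.getAllocatedSize(id);
///   }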
BufferId getBufferId(Operation *operation) const { if (opScratch.count(operation)) { return opScratch.lookup(operation)->id; } else { return InvalidBufferId; } } /// Returns the size of total shared memory allocated size_t getSharedMemorySize() const { return sharedMemorySize; } bool isIntersected(BufferId lhsId, BufferId rhsId) const { if (lhsId == InvalidBufferId || rhsId == InvalidBufferId) return false; auto lhsBuffer = bufferSet.at(lhsId); auto rhsBuffer = bufferSet.at(rhsId); return lhsBuffer.intersects(rhsBuffer); } private: /// A class that represents a shared memory buffer struct BufferT { enum class BufferKind { Explicit, Scratch }; /// MT: thread-safe inline static std::atomic nextId = 0; BufferKind kind; BufferId id; size_t size; size_t offset; bool operator==(const BufferT &other) const { return id == other.id; } bool operator<(const BufferT &other) const { return id < other.id; } BufferT() : BufferT(BufferKind::Explicit) {} BufferT(BufferKind kind) : kind(kind), id(InvalidBufferId), size(0), offset(0) {} BufferT(BufferKind kind, size_t size) : BufferT(kind, size, 0) {} BufferT(BufferKind kind, size_t size, size_t offset) : kind(kind), id(nextId++), size(size), offset(offset) {} bool intersects(const BufferT &other) const { return Interval(offset, offset + size) .intersects( Interval(other.offset, other.offset + other.size)); } }; /// Op -> Scratch Buffer using OpScratchMapT = DenseMap; /// Value -> Explicit Buffer using ValueBufferMapT = llvm::MapVector; /// Value -> Alias Buffer using AliasBufferMapT = llvm::MapVector>; /// BufferId -> Buffer using BufferSetT = std::map; /// Runs allocation analysis on the given top-level operation. void run(); private: template void addBuffer(KeyType &key, Args &&...args) { auto buffer = BufferT(Kind, std::forward(args)...); bufferSet[buffer.id] = std::move(buffer); if constexpr (Kind == BufferT::BufferKind::Explicit) { valueBuffer[key] = &bufferSet[buffer.id]; } else { opScratch[key] = &bufferSet[buffer.id]; } } void addAlias(Value value, Value alloc) { aliasBuffer[value].insert(valueBuffer[alloc]); } private: Operation *operation; OpScratchMapT opScratch; ValueBufferMapT valueBuffer; AliasBufferMapT aliasBuffer; BufferSetT bufferSet; size_t sharedMemorySize = 0; friend class triton::AllocationAnalysis; }; } // namespace mlir #endif // TRITON_ANALYSIS_ALLOCATION_H triton-2.0.0/include/triton/Analysis/AxisInfo.h000066400000000000000000000206701440023377100214540ustar00rootroot00000000000000#ifndef TRITON_ANALYSIS_AXISINFO_H #define TRITON_ANALYSIS_AXISINFO_H #include "mlir/Analysis/DataFlowAnalysis.h" #include "llvm/Support/raw_ostream.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include #include namespace mlir { //===----------------------------------------------------------------------===// // AxisInfo //===----------------------------------------------------------------------===// /// This lattice value represents known information on the axes of a lattice. 
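/// Illustrative example (added commentary, not part of the original header):
/// using the per-dimension definitions documented inside the class below, the
/// 1-D tensor
///   [8, 9, 10, 11, 12, 13, 14, 15]
/// would have contiguity [8] (all eight values are consecutive), divisibility
/// [8] (its first element, 8, is divisible by 8) and constancy [1] (no value
/// repeats along the axis).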
class AxisInfo { public: typedef SmallVector DimVectorT; public: /// Default constructor AxisInfo() : AxisInfo({}, {}, {}) {} /// Construct contiguity info with known contiguity AxisInfo(DimVectorT knownContiguity, DimVectorT knownDivisibility, DimVectorT knownConstancy) : AxisInfo(knownContiguity, knownDivisibility, knownConstancy, {}) {} AxisInfo(DimVectorT knownContiguity, DimVectorT knownDivisibility, DimVectorT knownConstancy, std::optional knownConstantValue) : contiguity(knownContiguity), divisibility(knownDivisibility), constancy(knownConstancy), constantValue(knownConstantValue), rank(contiguity.size()) { assert(knownContiguity.size() == static_cast(rank)); assert(knownDivisibility.size() == static_cast(rank)); assert(knownConstancy.size() == static_cast(rank)); } /// Accessors int64_t getContiguity(size_t dim) const { return contiguity[dim]; } const DimVectorT &getContiguity() const { return contiguity; } int64_t getDivisibility(size_t dim) const { return divisibility[dim]; } const DimVectorT &getDivisibility() const { return divisibility; } int64_t getConstancy(size_t dim) const { return constancy[dim]; } const DimVectorT &getConstancy() const { return constancy; } int getRank() const { return rank; } std::optional getConstantValue() const { return constantValue; } /// Comparison bool operator==(const AxisInfo &other) const { return (contiguity == other.contiguity) && (divisibility == other.divisibility) && (constancy == other.constancy) && (constantValue == other.constantValue) && (rank == other.rank); } /// The pessimistic value state of the contiguity is unknown. static AxisInfo getPessimisticValueState(MLIRContext *context) { return AxisInfo(); } static AxisInfo getPessimisticValueState(Value value); /// The gcd of both arguments for each dimension static AxisInfo join(const AxisInfo &lhs, const AxisInfo &rhs); private: /// The _contiguity_ information maps the `d`-th /// dimension to the length of the shortest /// sequence of contiguous integers along it. /// Suppose we have an array of N elements, /// with a contiguity value C, /// the array can be divided into a list of /// N/C sequences of C contiguous elements. /// Since we have N = 2^k, C must be a power of two. /// For example: /// [10, 11, 12, 13, 18, 19, 20, 21] /// [20, 21, 22, 23, 28, 29, 30, 31] /// Would have contiguity [1, 4]. /// and /// [12, 16, 20, 24] /// [13, 17, 21, 25] /// [14, 18, 22, 26] /// [15, 19, 23, 27] /// [18, 22, 26, 30] /// [19, 23, 27, 31] /// Would have contiguity [2, 1]. DimVectorT contiguity; /// The _divisibility_ information maps the `d`-th /// dimension to the largest power-of-two that /// divides the first element of all the values along it /// For example: /// [10, 11, 12, 13, 18, 19, 20, 21] /// [20, 21, 22, 23, 28, 29, 30, 31] // would have divisibility [1, 2] // and /// [12, 16, 20, 24] /// [13, 17, 21, 25] /// [14, 18, 22, 26] /// [15, 19, 23, 27] // would have divisibility [4, 1] DimVectorT divisibility; /// The _constancy_ information maps the `d`-th /// dimension to the length of the shortest /// sequence of constant integer along it. This is /// particularly useful to infer the contiguity /// of operations (e.g., add) involving a constant. /// Suppose we have an array of N elements, /// with a constancy value C, /// the array can be divided into a list of /// N/C sequences of C elements with the same value. /// Since we have N = 2^k, C must be a power of two. 
/// For example /// [8, 8, 8, 8, 12, 12, 12, 12] /// [16, 16, 16, 16, 20, 20, 20, 20] /// would have constancy [1, 4] DimVectorT constancy; /// The constant value of the lattice if we can infer it. std::optional constantValue; // number of dimensions of the lattice int rank{}; }; class AxisInfoVisitor { public: AxisInfoVisitor() = default; virtual ~AxisInfoVisitor() = default; static bool isContiguousDim(const AxisInfo &info, ArrayRef shape, int dim) { return info.getContiguity(dim) == shape[dim]; } static bool isConstantDim(const AxisInfo &info, ArrayRef shape, int dim) { return info.getConstancy(dim) == shape[dim]; } virtual AxisInfo getAxisInfo(Operation *op, ArrayRef *> operands) = 0; virtual bool match(Operation *op) = 0; }; /// Base class for all operations template class AxisInfoVisitorImpl : public AxisInfoVisitor { public: using AxisInfoVisitor::AxisInfoVisitor; AxisInfo getAxisInfo(Operation *op, ArrayRef *> operands) final { return getAxisInfo(cast(op), operands); } bool match(Operation *op) final { return isa(op); } virtual AxisInfo getAxisInfo(OpTy op, ArrayRef *> operands) { llvm_unreachable("Unimplemented getAxisInfo"); } }; /// Binary operations template class BinaryOpVisitorImpl : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(OpTy op, ArrayRef *> operands) override { auto lhsInfo = operands[0]->getValue(); auto rhsInfo = operands[1]->getValue(); auto rank = lhsInfo.getRank(); assert(operands.size() == 2 && "Expected two operands"); AxisInfo::DimVectorT contiguity; AxisInfo::DimVectorT divisibility; AxisInfo::DimVectorT constancy; auto constantValue = getConstantValue(op, lhsInfo, rhsInfo); for (auto d = 0; d < rank; ++d) { if (constantValue.has_value()) { contiguity.push_back(1); constancy.push_back( std::max(lhsInfo.getConstancy(d), rhsInfo.getConstancy(d))); divisibility.push_back(highestPowOf2Divisor(constantValue.value())); } else { contiguity.push_back(getContiguity(op, lhsInfo, rhsInfo, d)); constancy.push_back(getConstancy(op, lhsInfo, rhsInfo, d)); divisibility.push_back(getDivisibility(op, lhsInfo, rhsInfo, d)); } } return AxisInfo(contiguity, divisibility, constancy, constantValue); } protected: virtual int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) { return 1; } virtual int64_t getDivisibility(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) { return 1; } virtual int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) { return 1; } virtual std::optional getConstantValue(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs) { return {}; } }; class AxisInfoVisitorList { public: template > void append() { (visitors.emplace_back(std::make_unique()), ...); } AxisInfo apply(Operation *op, ArrayRef *> operands) { for (auto &visitor : visitors) if (visitor->match(op)) return visitor->getAxisInfo(op, operands); return AxisInfo(); } private: std::vector> visitors; }; class AxisInfoAnalysis : public ForwardDataFlowAnalysis { private: AxisInfoVisitorList visitors; public: AxisInfoAnalysis(MLIRContext *context); ChangeResult visitOperation(Operation *op, ArrayRef *> operands) override; unsigned getPtrContiguity(Value ptr); unsigned getPtrAlignment(Value ptr); unsigned getMaskAlignment(Value mask); }; } // namespace mlir #endiftriton-2.0.0/include/triton/Analysis/Membar.h000066400000000000000000000102211440023377100211260ustar00rootroot00000000000000#ifndef TRITON_ANALYSIS_MEMBAR_H #define TRITON_ANALYSIS_MEMBAR_H #include "Allocation.h" #include 
"llvm/ADT/SmallPtrSet.h" namespace mlir { class OpBuilder; //===----------------------------------------------------------------------===// // Shared Memory Barrier Analysis //===----------------------------------------------------------------------===// class MembarAnalysis { public: /// Creates a new Membar analysis that generates the shared memory barrier /// in the following circumstances: /// - RAW: If a shared memory write is followed by a shared memory read, and /// their addresses are intersected, a barrier is inserted. /// - WAR: If a shared memory read is followed by a shared memory read, and /// their addresses are intersected, a barrier is inserted. /// The following circumstances do not require a barrier: /// - WAW: not possible because overlapped memory allocation is not allowed. /// - RAR: no write is performed. /// Temporary storage of operations such as Reduce are considered as both /// a shared memory read. If the temporary storage is written but not read, /// it is considered as the problem of the operation itself but not the membar /// analysis. /// The following circumstances are not considered yet: /// - Double buffers /// - N buffers MembarAnalysis(Allocation *allocation) : allocation(allocation) {} /// Runs the membar analysis to the given operation, inserts a barrier if /// necessary. void run(); private: struct RegionInfo { using BufferIdSetT = Allocation::BufferIdSetT; BufferIdSetT syncReadBuffers; BufferIdSetT syncWriteBuffers; RegionInfo() = default; RegionInfo(const BufferIdSetT &syncReadBuffers, const BufferIdSetT &syncWriteBuffers) : syncReadBuffers(syncReadBuffers), syncWriteBuffers(syncWriteBuffers) { } /// Unions two RegionInfo objects. void join(const RegionInfo &other) { syncReadBuffers.insert(other.syncReadBuffers.begin(), other.syncReadBuffers.end()); syncWriteBuffers.insert(other.syncWriteBuffers.begin(), other.syncWriteBuffers.end()); } /// Returns true if buffers in two RegionInfo objects are intersected. bool isIntersected(const RegionInfo &other, Allocation *allocation) const { return /*RAW*/ isIntersected(syncWriteBuffers, other.syncReadBuffers, allocation) || /*WAR*/ isIntersected(syncReadBuffers, other.syncWriteBuffers, allocation) || /*WAW*/ isIntersected(syncWriteBuffers, other.syncWriteBuffers, allocation); } /// Clears the buffers because a barrier is inserted. void sync() { syncReadBuffers.clear(); syncWriteBuffers.clear(); } private: /// Returns true if buffers in two sets are intersected. bool isIntersected(const BufferIdSetT &lhs, const BufferIdSetT &rhs, Allocation *allocation) const { return std::any_of(lhs.begin(), lhs.end(), [&](auto lhsId) { return std::any_of(rhs.begin(), rhs.end(), [&](auto rhsId) { return allocation->isIntersected(lhsId, rhsId); }); }); } }; /// Applies the barrier analysis based on the SCF dialect, in which each /// region has a single basic block only. /// Example: /// region1 /// op1 /// op2 (scf.if) /// region2 /// op3 /// op4 /// region3 /// op5 /// op6 /// op7 /// region2 and region3 started with the information of region1. /// Each region is analyzed separately and keeps their own copy of the /// information. At op7, we union the information of the region2 and region3 /// and update the information of region1. void dfsOperation(Operation *operation, RegionInfo *blockInfo, OpBuilder *builder); /// Updates the RegionInfo operation based on the operation. 
void transfer(Operation *operation, RegionInfo *blockInfo, OpBuilder *builder); private: Allocation *allocation; }; } // namespace mlir #endif // TRITON_ANALYSIS_MEMBAR_H triton-2.0.0/include/triton/Analysis/Utility.h000066400000000000000000000055001440023377100213720ustar00rootroot00000000000000#ifndef TRITON_ANALYSIS_UTILITY_H #define TRITON_ANALYSIS_UTILITY_H #include "mlir/Analysis/SliceAnalysis.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include #include #include namespace mlir { class ReduceOpHelper { public: explicit ReduceOpHelper(triton::ReduceOp op) : op(op) { srcTy = op.operand().getType().cast(); } ArrayRef getSrcShape() { return srcTy.getShape(); } Attribute getSrcLayout() { return srcTy.getEncoding(); } bool isFastReduction(); unsigned getInterWarpSize(); unsigned getIntraWarpSize(); unsigned getThreadsReductionAxis(); SmallVector getScratchConfigBasic(); SmallVector> getScratchConfigsFast(); unsigned getScratchSizeInBytes(); private: triton::ReduceOp op; RankedTensorType srcTy{}; }; bool isSharedEncoding(Value value); bool maybeSharedAllocationOp(Operation *op); bool maybeAliasOp(Operation *op); bool supportMMA(triton::DotOp op, int version); bool supportMMA(Value value, int version); Type getElementType(Value value); std::string getValueOperandName(Value value, AsmState &state); template inline SmallVector convertType(ArrayRef in) { SmallVector out; for (const T_IN &i : in) out.push_back(T_OUT(i)); return out; } template Int product(llvm::ArrayRef arr) { return std::accumulate(arr.begin(), arr.end(), 1, std::multiplies{}); } template Int ceil(Int m, Int n) { return (m + n - 1) / n; } // output[i] = input[order[i]] template SmallVector reorder(ArrayRef input, ArrayRef order) { size_t rank = order.size(); assert(input.size() == rank); SmallVector result(rank); for (auto it : llvm::enumerate(order)) { result[it.index()] = input[it.value()]; } return result; } template T highestPowOf2Divisor(T n) { if (n == 0) { return (static_cast(1) << (sizeof(T) * 8 - 2)); } return (n & (~(n - 1))); } bool isSingleValue(Value value); bool isMmaToDotShortcut(triton::gpu::MmaEncodingAttr &mmaLayout, triton::gpu::DotOperandEncodingAttr &dotOperandLayout); /// Multi-root DAG topological sort. /// Performs a topological sort of the Operation in the `toSort` SetVector. /// Returns a topologically sorted SetVector. /// It is faster than mlir::topologicalSort because it prunes nodes that have /// been visited before. 
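/// Illustrative usage (added commentary, not part of the original header),
/// with `toSort` being any SetVector<Operation *> gathered by the caller:
///
///   SetVector<Operation *> sorted = multiRootTopologicalSort(toSort);
///   // `sorted` holds the same operations, now in a dependency-respecting
///   // order that is safe to iterate over when rewriting.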
SetVector multiRootTopologicalSort(const SetVector &toSort); // This uses the toplogicalSort above SetVector multiRootGetSlice(Operation *op, TransitiveFilter backwardFilter = nullptr, TransitiveFilter forwardFilter = nullptr); } // namespace mlir #endif // TRITON_ANALYSIS_UTILITY_H triton-2.0.0/include/triton/CMakeLists.txt000066400000000000000000000000661440023377100205350ustar00rootroot00000000000000add_subdirectory(Conversion) add_subdirectory(Dialect)triton-2.0.0/include/triton/Conversion/000077500000000000000000000000001440023377100201205ustar00rootroot00000000000000triton-2.0.0/include/triton/Conversion/CMakeLists.txt000066400000000000000000000002121440023377100226530ustar00rootroot00000000000000 set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls) add_public_tablegen_target(TritonConversionPassIncGen)triton-2.0.0/include/triton/Conversion/MLIRTypes.h000066400000000000000000000024501440023377100220620ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_MLIR_TYPES_H #define TRITON_CONVERSION_MLIR_TYPES_H #include "mlir/Transforms/DialectConversion.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" // This file redefines some common MLIR types for easy usage. namespace mlir { namespace triton { namespace type { // Integer types inline Type i32Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 32); } inline Type i16Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 16); } inline Type i8Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 8); } inline Type u32Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 32, IntegerType::Unsigned); } inline Type u1Ty(MLIRContext *ctx) { return IntegerType::get(ctx, 1, IntegerType::Unsigned); } // Float types inline Type f16Ty(MLIRContext *ctx) { return FloatType::getF16(ctx); } inline Type f32Ty(MLIRContext *ctx) { return FloatType::getF32(ctx); } inline Type f64Ty(MLIRContext *ctx) { return FloatType::getF64(ctx); } inline Type bf16Ty(MLIRContext *ctx) { return FloatType::getBF16(ctx); } inline bool isFloat(Type type) { return type.isF32() || type.isF64() || type.isF16() || type.isF128(); } inline bool isInt(Type type) { return type.isIntOrFloat() && !isFloat(type); } } // namespace type } // namespace triton } // namespace mlir #endif // TRITON_CONVERSION_MLIR_TYPES_H triton-2.0.0/include/triton/Conversion/Passes.h000066400000000000000000000006371440023377100215350ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_PASSES_H #define TRITON_CONVERSION_PASSES_H #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h" #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" namespace mlir { namespace triton { #define GEN_PASS_REGISTRATION #include "triton/Conversion/Passes.h.inc" } // namespace triton } // namespace mlir #endif triton-2.0.0/include/triton/Conversion/Passes.td000066400000000000000000000036041440023377100217120ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_PASSES #define TRITON_CONVERSION_PASSES include "mlir/Pass/PassBase.td" def ConvertTritonToTritonGPU: Pass<"convert-triton-to-tritongpu", "mlir::ModuleOp"> { let summary = "Convert Triton to TritonGPU"; let description = [{ }]; let constructor = "mlir::triton::createConvertTritonToTritonGPUPass()"; let dependentDialects = ["mlir::arith::ArithmeticDialect", "mlir::math::MathDialect", "mlir::StandardOpsDialect", // TODO: Does this pass depend on SCF? 
"mlir::scf::SCFDialect", "mlir::triton::TritonDialect", "mlir::triton::gpu::TritonGPUDialect"]; let options = [ Option<"numWarps", "num-warps", "int32_t", /*default*/"4", "number of warps"> ]; } def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp"> { let summary = "Convert TritonGPU to LLVM"; let description = [{ }]; let constructor = "mlir::triton::createConvertTritonGPUToLLVMPass()"; let dependentDialects = ["mlir::arith::ArithmeticDialect", "mlir::math::MathDialect", "mlir::gpu::GPUDialect", "mlir::scf::SCFDialect", "mlir::LLVM::LLVMDialect", "mlir::tensor::TensorDialect", "mlir::triton::TritonDialect", "mlir::triton::gpu::TritonGPUDialect", "mlir::NVVM::NVVMDialect", "mlir::StandardOpsDialect"]; let options = [ Option<"computeCapability", "compute-capability", "int32_t", /*default*/"80", "device compute capability"> ]; } #endif triton-2.0.0/include/triton/Conversion/TritonGPUToLLVM/000077500000000000000000000000001440023377100227515ustar00rootroot00000000000000triton-2.0.0/include/triton/Conversion/TritonGPUToLLVM/PTXAsmFormat.h000066400000000000000000000254521440023377100254170ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_ASM_FORMAT_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_ASM_FORMAT_H #include "mlir/IR/Value.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include #include namespace mlir { class ConversionPatternRewriter; class Location; namespace triton { using llvm::StringRef; struct PTXInstr; struct PTXInstrCommon; struct PTXInstrExecution; // PTXBuilder helps to manage a PTX asm program consists of one or multiple // instructions. // // A helper for building an ASM program, the objective of PTXBuilder is to give // a thin encapsulation and make the ASM code for MLIR LLVM Dialect more clear. // Currently, several factors are introduced to reduce the need for mixing // string and C++ if-else code. // // Usage: // To build: @$3 asm("@%3 add.s32 %0, %1, %2;" : "=r"(i) : "r"(j), "r"(k), // "b"(p)); // // PTXBuilder builder; // auto& add = builder.create<>(); // add.predicate(pVal).o("lo").o("u32"); // add any suffix // // predicate here binds %0 to pVal, pVal is a mlir::Value // // auto* iOpr = builder.newOperand(iVal, "r"); // %1 bind to iVal // auto* jOpr = builder.newOperand(jVal, "r"); // %2 bind to jVal // auto* kOpr = builder.newOperand(kVal, "r"); // %3 bind to kVal // add(iOpr, jOpr, kOpr).predicate(predVal); // set operands and predicate // // To get the asm code: // builder.dump() // // To get all the mlir::Value used in the PTX code, // // builder.getAllMlirArgs() // get {pVal, iVal, jVal, kVal} // // To get the string containing all the constraints with "," separated, // builder.getConstraints() // get "=r,r,k" // // PTXBuilder can build a PTX asm with multiple instructions, sample code: // // PTXBuilder builder; // auto& mov = builder.create("mov"); // auto& cp = builder.create("cp"); // mov(...); // cp(...); // This will get a PTX code with two instructions. // // Similar to a C function, a declared PTXInstr instance can be launched // multiple times with different operands, e.g. // // auto& mov = builder.create("mov"); // mov(... some operands ...); // mov(... some different operands ...); // // Finally, we will get a PTX code with two mov instructions. // // There are several derived instruction type for typical instructions, for // example, the PtxIOInstr for ld and st instructions. 
struct PTXBuilder { struct Operand { std::string constraint; Value value; int idx{-1}; llvm::SmallVector list; std::function repr; // for list Operand() = default; Operand(const Operation &) = delete; Operand(Value value, StringRef constraint) : constraint(constraint), value(value) {} bool isList() const { return !value && constraint.empty(); } Operand *listAppend(Operand *arg) { list.push_back(arg); return this; } Operand *listGet(size_t nth) const { assert(nth < list.size()); return list[nth]; } std::string dump() const; }; template INSTR *create(Args &&...args) { instrs.emplace_back(std::make_unique(this, args...)); return static_cast(instrs.back().get()); } // Create a list of operands. Operand *newListOperand() { return newOperand(); } Operand *newListOperand(ArrayRef> items) { auto *list = newOperand(); for (auto &item : items) { list->listAppend(newOperand(item.first, item.second)); } return list; } Operand *newListOperand(unsigned count, mlir::Value val, const std::string &constraint) { auto *list = newOperand(); for (unsigned i = 0; i < count; ++i) { list->listAppend(newOperand(val, constraint)); } return list; } Operand *newListOperand(unsigned count, const std::string &constraint) { auto *list = newOperand(); for (unsigned i = 0; i < count; ++i) { list->listAppend(newOperand(constraint)); } return list; } // Create a new operand. It will not add to operand list. // @value: the MLIR value bind to this operand. // @constraint: ASM operand constraint, .e.g. "=r" // @formatter: extra format to represent this operand in ASM code, default is // "%{0}".format(operand.idx). Operand *newOperand(mlir::Value value, StringRef constraint, std::function formatter = nullptr); // Create a new operand which is written to, that is, the constraint starts // with "=", e.g. "=r". Operand *newOperand(StringRef constraint); // Create a constant integer operand. Operand *newConstantOperand(int64_t v); // Create a constant operand with explicit code specified. Operand *newConstantOperand(const std::string &v); Operand *newAddrOperand(mlir::Value addr, StringRef constraint, int off = 0); llvm::SmallVector getAllArgs() const; llvm::SmallVector getAllMLIRArgs() const; std::string getConstraints() const; std::string dump() const; mlir::Value launch(OpBuilder &rewriter, Location loc, Type resTy, bool hasSideEffect = true, bool isAlignStack = false, ArrayRef attrs = {}) const; private: Operand *newOperand() { argArchive.emplace_back(std::make_unique()); return argArchive.back().get(); } // Make the operands in argArchive follow the provided \param order. void reorderArgArchive(ArrayRef order) { assert(order.size() == argArchive.size()); // The order in argArchive is unnecessary when onlyAttachMLIRArgs=false, but // it does necessary when onlyAttachMLIRArgs is true for the $0, $1... are // determined by PTX code snippet passed from external. sort(argArchive.begin(), argArchive.end(), [&](std::unique_ptr &a, std::unique_ptr &b) { auto ida = std::find(order.begin(), order.end(), a.get()); auto idb = std::find(order.begin(), order.end(), b.get()); assert(ida != order.end()); assert(idb != order.end()); return ida < idb; }); } friend struct PTXInstr; friend struct PTXInstrCommon; protected: llvm::SmallVector, 6> argArchive; llvm::SmallVector, 2> instrs; llvm::SmallVector, 4> executions; int oprCounter{}; }; // PTX instruction common interface. // Put the generic logic for all the instructions here. 
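// Illustrative end-to-end sketch (added commentary, not part of the original
// header) tying PTXBuilder above to the instruction/execution classes declared
// below: emitting a predicated 32-bit global store. `rewriter`, `loc`,
// `voidTy`, `ptrVal`, `valVal` and `predVal` are placeholder names for values
// the caller already has.
//
//   PTXBuilder builder;
//   auto &st = builder.create<>("st")->global().b(32);  // "st.global.b32"
//   auto *addr = builder.newAddrOperand(ptrVal, "l");   // becomes "[ $0 + 0 ]"
//   auto *val = builder.newOperand(valVal, "r");        // becomes "$1"
//   st(addr, val).predicate(predVal);                   // "@$2 st.global.b32 [ $0 + 0 ], $1;"
//   builder.launch(rewriter, loc, voidTy);              // materialize as llvm inline asm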
struct PTXInstrCommon { explicit PTXInstrCommon(PTXBuilder *builder) : builder(builder) {} using Operand = PTXBuilder::Operand; // clang-format off PTXInstrExecution& operator()() { return call({}); } PTXInstrExecution& operator()(Operand* a) { return call({a}); } PTXInstrExecution& operator()(Operand* a, Operand* b) { return call({a, b}); } PTXInstrExecution& operator()(Operand* a, Operand* b, Operand* c) { return call({a, b, c}); } PTXInstrExecution& operator()(Operand* a, Operand* b, Operand* c, Operand* d) { return call({a, b, c, d}); } PTXInstrExecution& operator()(Operand* a, Operand* b, Operand* c, Operand* d, Operand * e) { return call({a, b, c, d, e}); } PTXInstrExecution& operator()(Operand* a, Operand* b, Operand* c, Operand* d, Operand * e, Operand* f) { return call({a, b, c, d, e, f}); } PTXInstrExecution& operator()(Operand* a, Operand* b, Operand* c, Operand* d, Operand * e, Operand* f, Operand* g) { return call({a, b, c, d, e, f, g}); } // clang-format on // Set operands of this instruction. PTXInstrExecution &operator()(llvm::ArrayRef oprs, bool onlyAttachMLIRArgs = false); protected: // "Call" the instruction with operands. // \param oprs The operands of this instruction. // \param onlyAttachMLIRArgs Indicate that it simply attach the MLIR Arguments // to the inline Asm without generating the operand ids(such as $0, $1) in PTX // code. PTXInstrExecution &call(llvm::ArrayRef oprs, bool onlyAttachMLIRArgs = false); PTXBuilder *builder{}; llvm::SmallVector instrParts; friend struct PTXInstrExecution; }; template struct PTXInstrBase : public PTXInstrCommon { using Operand = PTXBuilder::Operand; explicit PTXInstrBase(PTXBuilder *builder, const std::string &name) : PTXInstrCommon(builder) { o(name); } // Append a suffix to the instruction. // e.g. PTXInstr("add").o("s32") get a add.s32. // A predicate is used to tell whether to apply the suffix, so that no if-else // code needed. e.g. `PTXInstr("add").o("s32", isS32).o("u32", !isS32);` will // get a `add.s32` if isS32 is true. ConcreteT &o(const std::string &suffix, bool predicate = true) { if (predicate) instrParts.push_back(suffix); return *static_cast(this); } }; struct PTXInstr : public PTXInstrBase { using PTXInstrBase::PTXInstrBase; // Append a ".global" to the instruction. PTXInstr &global(); // Append a ".shared" to the instruction. PTXInstr &shared(); // Append a ".v[0-9]+" to the instruction PTXInstr &v(int vecWidth, bool predicate = true); // Append a".b[0-9]+" to the instruction PTXInstr &b(int width); }; // Record the operands and context for "launching" a PtxInstr. struct PTXInstrExecution { using Operand = PTXBuilder::Operand; llvm::SmallVector argsInOrder; PTXInstrExecution() = default; explicit PTXInstrExecution(PTXInstrCommon *instr, llvm::ArrayRef oprs, bool onlyAttachMLIRArgs) : argsInOrder(oprs.begin(), oprs.end()), instr(instr), onlyAttachMLIRArgs(onlyAttachMLIRArgs) {} // Prefix a predicate to the instruction. PTXInstrExecution &predicate(mlir::Value value, StringRef constraint = "b") { pred = instr->builder->newOperand(value, constraint); return *this; } // Prefix a !predicate to the instruction. 
PTXInstrExecution &predicateNot(mlir::Value value, StringRef constraint) { pred = instr->builder->newOperand(value, constraint); pred->repr = [](int idx) { return "@!$" + std::to_string(idx); }; return *this; } std::string dump() const; SmallVector getArgList() const; PTXInstrCommon *instr{}; Operand *pred{}; bool onlyAttachMLIRArgs{}; }; /// ====== Some instruction wrappers ====== // We add the wrappers to make the usage more intuitive by avoiding mixing the // PTX code with some trivial C++ code. struct PTXCpAsyncLoadInstr : PTXInstrBase { explicit PTXCpAsyncLoadInstr(PTXBuilder *builder, triton::CacheModifier modifier) : PTXInstrBase(builder, "cp.async") { o(triton::stringifyCacheModifier(modifier).str()); o("shared"); o("global"); } }; } // namespace triton } // namespace mlir #endif triton-2.0.0/include/triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h000066400000000000000000000007341440023377100266060ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_PASS_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_PASS_H #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Transforms/DialectConversion.h" #include namespace mlir { class ModuleOp; template class OperationPass; namespace triton { std::unique_ptr> createConvertTritonGPUToLLVMPass(int computeCapability = 80); } // namespace triton } // namespace mlir #endif triton-2.0.0/include/triton/Conversion/TritonToTritonGPU/000077500000000000000000000000001440023377100234565ustar00rootroot00000000000000triton-2.0.0/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h000066400000000000000000000012071440023377100300140ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H #define TRITON_CONVERSION_TRITONTOTRITONGPU_TRITONTOTRITONGPUPASS_H #include namespace mlir { class ModuleOp; template class OperationPass; namespace triton { constexpr static char AttrNumWarpsName[] = "triton_gpu.num-warps"; // Create the pass with numWarps passed from cl::opt. std::unique_ptr> createConvertTritonToTritonGPUPass(); // Create the pass with numWarps set explicitly. 
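// For example (illustrative; `pm` is assumed to be an existing mlir::PassManager):
//   pm.addPass(createConvertTritonToTritonGPUPass(/*numWarps=*/4));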
std::unique_ptr> createConvertTritonToTritonGPUPass(int numWarps); } // namespace triton } // namespace mlir #endif triton-2.0.0/include/triton/Dialect/000077500000000000000000000000001440023377100173405ustar00rootroot00000000000000triton-2.0.0/include/triton/Dialect/CMakeLists.txt000066400000000000000000000000651440023377100221010ustar00rootroot00000000000000add_subdirectory(Triton) add_subdirectory(TritonGPU) triton-2.0.0/include/triton/Dialect/Triton/000077500000000000000000000000001440023377100206175ustar00rootroot00000000000000triton-2.0.0/include/triton/Dialect/Triton/CMakeLists.txt000066400000000000000000000000621440023377100233550ustar00rootroot00000000000000add_subdirectory(IR) add_subdirectory(Transforms) triton-2.0.0/include/triton/Dialect/Triton/IR/000077500000000000000000000000001440023377100211315ustar00rootroot00000000000000triton-2.0.0/include/triton/Dialect/Triton/IR/CMakeLists.txt000066400000000000000000000013131440023377100236670ustar00rootroot00000000000000set(LLVM_TARGET_DEFINITIONS TritonOps.td) mlir_tablegen(Ops.h.inc -gen-op-decls) mlir_tablegen(Ops.cpp.inc -gen-op-defs) mlir_tablegen(OpsEnums.h.inc -gen-enum-decls) mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs) set(LLVM_TARGET_DEFINITIONS TritonDialect.td) mlir_tablegen(Dialect.h.inc -gen-dialect-decls) mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) set(LLVM_TARGET_DEFINITIONS TritonTypes.td) mlir_tablegen(Types.h.inc -gen-typedef-decls) mlir_tablegen(Types.cpp.inc -gen-typedef-defs) set(LLVM_TARGET_DEFINITIONS TritonInterfaces.td) mlir_tablegen(AttrInterfaces.h.inc -gen-attr-interface-decls) mlir_tablegen(AttrInterfaces.cpp.inc -gen-attr-interface-defs) add_public_tablegen_target(TritonTableGen) triton-2.0.0/include/triton/Dialect/Triton/IR/Dialect.h000066400000000000000000000032551440023377100226540ustar00rootroot00000000000000#ifndef TRITON_DIALECT_TRITON_IR_DIALECT_H_ #define TRITON_DIALECT_TRITON_IR_DIALECT_H_ #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dialect.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "triton/Dialect/Triton/IR/Dialect.h.inc" #include "triton/Dialect/Triton/IR/OpsEnums.h.inc" #include "triton/Dialect/Triton/IR/Traits.h" #include "triton/Dialect/Triton/IR/Types.h" #define GET_OP_CLASSES #include "triton/Dialect/Triton/IR/Ops.h.inc" namespace mlir { namespace triton { class DialectInferLayoutInterface : public DialectInterface::Base { public: DialectInferLayoutInterface(Dialect *dialect) : Base(dialect) {} virtual LogicalResult inferTransOpEncoding(Attribute operandEncoding, Attribute &resultEncoding) const = 0; virtual LogicalResult inferReduceOpEncoding(Attribute operandEncoding, unsigned axis, Attribute &resultEncoding) const = 0; virtual LogicalResult inferExpandDimsOpEncoding(Attribute operandEncoding, unsigned axis, Attribute &resultEncoding, Optional location) const = 0; // Note: this function only verify operand encoding but doesn't infer result // encoding virtual LogicalResult inferDotOpEncoding(Attribute operandEncoding, unsigned opIdx, Attribute retEncoding, Optional location) const = 0; }; } // namespace triton } // namespace mlir #endif // TRITON_IR_DIALECT_H_ triton-2.0.0/include/triton/Dialect/Triton/IR/Interfaces.h000066400000000000000000000003271440023377100233670ustar00rootroot00000000000000#ifndef TRITON_IR_INTERFACES_H_ #define TRITON_IR_INTERFACES_H_ #include "mlir/IR/OpDefinition.h" 
#define GET_TYPEDEF_CLASSES #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc" #endif // TRITON_IR_TYPES_H_ triton-2.0.0/include/triton/Dialect/Triton/IR/Traits.h000066400000000000000000000034301440023377100225500ustar00rootroot00000000000000#ifndef TRITON_IR_TRAITS_H_ #define TRITON_IR_TRAITS_H_ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/Support/LogicalResult.h" #include namespace mlir { namespace OpTrait { // These functions are out-of-line implementations of the methods in the // corresponding trait classes. This avoids them being template // instantiated/duplicated. namespace impl { LogicalResult verifySameOperandsAndResultEncoding(Operation *op); LogicalResult verifySameOperandsEncoding(Operation *op); // The rationale for this trait is to prevent users from creating programs // that would have catastrophic register pressure and cause the compiler to // hang. // Since H100 has 256KB registers, we should allow users to create tensors // of size up to 256K elements. It will spill for datatypes wider than 1B, // but we probably should limit number of elements (rather than bytes) to // keep specs simple int constexpr maxTensorNumElements = 1048576; LogicalResult verifyTensorSize(Operation *op); } // namespace impl template class TensorSizeTrait : public TraitBase { public: static LogicalResult verifyTrait(Operation *op) { return impl::verifyTensorSize(op); } }; template class SameOperandsAndResultEncoding : public TraitBase { public: static LogicalResult verifyTrait(Operation *op) { return impl::verifySameOperandsAndResultEncoding(op); } }; template class SameOperandsEncoding : public TraitBase { public: static LogicalResult verifyTrait(Operation *op) { return impl::verifySameOperandsEncoding(op); } }; } // namespace OpTrait } // namespace mlir #endif triton-2.0.0/include/triton/Dialect/Triton/IR/TritonAttrDefs.td000066400000000000000000000040051440023377100243750ustar00rootroot00000000000000#ifndef TRITON_ATTR_DEFS #define TRITON_ATTR_DEFS include "mlir/IR/EnumAttr.td" // Attrs for LoadOp def TT_CacheModifierAttr : I32EnumAttr< "CacheModifier", "", [ I32EnumAttrCase<"NONE", 1, "none">, I32EnumAttrCase<"CA", 2, "ca">, I32EnumAttrCase<"CG", 3, "cg">, ]> { let cppNamespace = "::mlir::triton"; } def TT_EvictionPolicyAttr : I32EnumAttr< "EvictionPolicy", "", [ I32EnumAttrCase<"NORMAL", 1, "evict_normal">, I32EnumAttrCase<"EVICT_FIRST", 2, "evict_first">, I32EnumAttrCase<"EVICT_LAST", 3, "evict_last"> ]> { let cppNamespace = "::mlir::triton"; } // reduction def TT_RedOpAttr : I32EnumAttr< /*name*/"RedOp", /*summary*/"", /*case*/ [ I32EnumAttrCase, I32EnumAttrCase<"FADD", 2, "fadd">, I32EnumAttrCase<"MIN", 3, "min">, I32EnumAttrCase<"MAX", 4, "max">, I32EnumAttrCase<"UMIN", 5, "umin">, I32EnumAttrCase<"UMAX", 6, "umax">, I32EnumAttrCase<"ARGMIN", 7, "argmin">, I32EnumAttrCase<"ARGMAX", 8, "argmax">, I32EnumAttrCase<"ARGUMIN", 9, "argumin">, I32EnumAttrCase<"ARGUMAX", 10, "argumax">, I32EnumAttrCase<"FMIN", 11, "fmin">, I32EnumAttrCase<"FMAX", 12, "fmax">, I32EnumAttrCase<"ARGFMIN", 13, "argfmin">, I32EnumAttrCase<"ARGFMAX", 14, "argfmax">, I32EnumAttrCase<"XOR", 15, "xor"> ]> { let cppNamespace = "::mlir::triton"; } // atomic def TT_AtomicRMWAttr : I32EnumAttr< "RMWOp", "", [ I32EnumAttrCase<"AND", 1, "and">, I32EnumAttrCase<"OR", 2, "or">, I32EnumAttrCase<"XOR", 3, "xor">, I32EnumAttrCase<"ADD", 4, "add">, I32EnumAttrCase<"FADD", 5, "fadd">, I32EnumAttrCase<"MAX", 6, "max">, I32EnumAttrCase<"MIN", 7, "min">, I32EnumAttrCase<"UMAX", 8, "umax">, 
I32EnumAttrCase<"UMIN", 9, "umin">, I32EnumAttrCase<"XCHG", 10, "exch"> ]> { let cppNamespace = "::mlir::triton"; } #endif triton-2.0.0/include/triton/Dialect/Triton/IR/TritonDialect.td000066400000000000000000000016011440023377100242250ustar00rootroot00000000000000#ifndef TRITON_DIALECT #define TRITON_DIALECT include "mlir/IR/OpBase.td" def Triton_Dialect : Dialect { let name = "tt"; let cppNamespace = "::mlir::triton"; let summary = "The Triton IR in MLIR"; let description = [{ Triton Dialect. Dependent Dialects: * Arithmetic: * addf, addi, andi, cmpf, cmpi, divf, fptosi, ... * Math: * exp, sin, cos, log, ... * StructuredControlFlow: * ForOp, IfOp, WhileOp, YieldOp, ConditionOp }]; let dependentDialects = [ "arith::ArithmeticDialect", "math::MathDialect", "StandardOpsDialect", "scf::SCFDialect", // Since LLVM 15 // "cf::ControlFlowDialect", // "func::FuncDialect" ]; let extraClassDeclaration = [{ void registerTypes(); }]; let hasConstantMaterializer = 1; } include "triton/Dialect/Triton/IR/TritonTypes.td" #endif // TRITON_DIALECT triton-2.0.0/include/triton/Dialect/Triton/IR/TritonInterfaces.td000066400000000000000000000004751440023377100247530ustar00rootroot00000000000000#ifndef TRITON_INTERFACES #define TRITON_INTERFACES include "mlir/IR/OpBase.td" def TensorSizeTrait : NativeOpTrait<"TensorSizeTrait">; def SameOperandsAndResultEncoding : NativeOpTrait<"SameOperandsAndResultEncoding">; def SameOperandsEncoding : NativeOpTrait<"SameOperandsEncoding">; #endif // TRITON_INTERFACEStriton-2.0.0/include/triton/Dialect/Triton/IR/TritonOps.td000066400000000000000000000346751440023377100234420ustar00rootroot00000000000000#ifndef TRITON_OPS #define TRITON_OPS include "triton/Dialect/Triton/IR/TritonDialect.td" include "triton/Dialect/Triton/IR/TritonTypes.td" include "triton/Dialect/Triton/IR/TritonAttrDefs.td" include "triton/Dialect/Triton/IR/TritonInterfaces.td" include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" // NoSideEffect include "mlir/Interfaces/ControlFlowInterfaces.td" // BranchOpInterface include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType include "mlir/Interfaces/SideEffectInterfaces.td" // NoSideEffect include "mlir/Interfaces/CastInterfaces.td" // CastOpInterface // // Op Base // class TT_Op traits = []> : Op { } // // CastOps // // Use cast ops in arith: // bitcast // fptoui, fptosi, uitofp, sitofp, // extf, tructf, // extui, extsi, tructi def TT_IntToPtrOp : TT_Op<"int_to_ptr", [SameOperandsAndResultShape, SameOperandsAndResultEncoding, NoSideEffect, /*DeclareOpInterfaceMethods*/]> { let summary = "Cast int64 to pointer"; let arguments = (ins TT_I64Like:$from); let results = (outs TT_PtrLike:$result); let assemblyFormat = "$from attr-dict `:` type($from) `->` type($result)"; } def TT_PtrToIntOp : TT_Op<"ptr_to_int", [SameOperandsAndResultShape, SameOperandsAndResultEncoding, NoSideEffect, /*DeclareOpInterfaceMethods*/]> { let summary = "Cast pointer to int64"; let arguments = (ins TT_PtrLike:$from); let results = (outs TT_I64Like:$result); let assemblyFormat = "$from attr-dict `:` type($from) `->` type($result)"; } // arith.bitcast doesn't support pointers def TT_BitcastOp : TT_Op<"bitcast", [SameOperandsAndResultShape, SameOperandsAndResultEncoding, NoSideEffect, /*DeclareOpInterfaceMethods*/]> { let summary = "Cast between types of the same bitwidth"; let arguments = (ins TT_Type:$from); let results = (outs TT_Type:$result); let assemblyFormat = "$from attr-dict `:` type($from) `->` type($result)"; // TODO: Add verifier } 
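// For example (an illustrative sketch; `b` is an OpBuilder, `loc` a Location, and the
// value/type names are assumptions), a pointer tensor can be round-tripped through i64
// with the two casts above:
//   Value asInt = b.create<triton::PtrToIntOp>(loc, i64TensorTy, ptrTensor);
//   Value asPtr = b.create<triton::IntToPtrOp>(loc, ptrTensorTy, asInt);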
def TT_FpToFpOp : TT_Op<"fp_to_fp", [SameOperandsAndResultShape, SameOperandsAndResultEncoding, NoSideEffect, DeclareOpInterfaceMethods]> { let summary = "Floating point casting for custom types"; let description = [{ Floating point casting for custom types (F8). F8 <-> FP16, BF16, FP32, FP64 }]; let arguments = (ins TT_FloatLike:$from); let results = (outs TT_FloatLike:$result); let assemblyFormat = "$from attr-dict `:` type($from) `->` type($result)"; // TODO: We need a verifier here. } // // Pointer Arith Ops // def TT_AddPtrOp : TT_Op<"addptr", [NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultEncoding, TypesMatchWith<"result type matches ptr type", "result", "ptr", "$_self">]> { let arguments = (ins TT_PtrLike:$ptr, TT_IntLike:$offset); let results = (outs TT_PtrLike:$result); let assemblyFormat = "$ptr `,` $offset attr-dict `:` type($result) `,` type($offset)"; } // // Load/Store Ops // def TT_LoadOp : TT_Op<"load", [SameOperandsAndResultShape, SameOperandsAndResultEncoding, AttrSizedOperandSegments, MemoryEffects<[MemRead]>, TypesMatchWith<"infer ptr type from result type", "result", "ptr", "getPointerTypeSameShape($_self)">, TypesMatchWith<"infer mask type from result type or none", "result", "mask", "getI1SameShape($_self)", "($_op.getOperands().size() <= 1) || std::equal_to<>()">, TypesMatchWith<"infer other type from result type or none", "result", "other", "$_self", "($_op.getOperands().size() <= 2) || std::equal_to<>()">]> { let summary = "load"; let arguments = (ins TT_PtrLike:$ptr, Optional:$mask, Optional:$other, TT_CacheModifierAttr:$cache, TT_EvictionPolicyAttr:$evict, BoolAttr:$isVolatile); let results = (outs TT_Type:$result); let builders = [ OpBuilder<(ins "Value":$ptr, "triton::CacheModifier":$cache, "triton::EvictionPolicy":$evict, "bool":$isVolatile)>, OpBuilder<(ins "Value":$ptr, "Value":$mask, "triton::CacheModifier":$cache, "triton::EvictionPolicy":$evict, "bool":$isVolatile)>, OpBuilder<(ins "Value":$ptr, "Value":$mask, "Value":$other, "triton::CacheModifier":$cache, "triton::EvictionPolicy":$evict, "bool":$isVolatile)>, ]; // let assemblyFormat = "operands attr-dict `:` type($result)"; let parser = [{ return mlir::triton::parseLoadOp(parser, result); }]; let printer = [{ return mlir::triton::printLoadOp(p, *this); }]; let hasCanonicalizer = 1; } def TT_StoreOp : TT_Op<"store", [SameOperandsShape, SameOperandsEncoding, MemoryEffects<[MemWrite]>, TypesMatchWith<"infer ptr type from value type", "value", "ptr", "getPointerTypeSameShape($_self)">, TypesMatchWith<"infer mask type from value type", "value", "mask", "getI1SameShape($_self)", "($_op.getOperands().size() <= 2) || std::equal_to<>()">]> { let summary = "store"; let arguments = (ins TT_PtrLike:$ptr, TT_Type:$value, Optional:$mask); let builders = [ OpBuilder<(ins "Value":$ptr, "Value":$value)>, ]; // let assemblyFormat = "operands attr-dict `:` type($value)"; let parser = [{ return mlir::triton::parseStoreOp(parser, result); }]; let printer = [{ return mlir::triton::printStoreOp(p, *this); }]; let hasCanonicalizer = 1; } // // Atomic Op // def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [SameOperandsAndResultShape, SameOperandsAndResultEncoding, MemoryEffects<[MemRead]>, MemoryEffects<[MemWrite]>, TypesMatchWith<"infer ptr type from value type", "val", "ptr", "getPointerTypeSameShape($_self)">, TypesMatchWith<"infer mask type from value type", "val", "mask", "getI1SameShape($_self)", "($_op.getOperands().size() <= 2) || std::equal_to<>()">]> { let summary = "atomic rmw"; let description = [{ load 
data at $ptr, do $rmw_op with $val, and store result to $ptr. return old value at $ptr }]; let arguments = (ins TT_AtomicRMWAttr:$atomic_rmw_op, TT_PtrLike:$ptr, TT_Type:$val, Optional:$mask); let results = (outs TT_Type:$result); } def TT_AtomicCASOp : TT_Op<"atomic_cas", [MemoryEffects<[MemRead]>, MemoryEffects<[MemWrite]>, SameOperandsAndResultShape, SameOperandsAndResultEncoding]> { let summary = "atomic cas"; let description = [{ compare $cmp with data $old at location $ptr, if $old == $cmp, store $val to $ptr, else store $old to $ptr, return $old }]; let arguments = (ins TT_Ptr:$ptr, TT_Type:$cmp, TT_Type:$val); let results = (outs TT_Type:$result); } // // Shape Manipulation Ops // def TT_SplatOp : TT_Op<"splat", [NoSideEffect, SameOperandsAndResultElementType, SameOperandsAndResultEncoding]> { let summary = "splat"; let arguments = (ins TT_Type:$src); let results = (outs TT_Tensor:$result); let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)"; let hasFolder = 1; } def TT_ExpandDimsOp : TT_Op<"expand_dims", [NoSideEffect, DeclareOpInterfaceMethods, SameOperandsAndResultElementType]> { let summary = "expand_dims"; let arguments = (ins TT_Tensor:$src, I32Attr:$axis); let results = (outs TT_Tensor:$result); let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)"; } def TT_ViewOp : TT_Op<"view", [NoSideEffect, SameOperandsAndResultElementType, SameOperandsAndResultEncoding]> { let summary = "view"; let arguments = (ins TT_Tensor:$src); let results = (outs TT_Tensor:$result); let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)"; } def TT_BroadcastOp : TT_Op<"broadcast", [NoSideEffect, SameOperandsAndResultElementType, SameOperandsAndResultEncoding]> { let summary = "broadcast. No left-padding as of now."; let arguments = (ins TT_Type:$src); let results = (outs TT_Type:$result); let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)"; let hasFolder = 1; } def TT_CatOp : TT_Op<"cat", [NoSideEffect, SameOperandsAndResultElementType, SameOperandsAndResultEncoding]> { let summary = "concatenate 2 tensors"; let arguments = (ins TT_Tensor:$lhs, TT_Tensor:$rhs); let results = (outs TT_Tensor:$result); let assemblyFormat = "$lhs `,` $rhs attr-dict `:` functional-type(operands, results)"; } def TT_TransOp : TT_Op<"trans", [NoSideEffect, DeclareOpInterfaceMethods, SameOperandsAndResultElementType]> { let summary = "transpose a tensor"; let arguments = (ins TT_Tensor:$src); let results = (outs TT_Tensor:$result); let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)"; } // // SPMD Ops // def TT_GetProgramIdOp : TT_Op<"get_program_id", [NoSideEffect]> { let arguments = (ins I32Attr:$axis); let results = (outs I32:$result); let assemblyFormat = "attr-dict `:` type($result)"; } def TT_GetNumProgramsOp : TT_Op<"get_num_programs", [NoSideEffect]> { let arguments = (ins I32Attr:$axis); let results = (outs I32:$result); let assemblyFormat = "attr-dict `:` type($result)"; } // // Dot Op // def TT_DotOp : TT_Op<"dot", [NoSideEffect, DeclareOpInterfaceMethods, TypesMatchWith<"result's type matches accumulator's type", "d", "c", "$_self">]> { let summary = "dot"; let description = [{ $d = matrix_multiply($a, $b) + $c }]; let arguments = (ins TT_FpIntTensor:$a, TT_FpIntTensor:$b, TT_FpIntTensor:$c, BoolAttr:$allowTF32); let results = (outs TT_FpIntTensor:$d); let assemblyFormat = "$a`,` $b`,` $c attr-dict `:` type($a) `*` type($b) `->` type($d)"; } // // Reduce Op // def TT_ReduceOp : TT_Op<"reduce", 
[NoSideEffect, DeclareOpInterfaceMethods]> { let summary = "reduce"; let arguments = (ins TT_RedOpAttr:$redOp, TT_Tensor:$operand, I32Attr:$axis); let results = (outs TT_Type:$result); let builders = [ OpBuilder<(ins "triton::RedOp":$redOp, "Value":$operand, "int":$axis)>, ]; let assemblyFormat = "$operand attr-dict `:` type($operand) `->` type($result)"; let extraClassDeclaration = [{ // This member function is marked static because we need to call it before the ReduceOp // is constructed; see the implementation of create_reduce in triton.cc. static bool withIndex(mlir::triton::RedOp redOp); }]; } // // External elementwise op // def TT_ExtElemwiseOp : TT_Op<"ext_elemwise", [NoSideEffect, Elementwise, SameOperandsAndResultShape, SameOperandsAndResultEncoding, SameVariadicOperandSize]> { let summary = "ext_elemwise"; let description = [{ Calls an external function $symbol implemented in $libpath/$libname with $args; returns $libpath/$libname:$symbol($args...) }]; let arguments = (ins Variadic:$args, StrAttr:$libname, StrAttr:$libpath, StrAttr:$symbol); let results = (outs TT_Type:$result); let assemblyFormat = "operands attr-dict `:` type(operands) `->` type($result)"; } // // Make Range Op // // TODO: should have ConstantLike as Trait def TT_MakeRangeOp : TT_Op<"make_range", [NoSideEffect]> { let summary = "make range"; let description = [{ Returns a 1D int32 tensor. Values span from $start to $end (exclusive), with step = 1 }]; let arguments = (ins I32Attr:$start, I32Attr:$end); let results = (outs TT_IntTensor:$result); let assemblyFormat = "attr-dict `:` type($result)"; } // // Make PrintfOp // def TT_PrintfOp : TT_Op<"printf", [MemoryEffects<[MemWrite]>]>, Arguments<(ins StrAttr:$prefix, Variadic>:$args)> { let summary = "Device-side printf, as in CUDA for debugging"; let description = [{ `tt.printf` takes a literal string prefix and an arbitrary number of scalar or tensor arguments that should be printed. The format specifiers are generated automatically from the arguments. }]; let assemblyFormat = [{ $prefix attr-dict ($args^ `:` type($args))?
}]; } #endif // Triton_OPS triton-2.0.0/include/triton/Dialect/Triton/IR/TritonTypes.td000066400000000000000000000034001440023377100237630ustar00rootroot00000000000000#ifndef TRITON_TYPES #define TRITON_TYPES include "triton/Dialect/Triton/IR/TritonDialect.td" // // Types // class TritonTypeDef : TypeDef { // Used by printer/parser let mnemonic = _mnemonic; } // Floating-point Type def F8 : TritonTypeDef<"Float8", "f8">; def TT_Float : AnyTypeOf<[F8, F16, BF16, F32, F64], "floating-point">; def TT_FloatTensor : TensorOf<[TT_Float]>; def TT_FloatLike : AnyTypeOf<[TT_Float, TT_FloatTensor]>; // Boolean Type // TT_Bool -> I1 def TT_BoolTensor : TensorOf<[I1]>; def TT_BoolLike : AnyTypeOf<[I1, TT_BoolTensor]>; // Integer Type def TT_Int : AnyTypeOf<[I1, I8, I16, I32, I64], "integer">; def TT_IntTensor : TensorOf<[TT_Int]>; def TT_IntLike : AnyTypeOf<[TT_Int, TT_IntTensor]>; // I32 Type // TT_I32 -> I32 // TT_I32Tensor -> I32Tensor def TT_I32Like: AnyTypeOf<[I32, I32Tensor]>; // I64 Type // TT_I64 -> I64 // TT_I64Tensor -> I64Tensor def TT_I64Like: AnyTypeOf<[I64, I64Tensor]>; // Pointer Type def TT_Ptr : TritonTypeDef<"Pointer", "ptr"> { let summary = "pointer type"; let description = [{ Triton PointerType }]; let parameters = (ins "Type":$pointeeType, "int":$addressSpace); let builders = [ TypeBuilderWithInferredContext<(ins "Type":$pointeeType, "int":$addressSpace ), [{ return $_get(pointeeType.getContext(), pointeeType, addressSpace); }]> ]; let skipDefaultBuilders = 1; } def TT_PtrTensor : TensorOf<[TT_Ptr]>; def TT_PtrLike : AnyTypeOf<[TT_Ptr, TT_PtrTensor]>; def TT_FpIntTensor : AnyTypeOf<[TT_FloatTensor, TT_IntTensor]>; def TT_Tensor : AnyTypeOf<[TT_FpIntTensor, TT_PtrTensor]>; def TT_Type : AnyTypeOf<[TT_FloatLike, TT_IntLike, TT_PtrLike]>; #endif triton-2.0.0/include/triton/Dialect/Triton/IR/Types.h000066400000000000000000000005161440023377100224100ustar00rootroot00000000000000#ifndef TRITON_IR_TYPES_H_ #define TRITON_IR_TYPES_H_ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/TypeSupport.h" #include "mlir/IR/Types.h" #define GET_TYPEDEF_CLASSES #include "triton/Dialect/Triton/IR/Types.h.inc" namespace mlir { unsigned getPointeeBitWidth(RankedTensorType tensorTy); } #endif // TRITON_IR_TYPES_H_ triton-2.0.0/include/triton/Dialect/Triton/Transforms/000077500000000000000000000000001440023377100227555ustar00rootroot00000000000000triton-2.0.0/include/triton/Dialect/Triton/Transforms/CMakeLists.txt000066400000000000000000000002231440023377100255120ustar00rootroot00000000000000set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls -name Triton) add_public_tablegen_target(TritonTransformsIncGen) triton-2.0.0/include/triton/Dialect/Triton/Transforms/Passes.h000066400000000000000000000005431440023377100243660ustar00rootroot00000000000000#ifndef TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ #define TRITON_DIALECT_TRITON_TRANSFORMS_PASSES_H_ #include "mlir/Pass/Pass.h" namespace mlir { namespace triton { std::unique_ptr createCombineOpsPass(); } // namespace triton #define GEN_PASS_REGISTRATION #include "triton/Dialect/Triton/Transforms/Passes.h.inc" } // namespace mlir #endif triton-2.0.0/include/triton/Dialect/Triton/Transforms/Passes.td000066400000000000000000000011721440023377100245450ustar00rootroot00000000000000#ifndef TRITON_PASSES #define TRITON_PASSES include "mlir/Pass/PassBase.td" def TritonCombineOps : Pass { let summary = "combine ops"; let description = [{ dot(a, b, 0) + c => dot(a, b, c) addptr(addptr(ptr, idx0), idx1) => addptr(ptr, AddI(idx0, idx1)) 
select(cond, load(ptrs, broadcast(cond), ???), other) => load(ptrs, broadcast(cond), other) }]; let constructor = "mlir::triton::createCombineOpsPass()"; let dependentDialects = ["mlir::arith::ArithmeticDialect", /*SelectOp*/"mlir::StandardOpsDialect"]; } #endif triton-2.0.0/include/triton/Dialect/TritonGPU/000077500000000000000000000000001440023377100211735ustar00rootroot00000000000000triton-2.0.0/include/triton/Dialect/TritonGPU/CMakeLists.txt000066400000000000000000000000621440023377100237310ustar00rootroot00000000000000add_subdirectory(IR) add_subdirectory(Transforms) triton-2.0.0/include/triton/Dialect/TritonGPU/IR/000077500000000000000000000000001440023377100215055ustar00rootroot00000000000000triton-2.0.0/include/triton/Dialect/TritonGPU/IR/CMakeLists.txt000066400000000000000000000010201440023377100242360ustar00rootroot00000000000000set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td) mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu) mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu) mlir_tablegen(Ops.h.inc -gen-op-decls) mlir_tablegen(Ops.cpp.inc -gen-op-defs) add_public_tablegen_target(TritonGPUTableGen) set(LLVM_TARGET_DEFINITIONS TritonGPUAttrDefs.td) mlir_tablegen(TritonGPUAttrDefs.h.inc -gen-attrdef-decls) mlir_tablegen(TritonGPUAttrDefs.cpp.inc -gen-attrdef-defs) add_public_tablegen_target(TritonGPUAttrDefsIncGen) triton-2.0.0/include/triton/Dialect/TritonGPU/IR/Dialect.h000066400000000000000000000026151440023377100232270ustar00rootroot00000000000000#ifndef TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_ #define TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_ #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Dialect.h" // TritonGPU depends on Triton #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h.inc" #include "triton/Dialect/TritonGPU/IR/Traits.h" #define GET_ATTRDEF_CLASSES #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc" #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc" #define GET_OP_CLASSES #include "triton/Dialect/TritonGPU/IR/Ops.h.inc" namespace mlir { namespace triton { namespace gpu { unsigned getElemsPerThread(Type type); SmallVector getThreadsPerWarp(const Attribute &layout); SmallVector getWarpsPerCTA(const Attribute &layout); SmallVector getSizePerThread(const Attribute &layout); SmallVector getContigPerThread(const Attribute &layout); SmallVector getThreadsPerCTA(const Attribute &layout); SmallVector getShapePerCTA(const Attribute &layout, ArrayRef tensorShape = ArrayRef()); SmallVector getOrder(const Attribute &layout); bool isaDistributedLayout(const Attribute &layout); } // namespace gpu } // namespace triton } // namespace mlir #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_ triton-2.0.0/include/triton/Dialect/TritonGPU/IR/Traits.h000066400000000000000000000014131440023377100231230ustar00rootroot00000000000000#ifndef TRITON_GPU_IR_TRAITS_H_ #define TRITON_GPU_IR_TRAITS_H_ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/Support/LogicalResult.h" namespace mlir { namespace OpTrait { // These functions are out-of-line implementations of the methods in the // corresponding trait classes. This avoids them being template // instantiated/duplicated. 
namespace impl { LogicalResult verifyResultsAreSharedEncoding(Operation *op); } // namespace impl template class ResultsAreSharedEncoding : public TraitBase { public: static LogicalResult verifyTrait(Operation *op) { return impl::verifyResultsAreSharedEncoding(op); } }; } // namespace OpTrait } // namespace mlir #endif triton-2.0.0/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td000066400000000000000000000466361440023377100253450ustar00rootroot00000000000000#ifndef TRITONGPU_ATTRDEFS #define TRITONGPU_ATTRDEFS include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "triton/Dialect/Triton/IR/TritonInterfaces.td" //===----------------------------------------------------------------------===// // TritonGPU Attribute Definitions //===----------------------------------------------------------------------===// class TritonGPU_Attr traits = [], string baseCppClass = "::mlir::Attribute"> : AttrDef { let description = [{ TritonGPU Tensors differ from usual tensors in that they contain a _layout_ attribute which determines how the data should be partitioned across CUDA threads. Formally speaking, we define a layout as a function \mathcal{L} that maps a multi-dimensional tensor index $i \in \mathbb{Z}^d$ to a set of integers T corresponding to the indices of the CUDA threads allowed to access some data at index $i$. For example, let us consider the layout function: \mathcal{L}(0, 0) = {0, 4} \mathcal{L}(0, 1) = {1, 5} \mathcal{L}(1, 0) = {2, 6} \mathcal{L}(1, 1) = {3, 7} Then, attaching $\mathcal{L} to a tensor $T$ would mean that: - T[0,0] is owned by both cuda thread 0 and 4 - T[0,1] is owned by both cuda thread 1 and 5 - T[1,0] is owned by both cuda thread 2 and 6 - T[1,1] is owned by both cuda thread 3 and 7 Right now, Triton implements two classes of layouts: shared, and distributed. }]; code extraBaseClassDeclaration = [{ unsigned getElemsPerThread(ArrayRef shape) const; ::mlir::LogicalResult verifyLayoutForArg(::mlir::Operation* op, unsigned argNo) const; }]; } //===----------------------------------------------------------------------===// // Shared Layout Encoding //===----------------------------------------------------------------------===// def SharedEncodingAttr : TritonGPU_Attr<"SharedEncoding"> { let mnemonic = "shared"; let description = [{ An encoding for tensors whose elements may be simultaneously accessed by different cuda threads in the programs, via shared memory. In other words, for all indices i \in R^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}. In order to avoid shared memory bank conflicts, elements may be swizzled in memory. For example, a swizzled row-major layout could store its data as follows: A_{0, 0} A_{0, 1} A_{0, 2} A_{0, 3} ... [phase 0] \ per_phase = 2 A_{1, 0} A_{1, 1} A_{1, 2} A_{1, 3} ... [phase 0] / groups of vec=2 elements are stored contiguously _ _ _ _ /\_ _ _ _ A_{2, 2} A_{2, 3} A_{2, 0} A_{2, 1} ... [phase 1] \ per phase = 2 A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ... 
[phase 1] / }]; let parameters = ( ins // swizzle info "unsigned":$vec, "unsigned":$perPhase, "unsigned":$maxPhase, ArrayRefParameter<"unsigned", "order of axes by the rate of changing">:$order ); let builders = [ AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc, "ArrayRef":$shape, "ArrayRef":$order, "Type":$eltTy), [{ auto mmaEnc = dotOpEnc.getParent().dyn_cast(); if(!mmaEnc) return $_get(context, 1, 1, 1, order); int opIdx = dotOpEnc.getOpIdx(); // number of rows per phase int perPhase = 128 / (shape[order[0]] * (eltTy.getIntOrFloatBitWidth() / 8)); perPhase = std::max(perPhase, 1); // index of the inner dimension in `order` unsigned inner = (opIdx == 0) ? 0 : 1; // ---- begin Volta ---- if (mmaEnc.isVolta()) { bool is_row = order[0] != 0; bool is_vec4 = opIdx == 0 ? !is_row && (shape[order[0]] <= 16) : is_row && (shape[order[0]] <= 16); int pack_size = opIdx == 0 ? ((is_row || is_vec4) ? 1 : 2) : ((is_row && !is_vec4) ? 2 : 1); int rep = 2 * pack_size; int maxPhase = (order[inner] == 1 ? 8 : 4) / perPhase; int vec = 2 * rep; return $_get(context, vec, perPhase, maxPhase, order); } // ---- begin Ampere ---- if (mmaEnc.isAmpere()) { std::vector matShape = {8, 8, 2 * 64 / eltTy.getIntOrFloatBitWidth()}; // for now, disable swizzle when using transposed int8 tensor cores if (eltTy.isInteger(8) && order[0] == inner) return $_get(context, 1, 1, 1, order); // --- handle A operand --- if (opIdx == 0) { // compute swizzling for A operand int vec = (order[0] == 1) ? matShape[2] : matShape[0]; // k : m int mmaStride = (order[0] == 1) ? matShape[0] : matShape[2]; int maxPhase = mmaStride / perPhase; return $_get(context, vec, perPhase, maxPhase, order); } // --- handle B operand --- if (opIdx == 1) { int vec = (order[0] == 1) ? matShape[1] : matShape[2]; // n : k int mmaStride = (order[0] == 1) ? matShape[2] : matShape[1]; int maxPhase = mmaStride / perPhase; return $_get(context, vec, perPhase, maxPhase, order); } llvm_unreachable("invalid operand index"); } // ---- not implemented ---- llvm_unreachable("unsupported swizzling for provided MMA version"); }]> ]; let extraClassDeclaration = extraBaseClassDeclaration; } //===----------------------------------------------------------------------===// // Distributed Layout Encoding //===----------------------------------------------------------------------===// class DistributedEncoding : TritonGPU_Attr { let description = [{ Distributed encodings have a layout function that is entirely characterized by a d-dimensional tensor L. Note that L doesn't need to have the same shape (or even the same rank) as the tensor it is encoding. 
The layout function \mathcal{L} of this layout is then defined, for an index `i` \in R^D, as follows: \mathcal{L}(A)[i_d] = L[(i_d + k_d*A.shape[d]) % L.shape[d]] \forall k_d such as i_d + k_d*A.shape[d] < L.shape[d] For example, for a tensor/layout pair A = [x x x x x x x x] [x x x x x x x x] L = [0 1 2 3 ] [4 5 6 7 ] [8 9 10 11] [12 13 14 15] Then the data of A would be distributed as follow between the 16 CUDA threads: L(A) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11}, {4,12}, {5,13}, {6,14}, {7,15}, {4,12}, {5, 13}, {6, 14}, {7, 15} ] }]; let extraClassDeclaration = extraBaseClassDeclaration; } //===----------------------------------------------------------------------===// // Blocked Layout Encoding //===----------------------------------------------------------------------===// def BlockedEncodingAttr : DistributedEncoding<"BlockedEncoding"> { let mnemonic = "blocked"; let description = [{ An encoding where each warp owns a contiguous portion of the target tensor. This is typically the kind of data layout used to promote memory coalescing in LoadInst and StoreInst. It is characterized by three tuples -- thread tile size, warp tile size, and block tile size -- which specify the amount of elements owned by each CUDA thread, warp and CTA respectively. For example, a row-major coalesced layout may partition a 16x16 tensor over 2 warps (i.e. 64 threads) as follows. [ 0 0 1 1 2 2 3 3 ; 32 32 33 33 34 34 35 35 ] [ 0 0 1 1 2 2 3 3 ; 32 32 33 33 34 34 35 35 ] [ 4 4 5 5 6 6 7 7 ; 36 36 37 37 38 38 39 39 ] [ 4 4 5 5 6 6 7 7 ; 36 36 37 37 38 38 39 39 ] ... [ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ] [ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ] for #triton_gpu.blocked_layout<{ sizePerThread = {2, 2} threadsPerWarp = {8, 4} warpsPerCTA = {1, 2} }> }]; let builders = [ // Custom builder initializes sizePerWarp and sizePerCTA automatically // TODO: compiles on MacOS but not linux? // AttrBuilder<(ins "ArrayRef":$sizePerThread, // "ArrayRef":$threadsPerWarp, // "ArrayRef":$warpsPerCTA, // "ArrayRef":$order), [{ // int rank = threadsPerWarp.size(); // SmallVector sizePerWarp(rank); // SmallVector sizePerCTA(rank); // for (unsigned i = 0; i < rank; i++) { // sizePerWarp.push_back(sizePerThread[i] * threadsPerWarp[i]); // sizePerCTA.push_back(sizePerWarp[i] * warpsPerCTA[i]); // } // return $_get(context, sizePerThread, threadsPerWarp, warpsPerCTA, order, sizePerWarp, sizePerCTA); // }]>, // Custom builder initializes sizePerWarp and sizePerCTA automatically // Default builder takes sizePerThread, order and numWarps, and tries to // pack numWarps*32 threads in the provided order for use in a type // of the given shape. 
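// As a worked example (illustrative): with shape = [128, 64], sizePerThread = [1, 4],
// order = [1, 0] and numWarps = 4, the fastest-changing axis (1) is packed first with
// 64/4 = 16 lanes and 1 warp, leaving 2 lanes and 4 warps for axis 0, i.e.
// threadsPerWarp = [2, 16] and warpsPerCTA = [4, 1].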
AttrBuilder<(ins "ArrayRef":$shape, "ArrayRef":$sizePerThread, "ArrayRef":$order, "unsigned":$numWarps), [{ int rank = sizePerThread.size(); unsigned remainingLanes = 32; unsigned remainingThreads = numWarps*32; unsigned remainingWarps = numWarps; unsigned prevLanes = 1; unsigned prevWarps = 1; SmallVector threadsPerWarp(rank); SmallVector warpsPerCTA(rank); for (int _dim = 0; _dim < rank - 1; ++_dim) { int i = order[_dim]; unsigned threadsPerCTA = std::clamp(remainingThreads, 1, shape[i] / sizePerThread[i]); threadsPerWarp[i] = std::clamp(threadsPerCTA, 1, remainingLanes); warpsPerCTA[i] = std::clamp(threadsPerCTA / threadsPerWarp[i], 1, remainingWarps); remainingWarps /= warpsPerCTA[i]; remainingLanes /= threadsPerWarp[i]; remainingThreads /= threadsPerCTA; prevLanes *= threadsPerWarp[i]; prevWarps *= warpsPerCTA[i]; } // Expand the last dimension to fill the remaining lanes and warps threadsPerWarp[order[rank-1]] = 32 / prevLanes; warpsPerCTA[order[rank-1]] = numWarps / prevWarps; return $_get(context, sizePerThread, threadsPerWarp, warpsPerCTA, order); }]> ]; let extraClassDeclaration = extraBaseClassDeclaration # [{ SliceEncodingAttr squeeze(int axis); }]; let parameters = ( ins ArrayRefParameter<"unsigned">:$sizePerThread, ArrayRefParameter<"unsigned">:$threadsPerWarp, ArrayRefParameter<"unsigned">:$warpsPerCTA, // fastest-changing axis first ArrayRefParameter< "unsigned", "order of axes by the rate of changing" >:$order // These attributes can be inferred from the rest // ArrayRefParameter<"unsigned">:$sizePerWarp, // ArrayRefParameter<"unsigned">:$sizePerCTA ); } //===----------------------------------------------------------------------===// // MMA Layout Encoding //===----------------------------------------------------------------------===// // TODO: MMAv1 and MMAv2 should be two instances of the same class def MmaEncodingAttr : DistributedEncoding<"MmaEncoding"> { let mnemonic = "mma"; let description = [{ An encoding for tensors that have been produced by tensor cores. It is characterized by two parameters: - A 'versionMajor' which specifies the generation the tensor cores whose output is being partitioned: 1 for first-gen tensor cores (Volta), and 2 for second-gen tensor cores (Turing/Ampere). - A 'versionMinor' which indicates the specific layout of a tensor core generation, e.g. for Volta, there might be multiple kinds of layouts annotated by 0,1,2 and so on. - A `blockTileSize` to indicate how data should be partitioned between warps. // -------------------------------- version = 1 --------------------------- // For first-gen tensor cores, the implicit warpTileSize is [16, 16]. Note: the layout is different from the recommended in PTX ISA https://docs.nvidia.com/cuda/parallel-thread-execution/index.html (mma.884 section, FP32 accumulator). 
For example, when versionMinor=1, the matrix L corresponding to blockTileSize=[32,16] is: warp 0 --------------------------------/\------------------------------- [ 0 0 2 2 8 8 10 10 0 0 2 2 8 8 10 10 ] [ 1 1 3 3 9 9 11 11 1 1 3 3 9 9 11 11 ] [ 0 0 2 2 8 8 10 10 0 0 2 2 8 8 10 10 ] [ 1 1 3 3 9 9 11 11 1 1 3 3 9 9 11 11 ] [ 4 4 6 6 12 12 14 14 4 4 6 6 12 12 14 14 ] [ 5 5 7 7 13 13 15 15 5 5 7 7 13 13 15 15 ] [ 4 4 6 6 12 12 14 14 4 4 6 6 12 12 14 14 ] [ 5 5 7 7 13 13 15 15 5 5 7 7 13 13 15 15 ] [ 16 16 18 18 20 20 22 22 16 16 18 18 20 20 22 22 ] [ 17 17 19 19 21 21 23 23 17 17 19 19 21 21 23 23 ] [ 16 16 18 18 20 20 22 22 16 16 18 18 20 20 22 22 ] [ 17 17 19 19 21 21 23 23 17 17 19 19 21 21 23 23 ] [ 24 24 26 26 28 28 30 30 24 24 26 26 28 28 30 30 ] [ 25 25 27 27 29 29 31 31 25 25 27 27 29 29 31 31 ] [ 24 24 26 26 28 28 30 30 24 24 26 26 28 28 30 30 ] [ 25 25 27 27 29 29 31 31 25 25 27 27 29 29 31 31 ] warp 1 = warp0 + 32 --------------------------------/\------------------------------- [ 32 32 34 34 40 40 42 42 32 32 34 34 40 40 42 42 ] [ 33 33 35 35 41 41 43 43 33 33 35 35 41 41 43 43 ] [ ............................................................... ] // -------------------------------- version = 2 --------------------------- // For second-gen tensor cores, the implicit warpTileSize is [16, 8]. Information about this layout can be found in the official PTX documentation https://docs.nvidia.com/cuda/parallel-thread-execution/index.html (mma.16816 section, FP32 accumulator). For example, the matrix L corresponding to blockTileSize=[32,16] is: warp 0 warp 1 -----------------/\------------- ----------------/\------------- [ 0 0 1 1 2 2 3 3 32 32 33 33 34 34 35 35 [ 4 4 5 5 6 6 7 7 36 36 37 37 38 38 39 39 [ .............................. .............................. [ 28 28 29 29 30 30 31 31 60 60 61 61 62 62 63 63 [ 0 0 1 1 2 2 3 3 32 32 33 33 34 34 35 35 [ 4 4 5 5 6 6 7 7 36 36 37 37 38 38 39 39 [ .............................. .............................. [ 28 28 29 29 30 30 31 31 60 60 61 61 62 62 63 63 warp 3 warp 4 ----------------/\------------- ----------------/\------------- [ 64 64 65 65 66 66 67 67 96 96 97 97 98 98 99 99 [ 68 68 69 69 70 70 71 71 100 100 101 101 102 102 103 103 [ .............................. ............................... [ 92 92 93 93 94 94 95 95 124 124 125 125 126 126 127 127 [ 64 64 65 65 66 66 67 67 96 96 97 97 98 98 99 99 [ 68 68 69 69 70 70 71 71 100 100 101 101 102 102 103 103 [ .............................. ............................... 
[ 92 92 93 93 94 94 95 95 124 124 125 125 126 126 127 127 }]; let parameters = ( ins "unsigned":$versionMajor, "unsigned":$versionMinor, ArrayRefParameter<"unsigned">:$warpsPerCTA ); let builders = [ // Specially for MMAV1(Volta) AttrBuilder<(ins "int":$versionMajor, "int":$numWarps, "int":$id), [{ assert(versionMajor == 1 && "This builder is specially for versionMajor==1"); SmallVector wpt({static_cast(numWarps), 1}); int versionMinor = 0; assert(id < (1<((1<, // Specially for MMAV1(Volta) AttrBuilder<(ins "int":$versionMajor, "ArrayRef":$warpsPerCTA, "ArrayRef":$shapeA, "ArrayRef":$shapeB, "bool":$isARow, "bool":$isBRow, "int":$id), [{ assert(versionMajor == 1 && "This builder is specially for versionMajor==1"); bool isAVec4 = !isARow && (shapeA[isARow] <= 16); bool isBVec4 = isBRow && (shapeB[isBRow] <= 16); // 4-bits to encode 4 booleans: [isARow, isBRow, isAVec4, isBVec4] // 3-bits to encode the MMA ID to make each unique int versionMinor = (isARow * (1<<0)) |\ (isBRow * (1<<1)) |\ (isAVec4 * (1<<2)) |\ (isBVec4 * (1<<3)); assert(id < (1<((1< ]; let extraClassDeclaration = extraBaseClassDeclaration # [{ bool isVolta() const; bool isAmpere() const; // Get [isARow, isBRow, isAVec4, isBVec4, id] from versionMinor std::tuple decodeVoltaLayoutStates() const; // Number of bits in versionMinor to hold the ID of the MMA encoding instance. // Here 5 bits can hold 32 IDs in a single module. static constexpr int numBitsToHoldMmaV1ID{5}; }]; } def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> { let mnemonic = "slice"; let description = [{ TODO: improve docs A = [x x x x x x x x] parent = [0 1 2 3 ] [4 5 6 7 ] [8 9 10 11] [12 13 14 15] dim = 0 Then the data of A would be distributed as follow between the 16 CUDA threads: L(A) = [ {0,4,8,12} , {1,5,9,13} , ... {3,7,11,15}, {0,4,8,12} , ..., {3,7,11,15} ] This is useful for constructing the inverse layout of an expand_dims operation during some optimization passes. }]; let parameters = ( ins "unsigned":$dim, // TODO: constraint here to only take distributed encodings "Attribute":$parent ); let extraClassDeclaration = extraBaseClassDeclaration # [{ template SmallVector paddedShape(ArrayRef shape) const; }]; } def DotOperandEncodingAttr : DistributedEncoding<"DotOperandEncoding"> { let mnemonic = "dot_op"; let description = [{ In TritonGPU dialect, considering `d = tt.dot a, b, c` tt.dot's operands a and b must be of DotOperandEncodingAttr layout. a's opIdx is 0, b's opIdx is 1. The parend field in DotOperandEncodingAttr is the layout of d. For MMA v1, an additional attribute `isMMAv1Row` determines whether e.g. the a operand is used in the context of an mma.884.row.col or an mma.884.col.col operation. See the PTX ISA documentation section 9.7.13.4.1 for more details. 
}]; let parameters = ( ins "unsigned":$opIdx, "Attribute":$parent, "Attribute":$isMMAv1Row ); let builders = [ AttrBuilder<(ins "unsigned":$opIdx, "Attribute":$parent), [{ Attribute isMMAv1Row; if(parent.isa() && parent.cast().isVolta()){ isMMAv1Row = BoolAttr::get(context, true); } return $_get(context, opIdx, parent, isMMAv1Row); }]> ]; let extraClassDeclaration = extraBaseClassDeclaration; } #endif triton-2.0.0/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td000066400000000000000000000014711440023377100251620ustar00rootroot00000000000000#ifndef TRITONGPU_DIALECT #define TRITONGPU_DIALECT include "mlir/IR/OpBase.td" def TritonGPU_Dialect : Dialect { let name = "triton_gpu"; let cppNamespace = "::mlir::triton::gpu"; let hasOperationAttrVerify = 1; let description = [{ Triton GPU Dialect. }]; let dependentDialects = [ "triton::TritonDialect", "mlir::gpu::GPUDialect", "tensor::TensorDialect", ]; let extraClassDeclaration = [{ static std::string getNumWarpsAttrName() { return "triton_gpu.num-warps"; } static int getNumWarps(ModuleOp mod) { if(!mod->hasAttr("triton_gpu.num-warps")) llvm::report_fatal_error( "TritonGPU module should contain a triton_gpu.num-warps attribute"); return mod->getAttr("triton_gpu.num-warps").cast().getInt(); } }]; } #endif triton-2.0.0/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td000066400000000000000000000202701440023377100243540ustar00rootroot00000000000000#ifndef TRITONGPU_OPS #define TRITONGPU_OPS include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td" include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td" include "mlir/Dialect/Arithmetic/IR/ArithmeticBase.td" include "triton/Dialect/Triton/IR/TritonTypes.td" include "triton/Dialect/Triton/IR/TritonAttrDefs.td" include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" // NoSideEffect include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType def ResultsAreSharedEncoding: NativeOpTrait<"ResultsAreSharedEncoding">; class TTG_Op traits = []> : Op; def TTG_ConvertLayoutOp : TTG_Op<"convert_layout", [SameOperandsAndResultShape, SameOperandsAndResultElementType, NoSideEffect]> { let summary = "convert layout"; let arguments = (ins TT_Tensor:$src); let results = (outs TT_Tensor:$result); let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)"; } def TTG_AsyncWaitOp : TTG_Op<"async_wait"> { let summary = "async wait"; let arguments = (ins I32Attr:$num); let assemblyFormat = "attr-dict"; let extraClassDeclaration = [{ static bool isSupported(int computeCapability) { return computeCapability >= 80; } }]; } def TTG_AsyncCommitGroupOp : TTG_Op<"async_commit_group"> { let summary = "async commit group"; let assemblyFormat = "attr-dict"; let extraClassDeclaration = [{ static bool isSupported(int computeCapability) { return computeCapability >= 80; } }]; } // Port Arith_CmpIOp & Arith_CmpFOp & Std_SelectOp to TritonGPU. 
// This is needed because these ops don't // handle encodings // e.g., https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td#L111 def TTG_CmpIOp : TTG_Op<"cmpi", [NoSideEffect, Elementwise, SameOperandsAndResultShape, SameOperandsAndResultEncoding]> { let summary = "integer comparison operation"; let description = [{}]; let arguments = (ins Arith_CmpIPredicateAttr:$predicate, TT_IntLike:$lhs, TT_IntLike:$rhs); let results = (outs TT_BoolLike:$result); } def TTG_CmpFOp : TTG_Op<"cmpf", [NoSideEffect, Elementwise, SameOperandsAndResultShape, SameOperandsAndResultEncoding]> { let summary = "floating-point comparison operation"; let description = [{}]; let arguments = (ins Arith_CmpFPredicateAttr:$predicate, TT_FloatLike:$lhs, TT_FloatLike:$rhs); let results = (outs TT_BoolLike:$result); } // TODO: migrate to arith::SelectOp on LLVM16 def TTG_SelectOp : TTG_Op<"select", [NoSideEffect, Elementwise, SameOperandsAndResultShape, SameOperandsAndResultEncoding]> { let summary = "select operation"; let description = [{}]; let arguments = (ins TT_BoolLike:$condition, TT_Tensor:$true_value, TT_Tensor:$false_value); let results = (outs TT_Tensor:$result); } def TTG_InsertSliceAsyncOp : TTG_Op<"insert_slice_async", [AttrSizedOperandSegments, ResultsAreSharedEncoding, MemoryEffects<[MemRead]>, TypesMatchWith<"infer mask type from src type", "src", "mask", "getI1SameShape($_self)", "($_op.getOperands().size() <= 3) || std::equal_to<>()">, TypesMatchWith<"infer other type from src type", "src", "other", "getPointeeType($_self)", "($_op.getOperands().size() <= 4) || std::equal_to<>()">]> { let summary = "insert slice async"; let description = [{ This operation inserts a tensor `$src` into another tensor `$dst` as specified by the operation’s `$index` argument and `$axis` attribute. It returns a copy of `$dst` with the proper slice updated asynchronously with the value of `$src`. This operation is non-blocking, and `$results` will have the updated value after the corresponding async_wait. When converting from `tt.load` to `triton_gpu.insert_slice_async`, the `$evict`, `$cache`, and `$isVolatile` fields might be ignored on certain hardware. For example, on NVIDIA GPUs, the cache policy is determined by the backend, and `$evict` and `$isVolatile` are ignored because they apply to L1 cache only. The insert_slice_async operation supports the following arguments: * src: the tensor that is inserted. * dst: the tensor into which the `$src` tensor is inserted. * index: the index of the `$src` tensor at the given `$axis` from which the `$dst` tensor is inserted into * mask: optional tensor-rank number of boolean masks which specify which elements of the `$src` tensor are inserted into the `$dst` tensor. * other: optional tensor-rank number of other tensors which specify what values are inserted into the `$dst` tensor if the corresponding element of the `$mask` tensor is false. 
In the future, we may decompose this operation into a sequence of: * `async` operation to specify a sequence of asynchronous operations * `load` operation to load a tensor from global memory * `insert_slice` operations to insert the `$src` tensor into the `$dst` tensor Example: ``` %1 = triton_gpu.alloc_tensor : tensor<2x32xf32> %2 = triton_gpu.insert_slice_async %0, %1, %index { axis = 0 } : tensor<32x!tt.ptr, #AL> -> tensor<2x32xf32, #A> triton_gpu.async_wait { num = 0 : i32 } ``` }]; let arguments = (ins TT_PtrTensor:$src, TT_Tensor:$dst, I32:$index, Optional:$mask, Optional:$other, TT_CacheModifierAttr:$cache, TT_EvictionPolicyAttr:$evict, BoolAttr:$isVolatile, I32Attr:$axis); let builders = [ OpBuilder<(ins "Value":$src, "Value":$dst, "Value":$index, "triton::CacheModifier":$cache, "triton::EvictionPolicy":$evict, "bool":$isVolatile, "int":$axis)>, OpBuilder<(ins "Value":$src, "Value":$dst, "Value":$index, "Value":$mask, "triton::CacheModifier":$cache, "triton::EvictionPolicy":$evict, "bool":$isVolatile, "int":$axis)>, OpBuilder<(ins "Value":$src, "Value":$dst, "Value":$index, "Value":$mask, "Value":$other, "triton::CacheModifier":$cache, "triton::EvictionPolicy":$evict, "bool":$isVolatile, "int":$axis)>, ]; let results = (outs TT_Tensor:$result); //let assemblyFormat = [{ // $src `,` $dst `` // $index, $mask, $other // attr-dict `:` type($src) `->` type($dst) //}]; let extraClassDeclaration = [{ static DenseSet getEligibleLoadByteWidth(int computeCapability) { DenseSet validLoadBytes; if (computeCapability >= 80) { validLoadBytes = {4, 8, 16}; } return validLoadBytes; } }]; // The custom parser could be replaced with oilist in LLVM-16 let parser = [{ return parseInsertSliceAsyncOp(parser, result); }]; let printer = [{ return printInsertSliceAsyncOp(p, *this); }]; } def TTG_AllocTensorOp : TTG_Op<"alloc_tensor", [MemoryEffects<[MemAlloc]>, // Allocate shared memory ResultsAreSharedEncoding]> { let summary = "allocate tensor"; let description = [{ This operation defines a tensor of a particular shape. The contents of the tensor are supposed to be in shared memory. Note: This op can be replaced by a `bufferization.alloc_tensor` in LLVM 16.
}]; let assemblyFormat = [{attr-dict `:` type($result)}]; let results = (outs TT_Tensor:$result); } #endif triton-2.0.0/include/triton/Dialect/TritonGPU/Transforms/000077500000000000000000000000001440023377100233315ustar00rootroot00000000000000triton-2.0.0/include/triton/Dialect/TritonGPU/Transforms/CMakeLists.txt000066400000000000000000000002311440023377100260650ustar00rootroot00000000000000set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls -name TritonGPU) add_public_tablegen_target(TritonGPUTransformsIncGen) triton-2.0.0/include/triton/Dialect/TritonGPU/Transforms/Passes.h000066400000000000000000000016451440023377100247460ustar00rootroot00000000000000#ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_PASSES_H_ #include "mlir/Pass/Pass.h" namespace mlir { std::unique_ptr createTritonGPUPipelinePass(int numStages = 2); // TODO(Keren): prefetch pass not working yet std::unique_ptr createTritonGPUPrefetchPass(); std::unique_ptr createTritonGPUCanonicalizeLoopsPass(); std::unique_ptr createTritonGPUCoalescePass(); std::unique_ptr createTritonGPUReorderInstructionsPass(); std::unique_ptr createTritonGPUDecomposeConversionsPass(); std::unique_ptr createTritonGPUCombineOpsPass(int computeCapability = 80); std::unique_ptr createTritonGPUVerifier(); std::unique_ptr createTritonGPUUpdateMmaForVoltaPass(); /// Generate the code for registering passes. #define GEN_PASS_REGISTRATION #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" } // namespace mlir #endif triton-2.0.0/include/triton/Dialect/TritonGPU/Transforms/Passes.td000066400000000000000000000076121440023377100251260ustar00rootroot00000000000000#ifndef TRITONGPU_PASSES #define TRITONGPU_PASSES include "mlir/Pass/PassBase.td" def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> { let summary = "pipeline"; let description = [{ Unroll loops to hide global memory -> shared memory latency. }]; let constructor = "mlir::createTritonGPUPipelinePass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect", "mlir::scf::SCFDialect", "mlir::arith::ArithmeticDialect"]; let options = [ Option<"numStages", "num-stages", "int32_t", /*default*/"2", "number of pipeline stages"> ]; } def TritonGPUPrefetch : Pass<"tritongpu-prefetch", "mlir::ModuleOp"> { let summary = "prefetch"; let description = [{ Prefetch operands (a and b) of tt.dot into shared memory to hide shared memory -> register latency. 
}]; let constructor = "mlir::createTritonGPUPrefetchPass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect", "mlir::scf::SCFDialect", "mlir::arith::ArithmeticDialect"]; } def TritonGPUCoalesce: Pass<"tritongpu-coalesce", "mlir::ModuleOp"> { let summary = "coalesce"; let description = [{ TODO }]; let constructor = "mlir::createTritonGPUCoalescePass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"]; } def TritonGPUCombineOps : Pass<"tritongpu-combine", "mlir::ModuleOp"> { let summary = "combine triton gpu ops"; let description = [{ convert_layout(convert_layout(%src, #LAYOUT_0), #LAYOUT_1) => convert_layout(%src, #LAYOUT_1) convert_layout(%src, #LAYOUT) => %src if %src.layout() == #LAYOUT }]; let constructor = "mlir::createTritonGPUCombineOpsPass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect", "mlir::triton::TritonDialect"]; let options = [ Option<"computeCapability", "compute-capability", "int32_t", /*default*/"80", "device compute capability"> ]; } def TritonGPUReorderInstructions: Pass<"tritongpu-reorder-instructions", "mlir::ModuleOp"> { let summary = "Reorder instructions"; let description = "This pass reorder instructions so as to (1) decrease register pressure (e.g., by moving " "conversions from shared memory before their first use) and (2) promote LLVM instruction " "order more friendly to `ptxas`."; let constructor = "mlir::createTritonGPUReorderInstructionsPass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect", "mlir::triton::TritonDialect"]; } def TritonGPUDecomposeConversions: Pass<"tritongpu-decompose-conversions", "mlir::ModuleOp"> { let summary = "Decompose convert[distributed -> dotOperand] into convert[distributed -> shared -> dotOperand]"; let description = "Decomposing conversions this way makes it possible to use CSE and re-use #shared tensors"; let constructor = "mlir::createTritonGPUDecomposeConversionsPass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect", "mlir::triton::TritonDialect"]; } def TritonGPUCanonicalizeLoops: Pass<"tritongpu-canonicalize-loops", "mlir::ModuleOp"> { let summary = "canonicalize scf.ForOp ops"; let description = [{ This implements some optimizations that are missing in the standard scf.ForOp canonicalizer. }]; let constructor = "mlir::createTritonGPUCanonicalizeLoopsPass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"]; } def UpdateMmaForVolta : Pass<"tritongpu-update-mma-for-volta", "mlir::ModuleOp"> { let summary = "Update mma encodings for Volta"; let description = [{ This helps to update the mma encodings for Volta. }]; let constructor = "mlir::createTritonGPUUpdateMmaForVoltaPass()"; let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"]; } #endif triton-2.0.0/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h000066400000000000000000000017011440023377100274020ustar00rootroot00000000000000//===----------------------------------------------------------------------===// // // Defines utilities to use while converting to the TritonGPU dialect. 
// //===----------------------------------------------------------------------===// #ifndef TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ #define TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ #include "mlir/Transforms/DialectConversion.h" namespace mlir { class TritonGPUTypeConverter : public TypeConverter { public: TritonGPUTypeConverter(MLIRContext *context, int numWarps); int getNumWarps() const { return numWarps; } private: MLIRContext *context; int numWarps; }; class TritonGPUConversionTarget : public ConversionTarget { public: explicit TritonGPUConversionTarget(MLIRContext &ctx, TritonGPUTypeConverter &typeConverter); }; } // namespace mlir #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_TRITONGPUCONVERSION_H_ triton-2.0.0/include/triton/Target/000077500000000000000000000000001440023377100172215ustar00rootroot00000000000000triton-2.0.0/include/triton/Target/LLVMIR/000077500000000000000000000000001440023377100202265ustar00rootroot00000000000000triton-2.0.0/include/triton/Target/LLVMIR/LLVMIRTranslation.h000066400000000000000000000017751440023377100236350ustar00rootroot00000000000000#ifndef TRITON_TARGET_LLVMIRTRANSLATION_H #define TRITON_TARGET_LLVMIRTRANSLATION_H #include "llvm/ADT/StringRef.h" #include #include #include namespace llvm { class Module; class LLVMContext; } // namespace llvm namespace mlir { class ModuleOp; } // namespace mlir namespace mlir { namespace triton { // add external dependent libs void addExternalLibs(mlir::ModuleOp &module, const std::vector &names, const std::vector &paths); // Translate TritonGPU dialect to LLVMIR, return null if failed. std::unique_ptr translateTritonGPUToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module, int computeCapability); // Translate mlir LLVM dialect to LLVMIR, return null if failed. std::unique_ptr translateLLVMToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module); } // namespace triton } // namespace mlir #endif // TRITON_TARGET_LLVMIRTRANSLATION_H triton-2.0.0/include/triton/Target/PTX/000077500000000000000000000000001440023377100176745ustar00rootroot00000000000000triton-2.0.0/include/triton/Target/PTX/PTXTranslation.h000066400000000000000000000004751440023377100227450ustar00rootroot00000000000000#ifndef TRITON_TARGET_PTXTRANSLATION_H #define TRITON_TARGET_PTXTRANSLATION_H #include namespace llvm { class Module; } // namespace llvm namespace triton { // Translate TritonGPU IR to PTX code. std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version); } // namespace triton #endif triton-2.0.0/include/triton/Tools/000077500000000000000000000000001440023377100170735ustar00rootroot00000000000000triton-2.0.0/include/triton/Tools/Sys/000077500000000000000000000000001440023377100176515ustar00rootroot00000000000000triton-2.0.0/include/triton/Tools/Sys/GetEnv.hpp000066400000000000000000000027641440023377100215630ustar00rootroot00000000000000/* * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved. * * This file is part of ISAAC. * * ISAAC is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301 USA */ #ifndef TDL_TOOLS_SYS_GETENV_HPP #define TDL_TOOLS_SYS_GETENV_HPP #include #include #include namespace triton { namespace tools { inline std::string getenv(const char *name) { const char *cstr = std::getenv(name); if (!cstr) return ""; std::string result(cstr); return result; } inline bool getBoolEnv(const std::string &env) { const char *s = std::getenv(env.c_str()); std::string str(s ? s : ""); std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) { return std::tolower(c); }); return (str == "on" || str == "true" || str == "1"); } } // namespace tools } // namespace triton #endif triton-2.0.0/lib/000077500000000000000000000000001440023377100135775ustar00rootroot00000000000000triton-2.0.0/lib/Analysis/000077500000000000000000000000001440023377100153625ustar00rootroot00000000000000triton-2.0.0/lib/Analysis/Alias.cpp000066400000000000000000000037421440023377100171250ustar00rootroot00000000000000#include "triton/Analysis/Alias.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" namespace mlir { AliasInfo AliasInfo::join(const AliasInfo &lhs, const AliasInfo &rhs) { if (lhs == rhs) return lhs; AliasInfo ret; for (auto value : lhs.allocs) { ret.insert(value); } for (auto value : rhs.allocs) { ret.insert(value); } return ret; } ChangeResult SharedMemoryAliasAnalysis::visitOperation( Operation *op, ArrayRef *> operands) { AliasInfo aliasInfo; bool pessimistic = true; if (maybeSharedAllocationOp(op)) { // These ops may allocate a new shared memory buffer. 
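// Illustrative examples, drawn from the cases handled below: results of
// alloc_tensor or of a convert_layout into #shared own a fresh buffer,
// while extract_slice / trans / insert_slice_async only view (alias) a
// buffer that already exists.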
auto result = op->getResult(0); // XXX(Keren): the following ops are always aliasing for now if (isa(op)) { // extract_slice %src // trans %src aliasInfo = AliasInfo(operands[0]->getValue()); pessimistic = false; } else if (isa( op)) { // insert_slice_async %src, %dst, %index // insert_slice %src into %dst[%offsets] aliasInfo = AliasInfo(operands[1]->getValue()); pessimistic = false; } else if (isSharedEncoding(result)) { aliasInfo.insert(result); pessimistic = false; } } if (pessimistic) { return markAllPessimisticFixpoint(op->getResults()); } // Join all lattice elements ChangeResult result = ChangeResult::NoChange; for (Value value : op->getResults()) { result |= getLatticeElement(value).join(aliasInfo); } return result; } AliasResult SharedMemoryAliasAnalysis::alias(Value lhs, Value rhs) { // TODO: implement return AliasResult::MayAlias; } ModRefResult SharedMemoryAliasAnalysis::getModRef(Operation *op, Value location) { // TODO: implement return ModRefResult::getModAndRef(); } } // namespace mlir triton-2.0.0/lib/Analysis/Allocation.cpp000066400000000000000000000466561440023377100201740ustar00rootroot00000000000000#include "triton/Analysis/Allocation.h" #include "mlir/Analysis/Liveness.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "triton/Analysis/Alias.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "llvm/ADT/SmallVector.h" #include #include #include using ::mlir::triton::gpu::BlockedEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::getContigPerThread; using ::mlir::triton::gpu::getOrder; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getSizePerThread; using ::mlir::triton::gpu::MmaEncodingAttr; using ::mlir::triton::gpu::SharedEncodingAttr; using ::mlir::triton::gpu::SliceEncodingAttr; namespace mlir { //===----------------------------------------------------------------------===// // Shared Memory Allocation Analysis //===----------------------------------------------------------------------===// namespace triton { // Bitwidth of pointers constexpr int kPtrBitWidth = 64; static std::pair, SmallVector> getCvtOrder(const Attribute &srcLayout, const Attribute &dstLayout) { auto srcBlockedLayout = srcLayout.dyn_cast(); auto srcMmaLayout = srcLayout.dyn_cast(); auto srcDotLayout = srcLayout.dyn_cast(); auto dstBlockedLayout = dstLayout.dyn_cast(); auto dstMmaLayout = dstLayout.dyn_cast(); auto dstDotLayout = dstLayout.dyn_cast(); assert(!(srcMmaLayout && dstMmaLayout) && "Unexpected mma -> mma layout conversion"); // mma or dot layout does not have an order, so the order depends on the // layout of the other operand. auto inOrd = (srcMmaLayout || srcDotLayout) ? getOrder(dstLayout) : getOrder(srcLayout); auto outOrd = (dstMmaLayout || dstDotLayout) ? 
getOrder(srcLayout) : getOrder(dstLayout); return {inOrd, outOrd}; } SmallVector getScratchConfigForCvtLayout(triton::gpu::ConvertLayoutOp op, unsigned &inVec, unsigned &outVec) { auto srcTy = op.src().getType().cast(); auto dstTy = op.result().getType().cast(); Attribute srcLayout = srcTy.getEncoding(); Attribute dstLayout = dstTy.getEncoding(); // MmaToDotShortcut doesn't use shared mem if (auto mmaLayout = srcLayout.dyn_cast()) if (auto dotOperandLayout = dstLayout.dyn_cast()) if (isMmaToDotShortcut(mmaLayout, dotOperandLayout)) return {}; assert(srcLayout && dstLayout && "Unexpect layout in getScratchConfigForCvtLayout()"); auto [inOrd, outOrd] = getCvtOrder(srcLayout, dstLayout); unsigned srcContigPerThread = getContigPerThread(srcLayout)[inOrd[0]]; unsigned dstContigPerThread = getContigPerThread(dstLayout)[outOrd[0]]; // TODO: Fix the legacy issue that ourOrd[0] == 0 always means // that we cannot do vectorization. inVec = outOrd[0] == 0 ? 1 : inOrd[0] == 0 ? 1 : srcContigPerThread; outVec = outOrd[0] == 0 ? 1 : dstContigPerThread; auto srcShape = srcTy.getShape(); auto dstShape = dstTy.getShape(); auto srcShapePerCTA = getShapePerCTA(srcLayout, srcShape); auto dstShapePerCTA = getShapePerCTA(dstLayout, dstShape); unsigned rank = dstTy.getRank(); SmallVector paddedRepShape(rank); unsigned pad = std::max(inVec, outVec); for (unsigned d = 0; d < rank; ++d) { paddedRepShape[d] = std::max(std::min(srcTy.getShape()[d], srcShapePerCTA[d]), std::min(dstTy.getShape()[d], dstShapePerCTA[d])); } if (rank == 1) return paddedRepShape; unsigned paddedDim = 1; if (auto dstBlockedLayout = dstLayout.dyn_cast()) { paddedDim = dstBlockedLayout.getOrder()[0]; } paddedRepShape[paddedDim] += pad; return paddedRepShape; } // TODO: extend beyond scalars SmallVector getScratchConfigForAtomicRMW(triton::AtomicRMWOp op) { SmallVector smemShape; if (op.ptr().getType().isa()) { // do nothing or just assert because shared memory is not used in tensor up // to now } else { // need only bytes for scalar // always vec = 1 and elemsPerThread = 1 for scalar? smemShape.push_back(1); } return smemShape; } SmallVector getScratchConfigForAtomicCAS(triton::AtomicCASOp op) { return SmallVector{1}; } class AllocationAnalysis { public: AllocationAnalysis(Operation *operation, Allocation *allocation) : operation(operation), allocation(allocation) { run(); } private: using BufferT = Allocation::BufferT; /// Value -> Liveness Range /// Use MapVector to ensure determinism. using BufferRangeMapT = llvm::MapVector>; /// Nodes -> Nodes using GraphT = DenseMap>; void run() { getValuesAndSizes(); resolveLiveness(); computeOffsets(); } /// Initializes explicitly defined shared memory values for a given operation. void getExplicitValueSize(Operation *op) { // Values returned from scf.yield will not be allocated even though they // have the shared encoding. // For example: %a = scf.if -> yield // %a must be allocated elsewhere by other operations. // FIXME(Keren): extract and insert are always alias for now if (!maybeSharedAllocationOp(op) || maybeAliasOp(op)) { return; } for (Value result : op->getResults()) { if (isSharedEncoding(result)) { // Bytes could be a different value once we support padding or other // allocation policies. auto tensorType = result.getType().dyn_cast(); auto bytes = tensorType.getNumElements() * tensorType.getElementTypeBitWidth() / 8; allocation->addBuffer(result, bytes); } } } /// Initializes temporary shared memory for a given operation. 
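/// Illustrative note: the ops that currently need a scratch buffer are
/// tt.reduce, triton_gpu.convert_layout between two non-shared layouts,
/// and scalar tt.atomic_rmw / tt.atomic_cas, as the cases below show.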
void getScratchValueSize(Operation *op) { if (auto reduceOp = dyn_cast(op)) { ReduceOpHelper helper(reduceOp); unsigned bytes = helper.getScratchSizeInBytes(); allocation->addBuffer(op, bytes); } else if (auto cvtLayout = dyn_cast(op)) { auto srcTy = cvtLayout.src().getType().cast(); auto dstTy = cvtLayout.result().getType().cast(); auto srcEncoding = srcTy.getEncoding(); auto dstEncoding = dstTy.getEncoding(); if (srcEncoding.isa() || dstEncoding.isa()) { // Conversions from/to shared memory do not need scratch memory. return; } // ConvertLayoutOp with both input/output non-shared_layout // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's // also possible to realize it with other approaches in restricted // conditions, such as warp-shuffle unsigned inVec = 0; unsigned outVec = 0; auto smemShape = getScratchConfigForCvtLayout(cvtLayout, inVec, outVec); unsigned elems = std::accumulate(smemShape.begin(), smemShape.end(), 1, std::multiplies{}); auto bytes = srcTy.getElementType().isa() ? elems * kPtrBitWidth / 8 : elems * std::max(8, srcTy.getElementTypeBitWidth()) / 8; allocation->addBuffer(op, bytes); } else if (auto atomicRMWOp = dyn_cast(op)) { auto value = op->getOperand(0); // only scalar requires scratch memory // make it explicit for readability if (value.getType().dyn_cast()) { // nothing to do } else { auto smemShape = getScratchConfigForAtomicRMW(atomicRMWOp); unsigned elems = std::accumulate(smemShape.begin(), smemShape.end(), 1, std::multiplies{}); auto elemTy = value.getType().cast().getPointeeType(); auto bytes = elemTy.isa() ? elems * kPtrBitWidth / 8 : elems * std::max(8, elemTy.getIntOrFloatBitWidth()) / 8; allocation->addBuffer(op, bytes); } } else if (auto atomicCASOp = dyn_cast(op)) { auto value = op->getOperand(0); auto smemShape = getScratchConfigForAtomicCAS(atomicCASOp); unsigned elems = std::accumulate(smemShape.begin(), smemShape.end(), 1, std::multiplies{}); auto elemTy = value.getType().cast().getPointeeType(); auto bytes = elemTy.isa() ? elems * kPtrBitWidth / 8 : elems * elemTy.getIntOrFloatBitWidth() / 8; allocation->addBuffer(op, bytes); } } void getValueAlias(Value value, SharedMemoryAliasAnalysis &analysis) { LatticeElement *latticeElement = analysis.lookupLatticeElement(value); if (latticeElement) { auto &info = latticeElement->getValue(); if (!info.getAllocs().empty()) { for (auto alloc : info.getAllocs()) { allocation->addAlias(value, alloc); } } } } /// Extract all shared memory values and their sizes void getValuesAndSizes() { // Get the alloc values operation->walk([&](Operation *op) { getExplicitValueSize(op); getScratchValueSize(op); }); // Get the alias values SharedMemoryAliasAnalysis aliasAnalysis(operation->getContext()); aliasAnalysis.run(operation); operation->walk([&](Operation *op) { for (auto operand : op->getOperands()) { getValueAlias(operand, aliasAnalysis); } for (auto value : op->getResults()) { getValueAlias(value, aliasAnalysis); } }); } /// Computes the liveness range of the allocated value. /// Each buffer is allocated only once. void resolveExplicitBufferLiveness( function_ref(Value value)> getLiveness) { for (auto valueBufferIter : allocation->valueBuffer) { auto value = valueBufferIter.first; auto *buffer = valueBufferIter.second; bufferRange[buffer] = getLiveness(value); } } /// Extends the liveness range by unionizing the liveness range of the aliased /// values because each allocated buffer could be an alias of others, if block /// arguments are involved. 
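/// For example (illustrative): a buffer carried through scf.for iter_args is
/// reachable both as the result of its defining op and as the loop's block
/// argument, so its interval must be the union of the two live ranges.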
void resolveAliasBufferLiveness( function_ref(Value value)> getLiveness) { for (auto aliasBufferIter : allocation->aliasBuffer) { auto value = aliasBufferIter.first; auto buffers = aliasBufferIter.second; auto range = getLiveness(value); for (auto *buffer : buffers) { auto minId = range.start(); auto maxId = range.end(); if (bufferRange.count(buffer)) { // Extend the allocated buffer's range minId = std::min(minId, bufferRange[buffer].start()); maxId = std::max(maxId, bufferRange[buffer].end()); } bufferRange[buffer] = Interval(minId, maxId); } } } /// Computes the liveness range of scratched buffers. /// Some operations may have a temporary buffer that is not explicitly /// allocated, but is used to store intermediate results. void resolveScratchBufferLiveness( const DenseMap &operationId) { // Analyze liveness of scratch buffers for (auto opScratchIter : allocation->opScratch) { // Any scratch memory's live range is the current operation's live // range. auto *op = opScratchIter.first; auto *buffer = opScratchIter.second; bufferRange.insert({buffer, Interval(operationId.lookup(op), operationId.lookup(op) + 1)}); } } /// Resolves liveness of all values involved under the root operation. void resolveLiveness() { // Assign an ID to each operation using post-order traversal. // To achieve the correct liveness range, the parent operation's ID // should be greater than each of its child operations' IDs. // Example: // ... // %5 = triton.convert_layout %4 // %6 = scf.for ... iter_args(%arg0 = %0) -> (i32) { // %2 = triton.convert_layout %5 // ... // scf.yield %arg0 // } // Here, %5 is defined in the parent region and used in // the child region, and is not passed as a block argument. // %6 should have an ID greater than its child operations, // otherwise %5's liveness range ends before the child operation's liveness // range ends. DenseMap operationId; operation->walk( [&](Operation *op) { operationId[op] = operationId.size(); }); // Analyze liveness of explicit buffers Liveness liveness(operation); auto getValueLivenessRange = [&](Value value) { auto liveOperations = liveness.resolveLiveness(value); auto minId = std::numeric_limits::max(); auto maxId = std::numeric_limits::min(); std::for_each(liveOperations.begin(), liveOperations.end(), [&](Operation *liveOp) { if (operationId[liveOp] < minId) { minId = operationId[liveOp]; } if ((operationId[liveOp] + 1) > maxId) { maxId = operationId[liveOp] + 1; } }); return Interval(minId, maxId); }; resolveExplicitBufferLiveness(getValueLivenessRange); resolveAliasBufferLiveness(getValueLivenessRange); resolveScratchBufferLiveness(operationId); } /// Computes the shared memory offsets for all related values. /// Paper: Algorithms for Compile-Time Memory Optimization /// (https://www.cs.utexas.edu/users/harrison/papers/compile-time.pdf) void computeOffsets() { SmallVector buffers; for (auto bufferIter : bufferRange) { buffers.emplace_back(bufferIter.first); } DenseMap bufferStart; calculateStarts(buffers, bufferStart); GraphT interference; buildInterferenceGraph(buffers, bufferStart, interference); allocate(buffers, bufferStart, interference); } /// Computes the initial shared memory offsets.
void calculateStarts(const SmallVector &buffers, DenseMap &bufferStart) { // v = values in shared memory // t = triplet of (size, start, end) // shared memory space // - // | *******t4 // | /|\ v2 inserts t4, t5, and t6 // | | // | ******t5 ************t6 // | ^^^^^v2^^^^^^ // | | *********************t2 // | \|/ v2 erases t1 // | ******t1 ^^^^^^^^^v1^^^^^^^^^ ************t3 // |---------------------------------------------| liveness range // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ... /// Start -> Liveness Range using TripleMapT = std::multimap>; TripleMapT tripleMap; tripleMap.insert(std::make_pair(0, Interval())); SmallVector xBuffers = buffers; while (!xBuffers.empty()) { auto tripleIt = tripleMap.begin(); auto size = tripleIt->first; auto range = tripleIt->second; tripleMap.erase(tripleIt); auto bufferIt = std::find_if(xBuffers.begin(), xBuffers.end(), [&](auto *buffer) { auto xRange = bufferRange[buffer]; bool res = xRange.intersects(range); for (auto val : tripleMap) res = res && !val.second.intersects(xRange); return res; }); if (bufferIt != xBuffers.end()) { auto buffer = *bufferIt; auto xSize = buffer->size; auto xRange = bufferRange.lookup(buffer); bufferStart[buffer] = size; tripleMap.insert( {size + xSize, Interval{std::max(range.start(), xRange.start()), std::min(range.end(), xRange.end())}}); if (range.start() < xRange.start()) tripleMap.insert({size, Interval{range.start(), xRange.end()}}); if (xRange.end() < range.end()) tripleMap.insert({size, Interval{xRange.start(), range.end()}}); xBuffers.erase(bufferIt); } } } /// Builds a graph of all shared memory values. Edges are created between /// shared memory values that are overlapping. void buildInterferenceGraph(const SmallVector &buffers, const DenseMap &bufferStart, GraphT &interference) { for (auto x : buffers) { for (auto y : buffers) { if (x == y) continue; auto xStart = bufferStart.lookup(x); auto yStart = bufferStart.lookup(y); auto xSize = x->size; auto ySize = y->size; Interval xSizeRange = {xStart, xStart + xSize}; Interval ySizeRange = {yStart, yStart + ySize}; auto xOpRange = bufferRange.lookup(x); auto yOpRange = bufferRange.lookup(y); if (xOpRange.intersects(yOpRange) && xSizeRange.intersects(ySizeRange)) { interference[x].insert(y); } } } } /// Finalizes shared memory offsets considering interference. void allocate(const SmallVector &buffers, const DenseMap &bufferStart, const GraphT &interference) { // First-fit graph coloring // Neighbors are nodes that interfere with each other. // We color a node by finding the index of the first available // non-neighboring node or the first neighboring node without any color. // Nodes with the same color do not interfere with each other. DenseMap colors; for (auto value : buffers) { colors[value] = (value == buffers[0]) ? 0 : -1; } SmallVector available(buffers.size()); for (auto x : buffers) { std::fill(available.begin(), available.end(), true); for (auto y : interference.lookup(x)) { int color = colors[y]; if (color >= 0) { available[color] = false; } } auto it = std::find(available.begin(), available.end(), true); colors[x] = std::distance(available.begin(), it); } // Finalize allocation // color0: [0, 7), [0, 8), [0, 15) -> [0, 7), [0, 8), [0, 15) // color1: [7, 9) -> [0 + 1 * 15, 9 + 1 * 15) -> [15, 24) // color2: [8, 12) -> [8 + 2 * 15, 12 + 2 * 15) -> [38, 42) // TODO(Keren): We are wasting memory here. // Nodes with color2 can actually start with 24. 
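// In other words, the loop below computes x->offset = bufferStart[x] +
// color[x] * adj, where adj is the largest (start + size) among x's
// interfering neighbors (15 in the example above).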
for (auto x : buffers) { size_t adj = 0; for (auto y : interference.lookup(x)) { adj = std::max(adj, bufferStart.lookup(y) + y->size); } x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj; allocation->sharedMemorySize = std::max(allocation->sharedMemorySize, x->offset + x->size); } } private: Operation *operation; Allocation *allocation; BufferRangeMapT bufferRange; }; } // namespace triton void Allocation::run() { triton::AllocationAnalysis(getOperation(), this); } } // namespace mlir triton-2.0.0/lib/Analysis/AxisInfo.cpp000066400000000000000000001027611440023377100176150ustar00rootroot00000000000000#include "mlir/Analysis/DataFlowAnalysis.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "llvm/Support/raw_ostream.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" namespace mlir { // Function for extended Euclidean Algorithm static int64_t gcdImpl(int64_t a, int64_t b, int64_t *x, int64_t *y) { // Base Case if (a == 0) { *x = 0; *y = 1; return b; } int64_t x1, y1; // To store results of recursive call int64_t gcd = gcdImpl(b % a, a, &x1, &y1); // Update x and y using results of // recursive call *x = y1 - (b / a) * x1; *y = x1; return gcd; } static int64_t gcd(int64_t a, int64_t b) { if (a == 0) return b; if (b == 0) return a; int64_t x, y; return gcdImpl(a, b, &x, &y); } static constexpr int log2Int(int64_t num) { return (num > 1) ? 1 + log2Int(num / 2) : 0; } //===----------------------------------------------------------------------===// // AxisInfo //===----------------------------------------------------------------------===// AxisInfo AxisInfo::getPessimisticValueState(Value value) { auto rank = 1; if (TensorType ty = value.getType().dyn_cast()) rank = ty.getRank(); auto contiHint = 1; auto divHint = 1; auto constHint = 1; BlockArgument blockArg = value.dyn_cast(); if (blockArg && blockArg.getOwner()->isEntryBlock()) { Operation *op = blockArg.getOwner()->getParentOp(); if (FuncOp fun = dyn_cast(op)) { Attribute attr = fun.getArgAttr(blockArg.getArgNumber(), "tt.divisibility"); if (attr) divHint = attr.cast().getValue().getZExtValue(); } else if (auto fun = dyn_cast(op)) { Attribute attr = fun.getArgAttr(blockArg.getArgNumber(), "tt.divisibility"); if (attr) divHint = attr.cast().getValue().getZExtValue(); } else { // Derive the divisibility of the induction variable only when // the step and the lower bound are both constants if (auto forOp = dyn_cast(op)) { if (blockArg == forOp.getInductionVar()) { if (auto lowerBound = forOp.getLowerBound().getDefiningOp()) { if (auto step = forOp.getStep().getDefiningOp()) { auto lowerBoundVal = lowerBound.getValue() .cast() .getValue() .getZExtValue(); auto stepVal = step.getValue().cast().getValue().getZExtValue(); auto k = gcd(lowerBoundVal, stepVal); if (k != 0) divHint = k; } } } } } } return AxisInfo(/*knownContiguity=*/DimVectorT(rank, contiHint), /*knownDivisibility=*/DimVectorT(rank, divHint), /*knownConstancy=*/DimVectorT(rank, constHint)); } // The gcd of both arguments for each dimension AxisInfo AxisInfo::join(const AxisInfo &lhs, const AxisInfo &rhs) { DimVectorT contiguity; DimVectorT divisibility; DimVectorT constancy; for (auto d = 0; d < lhs.getRank(); ++d) { contiguity.push_back(gcd(lhs.getContiguity(d), rhs.getContiguity(d))); divisibility.push_back(gcd(lhs.getDivisibility(d), rhs.getDivisibility(d))); constancy.push_back(gcd(lhs.getConstancy(d), rhs.getConstancy(d))); } std::optional constantValue; if 
(lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value() && lhs.getConstantValue() == rhs.getConstantValue()) constantValue = lhs.getConstantValue(); return AxisInfo(contiguity, divisibility, constancy, constantValue); } //===----------------------------------------------------------------------===// // AxisInfoVisitor //===----------------------------------------------------------------------===// template class CastOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(OpTy op, ArrayRef *> operands) override { return operands[0]->getValue(); } }; class MakeRangeOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(triton::MakeRangeOp op, ArrayRef *> operands) override { auto start = op.start(); auto end = op.end(); return AxisInfo(/*contiguity=*/{end - start}, /*divisibility=*/{highestPowOf2Divisor(start)}, /*constancy=*/{1}); } }; class ConstantOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(arith::ConstantOp op, ArrayRef *> operands) override { auto intAttr = op.getValue().dyn_cast(); auto boolAttr = op.getValue().dyn_cast(); if (intAttr || boolAttr) { int64_t value{}; if (intAttr) value = intAttr.getValue().getZExtValue(); else value = boolAttr.getValue() ? 1 : 0; return AxisInfo(/*contiguity=*/{1}, /*divisibility=*/{highestPowOf2Divisor(value)}, /*constancy=*/{1}, /*knownConstantValue=*/{value}); } // TODO: generalize to dense attr auto splatAttr = op.getValue().dyn_cast(); if (splatAttr && splatAttr.getElementType().isIntOrIndex()) { int64_t value = splatAttr.getSplatValue().getZExtValue(); TensorType ty = splatAttr.getType().cast(); return AxisInfo( /*contiguity=*/AxisInfo::DimVectorT(ty.getRank(), 1), /*divisibility=*/ AxisInfo::DimVectorT(ty.getRank(), highestPowOf2Divisor(value)), /*constancy=*/ AxisInfo::DimVectorT(ty.getShape().begin(), ty.getShape().end()), /*knownConstantValue=*/{value}); } return AxisInfo(); } }; template class AddSubOpAxisInfoVisitor final : public BinaryOpVisitorImpl { public: using BinaryOpVisitorImpl::BinaryOpVisitorImpl; private: int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { return std::max(gcd(lhs.getConstancy(dim), rhs.getContiguity(dim)), gcd(lhs.getContiguity(dim), rhs.getConstancy(dim))); } int64_t getDivisibility(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { // lhs = k * d_lhs = k * k' * gcd(d_lhs, d_rhs) // rhs = p * d_rhs = p * p' * gcd(d_lhs, d_rhs) // lhs + rhs = k * d_lhs + p * d_rhs = (k * d_lhs + p * d_rhs) * // gcd(d_lhs, d_rhs) return gcd(lhs.getDivisibility(dim), rhs.getDivisibility(dim)); } int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim)); } std::optional getConstantValue(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs) override { if (lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value()) { if constexpr (std::is_same_v || std::is_same_v) { return {lhs.getConstantValue().value() + rhs.getConstantValue().value()}; } else if constexpr (std::is_same_v) { return {lhs.getConstantValue().value() - rhs.getConstantValue().value()}; } } return {}; } }; class MulIOpAxisInfoVisitor final : public BinaryOpVisitorImpl { public: using BinaryOpVisitorImpl::BinaryOpVisitorImpl; private: int64_t 
getContiguity(arith::MulIOp op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { // lhs * 1 = lhs auto lhsContiguity = rhs.getConstantValue().has_value() && rhs.getConstantValue() == 1 ? lhs.getContiguity(dim) : 1; // 1 * rhs = rhs auto rhsContiguity = lhs.getConstantValue().has_value() && lhs.getConstantValue() == 1 ? rhs.getContiguity(dim) : 1; return std::max(lhsContiguity, rhsContiguity); } int64_t getConstancy(arith::MulIOp op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim)); } int64_t getDivisibility(arith::MulIOp op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { // lhs = k * d_lhs // rhs = p * d_rhs // lhs * rhs = k * d_lhs * p * d_rhs = k * p * d_lhs * d_rhs return lhs.getDivisibility(dim) * rhs.getDivisibility(dim); } std::optional getConstantValue(arith::MulIOp op, const AxisInfo &lhs, const AxisInfo &rhs) override { if (lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value()) return {lhs.getConstantValue().value() * rhs.getConstantValue().value()}; return {}; } }; template class DivOpAxisInfoVisitor final : public BinaryOpVisitorImpl { public: using BinaryOpVisitorImpl::BinaryOpVisitorImpl; private: int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { // lhs / 1 = lhs return rhs.getConstantValue().has_value() && rhs.getConstantValue().value() == 1 ? lhs.getContiguity(dim) : 1; } int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { auto resTy = op.getResult().getType().template dyn_cast(); if (!resTy) return BinaryOpVisitorImpl::getConstancy(op, lhs, rhs, dim); auto shape = resTy.getShape(); // Case 1: both lhs and rhs are constants. auto constancy = gcd(lhs.getConstancy(dim), rhs.getConstancy(dim)); // Case 2: lhs contiguous, rhs constant. // lhs: d_lhs * k, d_lhs * k + 1, ..., d_lhs * k + n // rhs: d_rhs * p, d_rhs * p, ..., d_rhs * p // lhs / rhs = d_lhs * k / (d_rhs * p), (d_lhs * k + 1) / (d_rhs * p), // ..., (d_lhs * k + n) / (d_rhs * p) // Because d_lhs % d_rhs = 0 || d_rhs % d_lhs = 0, // the minimal constancy is gcd(d_lhs, d_rhs). // Since gcd(d_lhs, d_rhs) maybe > len(lhs), // we need to use another gcd to get the actual constancy. 
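// Worked example (illustrative): lhs = 8, 9, ..., 15 (contiguous, d_lhs = 8)
// and rhs = 4 (constant, d_rhs = 4) give lhs / rhs = 2, 2, 2, 2, 3, 3, 3, 3,
// i.e. constant runs of length gcd(d_lhs, d_rhs) = 4.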
if (AxisInfoVisitor::isContiguousDim(lhs, shape, dim) && AxisInfoVisitor::isConstantDim(rhs, shape, dim)) { constancy = std::max(constancy, gcd(lhs.getContiguity(dim), gcd(lhs.getDivisibility(dim), rhs.getDivisibility(dim)))); } return constancy; } int64_t getDivisibility(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { // Case 1: lhs is 0 if (lhs.getConstantValue().has_value() && lhs.getConstantValue().value() == 0) return lhs.getDivisibility(dim); // Case 2: rhs is constant if (rhs.getConstantValue().has_value()) { auto lhsDivisibility = lhs.getDivisibility(dim); auto rhsValue = rhs.getConstantValue().value(); if (lhsDivisibility % rhsValue == 0) return lhsDivisibility / rhsValue; } // Case 3: both are not constant return 1; } std::optional getConstantValue(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs) override { if (lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value()) return {lhs.getConstantValue().value() / rhs.getConstantValue().value()}; return {}; } }; template class RemOpAxisInfoVisitor final : public BinaryOpVisitorImpl { public: using BinaryOpVisitorImpl::BinaryOpVisitorImpl; private: int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { auto resTy = op.getResult().getType().template dyn_cast(); if (!resTy) return BinaryOpVisitorImpl::getContiguity(op, lhs, rhs, dim); auto shape = resTy.getShape(); int64_t contiguity = 1; // lhs contiguous, rhs constant // lhs: d_lhs * k, d_lhs * k + 1, ..., d_lhs * k + n // rhs: d_rhs * p, d_rhs * p, ..., d_rhs * p // lhs % rhs = d_lhs * k % (d_rhs * p), (d_lhs * k + 1) % (d_rhs * p), // ..., (d_lhs * k + n) % (d_rhs * p) // Because d_lhs % d_rhs = 0 || d_rhs % d_lhs = 0, // The minimal contiguity is gcd(d_lhs, d_rhs). // Since gcd(d_lhs, d_rhs) maybe > len(lhs), // we need to use another gcd to get the actual contiguity. if (AxisInfoVisitor::isContiguousDim(lhs, shape, dim) && AxisInfoVisitor::isConstantDim(rhs, shape, dim)) { contiguity = std::max(contiguity, gcd(lhs.getContiguity(dim), gcd(lhs.getDivisibility(dim), rhs.getDivisibility(dim)))); } return contiguity; } int64_t getDivisibility(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { // lhs: d_lhs * k = gcd(d_lhs, d_rhs) * k' * k = gcd(d_lhs, d_rhs) * k'' // rhs: d_rhs * p = gcd(d_lhs, d_rhs) * p' * p = gcd(d_lhs, d_rhs) * p'' // lhs = gcd(d_lhs, d_rhs) * k'' = gcd(d_lhs, d_rhs) * d + r // r must be divisible by gcd(d_lhs, d_rhs) return gcd(lhs.getDivisibility(dim), rhs.getDivisibility(dim)); }; int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { auto resTy = op.getResult().getType().template dyn_cast(); if (!resTy) return BinaryOpVisitorImpl::getConstancy(op, lhs, rhs, dim); auto shape = resTy.getShape(); // lhs % 1 = 0 return rhs.getConstantValue().has_value() && rhs.getConstantValue().value() == 1 ? 
shape[dim] : gcd(lhs.getConstancy(dim), rhs.getConstancy(dim)); } std::optional getConstantValue(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs) override { if (lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value()) return {lhs.getConstantValue().value() % rhs.getConstantValue().value()}; else if (rhs.getConstantValue().has_value() && rhs.getConstantValue().value() == 1) return {0}; return {}; } }; class SplatOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(triton::SplatOp op, ArrayRef *> operands) override { Type _retTy = *op->result_type_begin(); TensorType retTy = _retTy.cast(); AxisInfo opInfo = operands[0]->getValue(); AxisInfo::DimVectorT contiguity; AxisInfo::DimVectorT divisibility; AxisInfo::DimVectorT constancy; for (int d = 0; d < retTy.getRank(); ++d) { contiguity.push_back(1); divisibility.push_back(opInfo.getDivisibility(0)); constancy.push_back(retTy.getShape()[d]); } return AxisInfo(contiguity, divisibility, constancy, operands[0]->getValue().getConstantValue()); } }; class ExpandDimsOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(triton::ExpandDimsOp op, ArrayRef *> operands) override { AxisInfo opInfo = operands[0]->getValue(); AxisInfo::DimVectorT contiguity = opInfo.getContiguity(); AxisInfo::DimVectorT divisibility = opInfo.getDivisibility(); AxisInfo::DimVectorT constancy = opInfo.getConstancy(); contiguity.insert(contiguity.begin() + op.axis(), 1); divisibility.insert(divisibility.begin() + op.axis(), 1); constancy.insert(constancy.begin() + op.axis(), 1); return AxisInfo(contiguity, divisibility, constancy, operands[0]->getValue().getConstantValue()); } }; class BroadcastOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(triton::BroadcastOp op, ArrayRef *> operands) override { Type _retTy = *op->result_type_begin(); Type _opTy = *op->operand_type_begin(); TensorType retTy = _retTy.cast(); TensorType opTy = _opTy.cast(); ArrayRef retShape = retTy.getShape(); ArrayRef opShape = opTy.getShape(); AxisInfo opInfo = operands[0]->getValue(); AxisInfo::DimVectorT contiguity; AxisInfo::DimVectorT divisibility; AxisInfo::DimVectorT constancy; for (int d = 0; d < retTy.getRank(); ++d) { contiguity.push_back(opShape[d] == 1 ? 1 : opInfo.getContiguity(d)); divisibility.push_back(opInfo.getDivisibility(d)); constancy.push_back(opShape[d] == 1 ? retShape[d] : opInfo.getConstancy(d)); } return AxisInfo(contiguity, divisibility, constancy, operands[0]->getValue().getConstantValue()); } }; template class CmpOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(OpTy op, ArrayRef *> operands) override { auto resTy = op.getResult().getType().template dyn_cast(); if (!resTy) return AxisInfo(); auto shape = resTy.getShape(); short rank = resTy.getRank(); auto lhsInfo = operands[0]->getValue(); auto rhsInfo = operands[1]->getValue(); AxisInfo::DimVectorT contiguity, divisibility, constancy; std::optional constantValue; for (short d = 0; d < rank; ++d) { int64_t constHint = 1; if (lhsInfo.getConstantValue().has_value() && rhsInfo.getConstantValue().has_value()) { constHint = lhsInfo.getConstancy(d); constantValue = compare(getPredicate(op), lhsInfo.getConstantValue().value(), rhsInfo.getConstantValue().value()) ? 
1 : 0; } else { // Case 1: lhs and rhs are both partial constants constHint = gcd(lhsInfo.getConstancy(d), rhsInfo.getConstancy(d)); // Case 2: lhs all constant, rhs all contiguous // NOTE: // lhs: 4 4 4 4 // rhs: 4 5 6 7 // lhs ge rhs: 1, 0, 0, 0 // Case 3: lhs all contiguous, rhs all constant // NOTE // lhs: 4 5 6 7 // rhs: 4 4 4 4 // lhs sle rhs: 1, 0, 0, 0 if (/*Case 2=*/( notGePredicate(getPredicate(op)) && (AxisInfoVisitor::isConstantDim(lhsInfo, shape, d) && AxisInfoVisitor::isContiguousDim(rhsInfo, shape, d))) || /*Case 3=*/(notLePredicate(getPredicate(op)) && (AxisInfoVisitor::isContiguousDim(lhsInfo, shape, d) && AxisInfoVisitor::isConstantDim(rhsInfo, shape, d)))) { constHint = std::max(constHint, gcd(lhsInfo.getContiguity(d), gcd(lhsInfo.getDivisibility(d), rhsInfo.getDivisibility(d)))); } } constancy.push_back(constHint); divisibility.push_back(1); contiguity.push_back(1); } return AxisInfo(contiguity, divisibility, constancy, constantValue); } private: static arith::CmpIPredicate getPredicate(triton::gpu::CmpIOp op) { return op.predicate(); } static arith::CmpIPredicate getPredicate(arith::CmpIOp op) { return op.getPredicate(); } static bool notGePredicate(arith::CmpIPredicate predicate) { return predicate != arith::CmpIPredicate::sge && predicate != arith::CmpIPredicate::uge; } static bool notLePredicate(arith::CmpIPredicate predicate) { return predicate != arith::CmpIPredicate::sle && predicate != arith::CmpIPredicate::ule; } static bool compare(arith::CmpIPredicate predicate, int64_t lhs, int64_t rhs) { switch (predicate) { case arith::CmpIPredicate::eq: return lhs == rhs; case arith::CmpIPredicate::ne: return lhs != rhs; case arith::CmpIPredicate::slt: return lhs < rhs; case arith::CmpIPredicate::sle: return lhs <= rhs; case arith::CmpIPredicate::sgt: return lhs > rhs; case arith::CmpIPredicate::sge: return lhs >= rhs; case arith::CmpIPredicate::ult: return (uint64_t)lhs < (uint64_t)rhs; case arith::CmpIPredicate::ule: return (uint64_t)lhs <= (uint64_t)rhs; case arith::CmpIPredicate::ugt: return (uint64_t)lhs > (uint64_t)rhs; case arith::CmpIPredicate::uge: return (uint64_t)lhs >= (uint64_t)rhs; default: break; } llvm_unreachable("unknown comparison predicate"); } }; template class SelectOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(OpTy op, ArrayRef *> operands) override { auto resTy = op.getResult().getType().template dyn_cast(); if (!resTy) return AxisInfo(); auto shape = resTy.getShape(); auto rank = shape.size(); auto condConstancy = operands[0]->getValue().getConstancy(); auto lhsInfo = operands[1]->getValue(); auto rhsInfo = operands[2]->getValue(); AxisInfo::DimVectorT contiguity, divisibility, constancy; std::optional constantValue; if (operands[0]->getValue().getConstantValue().has_value()) { if (operands[0]->getValue().getConstantValue() == 0) { contiguity = rhsInfo.getContiguity(); divisibility = rhsInfo.getDivisibility(); constancy = rhsInfo.getConstancy(); constantValue = rhsInfo.getConstantValue(); } else { contiguity = lhsInfo.getContiguity(); divisibility = lhsInfo.getDivisibility(); constancy = lhsInfo.getConstancy(); constantValue = lhsInfo.getConstantValue(); } } else { for (auto d = 0; d < rank; ++d) { constancy.push_back( std::min(gcd(lhsInfo.getConstancy(d), condConstancy[d]), gcd(rhsInfo.getConstancy(d), condConstancy[d]))); divisibility.push_back( std::min(lhsInfo.getDivisibility(d), rhsInfo.getDivisibility(d))); contiguity.push_back( 
std::min(gcd(lhsInfo.getContiguity(d), condConstancy[d]), gcd(rhsInfo.getContiguity(d), condConstancy[d]))); } if (lhsInfo.getConstantValue().has_value() && rhsInfo.getConstantValue().has_value() && lhsInfo.getConstantValue() == rhsInfo.getConstantValue()) constantValue = lhsInfo.getConstantValue(); } return AxisInfo(contiguity, divisibility, constancy, constantValue); } }; template class LogicalOpAxisInfoVisitor final : public BinaryOpVisitorImpl { public: using BinaryOpVisitorImpl::BinaryOpVisitorImpl; private: int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim)); } std::optional getConstantValue(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs) override { if (lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value()) { if constexpr (std::is_same::value) { return {lhs.getConstantValue().value() & rhs.getConstantValue().value()}; } else if constexpr (std::is_same::value) { return {lhs.getConstantValue().value() | rhs.getConstantValue().value()}; } else if constexpr (std::is_same::value) { return {lhs.getConstantValue().value() ^ rhs.getConstantValue().value()}; } } return {}; } }; class ShLIOpAxisInfoVisitor final : public BinaryOpVisitorImpl { public: using BinaryOpVisitorImpl::BinaryOpVisitorImpl; private: int64_t getContiguity(arith::ShLIOp op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { if (rhs.getConstantValue().has_value() && rhs.getConstantValue().value() == 0) return lhs.getContiguity(dim); else return 1; } int64_t getDivisibility(arith::ShLIOp op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { auto shift = rhs.getConstantValue().has_value() ? rhs.getConstantValue().value() : rhs.getDivisibility(dim); auto numBits = log2Int(lhs.getDivisibility(dim)); auto maxBits = log2Int(highestPowOf2Divisor(0)); // Make sure the return value doesn't exceed highestPowOf2Divisor(0) if (shift + numBits > maxBits) return highestPowOf2Divisor(0); return lhs.getDivisibility(dim) << shift; } int64_t getConstancy(arith::ShLIOp op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim)); } std::optional getConstantValue(arith::ShLIOp op, const AxisInfo &lhs, const AxisInfo &rhs) override { if (lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value()) return {lhs.getConstantValue().value() << rhs.getConstantValue().value()}; return {}; } }; template class ShROpAxisInfoVisitor final : public BinaryOpVisitorImpl { public: using BinaryOpVisitorImpl::BinaryOpVisitorImpl; private: int64_t getContiguity(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { if (rhs.getConstantValue().has_value() && rhs.getConstantValue().value() == 0) return lhs.getContiguity(dim); else return 1; } int64_t getDivisibility(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { if (rhs.getConstantValue().has_value()) return std::max(1, lhs.getDivisibility(dim) / (1 << rhs.getConstantValue().value())); else return std::max(1, lhs.getDivisibility(dim) / (1 << rhs.getDivisibility(dim))); } int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs, int dim) override { return gcd(lhs.getConstancy(dim), rhs.getConstancy(dim)); } std::optional getConstantValue(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs) override { if (lhs.getConstantValue().has_value() && rhs.getConstantValue().has_value()) return {lhs.getConstantValue().value() >> rhs.getConstantValue().value()}; 
return {}; } }; template class MaxMinOpAxisInfoVisitor final : public AxisInfoVisitorImpl { public: using AxisInfoVisitorImpl::AxisInfoVisitorImpl; AxisInfo getAxisInfo(OpTy op, ArrayRef *> operands) override { auto lhsInfo = operands[0]->getValue(); auto rhsInfo = operands[1]->getValue(); std::optional constantValue; if (lhsInfo.getConstantValue().has_value() && rhsInfo.getConstantValue().has_value()) { if constexpr (std::is_same_v || std::is_same_v) { constantValue = {std::max(lhsInfo.getConstantValue().value(), rhsInfo.getConstantValue().value())}; } else if constexpr (std::is_same_v || std::is_same_v) { constantValue = {std::min(lhsInfo.getConstantValue().value(), rhsInfo.getConstantValue().value())}; } } auto rank = lhsInfo.getRank(); return AxisInfo(/*knownContiguity=*/AxisInfo::DimVectorT(rank, 1), /*knownDivisibility=*/AxisInfo::DimVectorT(rank, 1), /*knownConstancy=*/AxisInfo::DimVectorT(rank, 1), /*constantValue=*/constantValue); } }; //===----------------------------------------------------------------------===// // AxisInfoAnalysis //===----------------------------------------------------------------------===// AxisInfoAnalysis::AxisInfoAnalysis(MLIRContext *context) : ForwardDataFlowAnalysis(context) { // UnrealizedConversionCast: // This is needed by TritonGPUToLLVM, to get AxisInfo when the graph is // in the process of a PartialConversion, where UnrealizedConversionCast // may exist visitors.append, CastOpAxisInfoVisitor, CastOpAxisInfoVisitor, CastOpAxisInfoVisitor, CastOpAxisInfoVisitor, CastOpAxisInfoVisitor, CastOpAxisInfoVisitor, CastOpAxisInfoVisitor, CastOpAxisInfoVisitor>(); visitors.append(); visitors.append(); visitors.append, AddSubOpAxisInfoVisitor, AddSubOpAxisInfoVisitor>(); visitors.append(); visitors.append, DivOpAxisInfoVisitor>(); visitors.append, RemOpAxisInfoVisitor>(); visitors.append(); visitors.append(); visitors.append(); visitors.append, CmpOpAxisInfoVisitor>(); visitors.append, LogicalOpAxisInfoVisitor, LogicalOpAxisInfoVisitor>(); visitors.append, SelectOpAxisInfoVisitor>(); visitors.append, ShROpAxisInfoVisitor>(); visitors.append, MaxMinOpAxisInfoVisitor, MaxMinOpAxisInfoVisitor, MaxMinOpAxisInfoVisitor>(); } ChangeResult AxisInfoAnalysis::visitOperation( Operation *op, ArrayRef *> operands) { AxisInfo curr = visitors.apply(op, operands); if (curr.getRank() == 0) { return markAllPessimisticFixpoint(op->getResults()); } // join all lattice elements ChangeResult result = ChangeResult::NoChange; for (Value value : op->getResults()) { result |= getLatticeElement(value).join(curr); } return result; } unsigned AxisInfoAnalysis::getPtrContiguity(Value ptr) { auto tensorTy = ptr.getType().dyn_cast(); if (!tensorTy) return 1; auto layout = tensorTy.getEncoding(); auto shape = tensorTy.getShape(); // Here order should be ordered by contiguous first, so the first element // should have the largest contiguous. 
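// e.g. for a row-major 2D blocked layout, order = [1, 0]: order[0] is the
// fastest-varying dimension and therefore the one with the largest
// contiguity.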
auto order = triton::gpu::getOrder(layout); unsigned align = getPtrAlignment(ptr); unsigned contigPerThread = triton::gpu::getSizePerThread(layout)[order[0]]; contigPerThread = std::min(align, contigPerThread); contigPerThread = std::min(shape[order[0]], contigPerThread); return contigPerThread; } unsigned AxisInfoAnalysis::getPtrAlignment(Value ptr) { auto tensorTy = ptr.getType().dyn_cast(); if (!tensorTy) return 1; auto axisInfo = lookupLatticeElement(ptr)->getValue(); auto layout = tensorTy.getEncoding(); auto order = triton::gpu::getOrder(layout); auto maxMultipleBytes = axisInfo.getDivisibility(order[0]); auto maxContig = axisInfo.getContiguity(order[0]); auto elemNumBits = getPointeeBitWidth(tensorTy); auto elemNumBytes = std::max(elemNumBits / 8, 1); auto maxMultiple = std::max(maxMultipleBytes / elemNumBytes, 1); unsigned alignment = std::min(maxMultiple, maxContig); return alignment; } unsigned AxisInfoAnalysis::getMaskAlignment(Value mask) { auto tensorTy = mask.getType().dyn_cast(); if (!tensorTy) return 1; auto maskOrder = triton::gpu::getOrder(tensorTy.getEncoding()); auto maskAxis = lookupLatticeElement(mask)->getValue(); auto alignment = std::max(maskAxis.getConstancy(maskOrder[0]), 1); return alignment; } } // namespace mlir triton-2.0.0/lib/Analysis/CMakeLists.txt000066400000000000000000000003041440023377100201170ustar00rootroot00000000000000add_mlir_library(TritonAnalysis AxisInfo.cpp Allocation.cpp Membar.cpp Alias.cpp Utility.cpp DEPENDS TritonTableGen TritonGPUAttrDefsIncGen LINK_LIBS PUBLIC MLIRAnalysis ) triton-2.0.0/lib/Analysis/Membar.cpp000066400000000000000000000115101440023377100172670ustar00rootroot00000000000000#include "triton/Analysis/Membar.h" #include "triton/Analysis/Alias.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" namespace mlir { void MembarAnalysis::run() { auto *operation = allocation->getOperation(); RegionInfo regionInfo; OpBuilder builder(operation); dfsOperation(operation, ®ionInfo, &builder); } void MembarAnalysis::dfsOperation(Operation *operation, RegionInfo *parentRegionInfo, OpBuilder *builder) { transfer(operation, parentRegionInfo, builder); if (operation->getNumRegions()) { // If there's any nested regions, we need to visit them. // scf.if and scf.else: two regions // scf.if only: two regions // scf.for: one region RegionInfo curRegionInfo; auto traverseRegions = [&]() -> auto{ for (auto ®ion : operation->getRegions()) { // Copy the parent info as the current info. RegionInfo regionInfo = *parentRegionInfo; for (auto &block : region.getBlocks()) { // assert(region.getBlocks().size() == 1 && // "Multiple blocks in a region is not supported"); for (auto &op : block.getOperations()) { // Traverse the nested operation. dfsOperation(&op, ®ionInfo, builder); } } curRegionInfo.join(regionInfo); } // Set the parent region info as the union of the nested region info. *parentRegionInfo = curRegionInfo; }; traverseRegions(); if (isa(operation)) { // scf.for can have two possible inputs: the init value and the // previous iteration's result. Although we've applied alias analysis, // there could be unsynced memory accesses on reused memories. // For example, consider the following code: // %1 = convert_layout %0: blocked -> shared // ... // gpu.barrier // ... // %5 = convert_layout %4 : shared -> dot // %6 = tt.dot %2, %5 // scf.yield // // Though %5 could be released before scf.yield, it may shared the same // memory with %1. 
So we actually have to insert a barrier before %1 to // make sure the memory is synced. traverseRegions(); } } } void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo, OpBuilder *builder) { if (isa(op) || isa(op) || isa(op) || isa(op) || isa(op)) { // Do not insert barriers before control flow operations and // alloc/extract/insert // alloc is an allocation op without memory write. // FIXME(Keren): extract_slice is always alias for now return; } if (isa(op)) { // If the current op is a barrier, we sync previous reads and writes regionInfo->sync(); return; } if (isa(op) && !isa(op->getNextNode())) { // If the current op is an async wait and the next op is not a barrier we // insert a barrier op and sync regionInfo->sync(); OpBuilder::InsertionGuard g(*builder); builder->setInsertionPointAfter(op); builder->create(op->getLoc()); regionInfo->sync(); return; } RegionInfo curRegionInfo; for (Value value : op->getOperands()) { for (auto bufferId : allocation->getBufferIds(value)) { if (bufferId != Allocation::InvalidBufferId) { if (isa(op) || isa(op)) { // FIXME(Keren): insert_slice and insert_slice_async are always alias // for now curRegionInfo.syncWriteBuffers.insert(bufferId); } else { // ConvertLayoutOp: shared memory -> registers curRegionInfo.syncReadBuffers.insert(bufferId); } } } } for (Value value : op->getResults()) { // ConvertLayoutOp: registers -> shared memory auto bufferId = allocation->getBufferId(value); if (bufferId != Allocation::InvalidBufferId) { curRegionInfo.syncWriteBuffers.insert(bufferId); } } // Scratch buffer is considered as both shared memory write & read auto bufferId = allocation->getBufferId(op); if (bufferId != Allocation::InvalidBufferId) { curRegionInfo.syncWriteBuffers.insert(bufferId); curRegionInfo.syncReadBuffers.insert(bufferId); } if (regionInfo->isIntersected(curRegionInfo, allocation)) { OpBuilder::InsertionGuard g(*builder); builder->setInsertionPoint(op); builder->create(op->getLoc()); regionInfo->sync(); } // Update the region info, even if barrier is inserted, we have to maintain // the current op's read/write buffers. 
regionInfo->join(curRegionInfo); } } // namespace mlir triton-2.0.0/lib/Analysis/Utility.cpp000066400000000000000000000257111440023377100175370ustar00rootroot00000000000000#include "triton/Analysis/Utility.h" #include "mlir/IR/Dialect.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include namespace mlir { bool ReduceOpHelper::isFastReduction() { auto srcLayout = srcTy.getEncoding(); auto axis = op.axis(); return axis == triton::gpu::getOrder(srcLayout)[0]; } unsigned ReduceOpHelper::getInterWarpSize() { auto srcLayout = srcTy.getEncoding(); auto srcShape = srcTy.getShape(); auto axis = op.axis(); auto srcReduceDimSize = static_cast(srcShape[axis]); unsigned sizeIntraWarps = getIntraWarpSize(); return std::min(srcReduceDimSize / sizeIntraWarps, triton::gpu::getWarpsPerCTA(srcLayout)[axis]); } unsigned ReduceOpHelper::getIntraWarpSize() { auto srcLayout = srcTy.getEncoding(); auto srcShape = srcTy.getShape(); auto axis = op.axis(); auto srcReduceDimSize = static_cast(srcShape[axis]); return std::min(srcReduceDimSize, triton::gpu::getThreadsPerWarp(srcLayout)[axis]); } unsigned ReduceOpHelper::getThreadsReductionAxis() { auto srcLayout = srcTy.getEncoding(); auto axis = op.axis(); return triton::gpu::getThreadsPerWarp(srcLayout)[axis] * triton::gpu::getWarpsPerCTA(srcLayout)[axis]; } SmallVector ReduceOpHelper::getScratchConfigBasic() { auto axis = op.axis(); auto smemShape = convertType(getSrcShape()); smemShape[axis] = std::min(smemShape[axis], getThreadsReductionAxis()); return smemShape; } SmallVector> ReduceOpHelper::getScratchConfigsFast() { auto axis = op.axis(); SmallVector> smemShapes(3); auto argLayout = srcTy.getEncoding(); auto argLayoutMma = argLayout.dyn_cast(); if (argLayoutMma && argLayoutMma.getVersionMajor() == 2 && triton::gpu::getWarpsPerCTA(argLayout)[axis] == 1) return {{1, 1}, {1, 1}}; /// shared memory block0 smemShapes[0] = convertType(getSrcShape()); smemShapes[0][axis] = getInterWarpSize(); /// FIXME(Qingyi): This size is actually larger than required. /// shared memory block1: auto mod = op.getOperation()->getParentOfType(); unsigned numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); smemShapes[1].push_back(numWarps * 32); return smemShapes; } unsigned ReduceOpHelper::getScratchSizeInBytes() { unsigned elems = 0; if (isFastReduction()) { auto smemShapes = getScratchConfigsFast(); for (const auto &smemShape : smemShapes) elems = std::max(elems, product(smemShape)); } else { auto smemShape = getScratchConfigBasic(); elems = product(smemShape); } auto tensorType = op.operand().getType().cast(); unsigned bytes = elems * tensorType.getElementTypeBitWidth() / 8; if (triton::ReduceOp::withIndex(op.redOp())) bytes += elems * sizeof(int32_t); return bytes; } bool isSharedEncoding(Value value) { auto type = value.getType(); if (auto tensorType = type.dyn_cast()) { auto encoding = tensorType.getEncoding(); return encoding && encoding.isa(); } return false; } bool maybeSharedAllocationOp(Operation *op) { // TODO(Keren): This function can be replaced by adding // MemoryEffectOpInterface. We can then use the MemoryEffectOpInterface to // query the memory effects of the op. 
auto *dialect = op->getDialect(); return dialect && (dialect->getTypeID() == mlir::TypeID::get() || dialect->getTypeID() == mlir::TypeID::get() || dialect->getTypeID() == mlir::TypeID::get() || dialect->getTypeID() == mlir::TypeID::get()); } bool maybeAliasOp(Operation *op) { return isa(op) || isa(op) || isa(op) || isa(op); } bool supportMMA(triton::DotOp op, int version) { // Refer to mma section for the data type supported by Volta and Hopper // Tensor Core in // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16 auto aElemTy = op.a().getType().cast().getElementType(); auto bElemTy = op.b().getType().cast().getElementType(); if (aElemTy.isF32() && bElemTy.isF32()) { return op.allowTF32() && version >= 2; } return supportMMA(op.a(), version) && supportMMA(op.b(), version); } bool supportMMA(Value value, int version) { // Tell whether a DotOp support HMMA by the operand type(either $a or $b). // We cannot get both the operand types(in TypeConverter), here we assume the // types of both the operands are identical here. assert((version == 1 || version == 2) && "Unexpected MMA layout version found"); auto elemTy = value.getType().cast().getElementType(); return elemTy.isF16() || elemTy.isBF16() || (elemTy.isF32() && version >= 2) || (elemTy.isInteger(8) && version >= 2); } Type getElementType(Value value) { auto type = value.getType(); if (auto tensorType = type.dyn_cast()) return tensorType.getElementType(); return type; } std::string getValueOperandName(Value value, AsmState &state) { std::string opName; llvm::raw_string_ostream ss(opName); value.printAsOperand(ss, state); return opName; } bool isMmaToDotShortcut(triton::gpu::MmaEncodingAttr &mmaLayout, triton::gpu::DotOperandEncodingAttr &dotOperandLayout) { // dot_op = #mma // when #mma = MmaEncoding return mmaLayout.getVersionMajor() == 2 && mmaLayout.getWarpsPerCTA()[1] == 1 && dotOperandLayout.getOpIdx() == 0 && dotOperandLayout.getParent() == mmaLayout; } bool isSingleValue(Value value) { // Don't consider load as expensive if it is loading a scalar. if (auto tensorTy = value.getType().dyn_cast()) return tensorTy.getNumElements() == 1; // TODO: Handle other cases. // For example, when ptr is a tensor of single value. // It means that ptr is a resultant of broadcast or generated through // a chain of broadcast and other operations. // Rematerialize it without considering contiguous memory access pattern is // fine. return true; } namespace { /// A data structure similar to SetVector but maintains /// a deque instead of a vector to allow for efficient /// push_back and pop_front operations. /// Using SetVector doesn't suffice our needs because /// it only pushes and pops from the back. /// For example, if we have a queue like this: /// 0->4 1->2->3 /// ^-------- /// where 3 depends on 4, once we pop 3, we found /// 4 is not ready, so we check 2 and push 3 back /// to the queue. struct DFSSubgraphState { DFSSubgraphState() : set(), deque() {} DenseSet set; std::deque deque; bool push_back(Operation *op) { if (set.insert(op).second) { deque.push_back(op); return true; } return false; } Operation *pop_front() { Operation *op = deque.front(); deque.pop_front(); set.erase(op); return op; } bool empty() { return deque.empty(); } }; /// DFS post-order implementation that maintains a global count to work across /// multiple invocations, to help implement topological sort on multi-root DAGs. 
/// We traverse all operations but only record the ones that appear in /// `toSort` for the final result. struct DFSState { DFSState(const SetVector &set) : toSort(set), seen() {} const SetVector &toSort; SmallVector topologicalCounts; DenseSet seen; /// We mark each op as ready if all its operands are seen. If an op is ready, /// we add it to the queue. Otherwise, we keep adding its operands to the /// ancestors set. void addToReadyQueue(Operation *op, DFSSubgraphState &subGraph, SmallVector &readyQueue) { bool ready = true; for (Value operand : op->getOperands()) { auto def = operand.getDefiningOp(); if (def && !seen.count(def)) { subGraph.push_back(def); ready = false; } } if (ready) readyQueue.push_back(op); } }; void dfsPostorder(Operation *root, DFSState *state) { DFSSubgraphState subGraph; subGraph.push_back(root); SmallVector ops; while (!subGraph.empty()) { // Nodes in the ready queue are ready to be processed. // Meaning that either their operands are all seen or they have null // operands. SmallVector readyQueue; auto *current = subGraph.pop_front(); state->addToReadyQueue(current, subGraph, readyQueue); while (!readyQueue.empty()) { Operation *current = readyQueue.pop_back_val(); if (!state->seen.insert(current).second) continue; ops.push_back(current); for (Value result : current->getResults()) { for (Operation *op : result.getUsers()) state->addToReadyQueue(op, subGraph, readyQueue); } for (Region ®ion : current->getRegions()) { for (Operation &op : region.getOps()) state->addToReadyQueue(&op, subGraph, readyQueue); } } } for (Operation *op : llvm::reverse(ops)) { if (state->toSort.count(op) > 0) state->topologicalCounts.push_back(op); } } } // namespace SetVector multiRootTopologicalSort(const SetVector &toSort) { if (toSort.empty()) { return toSort; } // Run from each root with global count and `seen` set. DFSState state(toSort); for (auto *s : toSort) { assert(toSort.count(s) == 1 && "NYI: multi-sets not supported"); dfsPostorder(s, &state); } // Reorder and return. SetVector res; for (auto it = state.topologicalCounts.rbegin(), eit = state.topologicalCounts.rend(); it != eit; ++it) { res.insert(*it); } return res; } SetVector multiRootGetSlice(Operation *op, TransitiveFilter backwardFilter, TransitiveFilter forwardFilter) { SetVector slice; slice.insert(op); unsigned currentIndex = 0; SetVector backwardSlice; SetVector forwardSlice; while (currentIndex != slice.size()) { auto *currentOp = (slice)[currentIndex]; // Compute and insert the backwardSlice starting from currentOp. backwardSlice.clear(); getBackwardSlice(currentOp, &backwardSlice, backwardFilter); slice.insert(backwardSlice.begin(), backwardSlice.end()); // Compute and insert the forwardSlice starting from currentOp. 
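    // As with the backward slice, any newly discovered ops are appended to
    // `slice`, so the enclosing while-loop keeps expanding the set until it
    // reaches a fixed point; the combined slice is topologically sorted on
    // return.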
forwardSlice.clear(); getForwardSlice(currentOp, &forwardSlice, forwardFilter); slice.insert(forwardSlice.begin(), forwardSlice.end()); ++currentIndex; } return multiRootTopologicalSort(slice); } } // namespace mlir triton-2.0.0/lib/CMakeLists.txt000066400000000000000000000002071440023377100163360ustar00rootroot00000000000000# add_subdirectory(codegen) add_subdirectory(Analysis) add_subdirectory(Conversion) add_subdirectory(Dialect) add_subdirectory(Target) triton-2.0.0/lib/Conversion/000077500000000000000000000000001440023377100157245ustar00rootroot00000000000000triton-2.0.0/lib/Conversion/CMakeLists.txt000066400000000000000000000001061440023377100204610ustar00rootroot00000000000000add_subdirectory(TritonToTritonGPU) add_subdirectory(TritonGPUToLLVM) triton-2.0.0/lib/Conversion/TritonGPUToLLVM/000077500000000000000000000000001440023377100205555ustar00rootroot00000000000000triton-2.0.0/lib/Conversion/TritonGPUToLLVM/CMakeLists.txt000066400000000000000000000012211440023377100233110ustar00rootroot00000000000000add_mlir_conversion_library(TritonGPUToLLVM ConvertLayoutOpToLLVM.cpp DotOpToLLVM.cpp ElementwiseOpToLLVM.cpp LoadStoreOpToLLVM.cpp TritonGPUToLLVM.cpp TritonGPUToLLVMPass.cpp PTXAsmFormat.cpp ReduceOpToLLVM.cpp Utility.cpp ViewOpToLLVM.cpp DotOpHelpers.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/triton/Conversion/TritonGPUToLLVM DEPENDS TritonConversionPassIncGen LINK_COMPONENTS Core LINK_LIBS PUBLIC MLIRIR MLIRPass MLIRGPUOps MLIRGPUToNVVMTransforms MLIRGPUTransforms TritonAnalysis TritonIR TritonGPUIR TritonGPUTransforms ) triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp000066400000000000000000000700741440023377100254040ustar00rootroot00000000000000#include "ConvertLayoutOpToLLVM.h" #include "DotOpHelpers.h" #include "Utility.h" using ::mlir::LLVM::DotOpFMAConversionHelper; using ::mlir::LLVM::DotOpMmaV1ConversionHelper; using ::mlir::LLVM::getElementsFromStruct; using ::mlir::LLVM::getSharedMemoryObjectFromStruct; using ::mlir::LLVM::getStridesFromShapeAndOrder; using ::mlir::LLVM::getStructFromElements; using ::mlir::LLVM::MMA16816ConversionHelper; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::getContigPerThread; using ::mlir::triton::gpu::getElemsPerThread; using ::mlir::triton::gpu::getOrder; using ::mlir::triton::gpu::getShapePerCTA; using ::mlir::triton::gpu::getSizePerThread; using ::mlir::triton::gpu::isaDistributedLayout; using ::mlir::triton::gpu::SharedEncodingAttr; struct ConvertLayoutOpConversion : public ConvertTritonGPUOpToLLVMPattern { public: using ConvertTritonGPUOpToLLVMPattern< triton::gpu::ConvertLayoutOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value src = op.src(); Value dst = op.result(); auto srcTy = src.getType().cast(); auto dstTy = dst.getType().cast(); Attribute srcLayout = srcTy.getEncoding(); Attribute dstLayout = dstTy.getEncoding(); if (isaDistributedLayout(srcLayout) && dstLayout.isa()) { return lowerDistributedToShared(op, adaptor, rewriter); } if (srcLayout.isa() && dstLayout.isa()) { return lowerSharedToDotOperand(op, adaptor, rewriter); } if (isaDistributedLayout(srcLayout) && isaDistributedLayout(dstLayout)) { return lowerDistributedToDistributed(op, adaptor, rewriter); } if (srcLayout.isa() && dstLayout.isa()) { return lowerMmaToDotOperand(op, adaptor, rewriter); } // TODO: to be implemented llvm_unreachable("unsupported layout conversion"); return 
failure(); } private: SmallVector getMultiDimOffset(Attribute layout, Location loc, ConversionPatternRewriter &rewriter, unsigned elemId, ArrayRef shape, ArrayRef multiDimCTAInRepId, ArrayRef shapePerCTA) const { unsigned rank = shape.size(); if (auto blockedLayout = layout.dyn_cast()) { auto multiDimOffsetFirstElem = emitBaseIndexForLayout(loc, rewriter, blockedLayout, shape); SmallVector multiDimOffset(rank); SmallVector multiDimElemId = getMultiDimIndex( elemId, getSizePerThread(layout), getOrder(layout)); for (unsigned d = 0; d < rank; ++d) { multiDimOffset[d] = add(multiDimOffsetFirstElem[d], idx_val(multiDimCTAInRepId[d] * shapePerCTA[d] + multiDimElemId[d])); } return multiDimOffset; } if (auto sliceLayout = layout.dyn_cast()) { unsigned dim = sliceLayout.getDim(); auto multiDimOffsetParent = getMultiDimOffset(sliceLayout.getParent(), loc, rewriter, elemId, sliceLayout.paddedShape(shape), sliceLayout.paddedShape(multiDimCTAInRepId), sliceLayout.paddedShape(shapePerCTA)); SmallVector multiDimOffset(rank); for (unsigned d = 0; d < rank + 1; ++d) { if (d == dim) continue; unsigned slicedD = d < dim ? d : (d - 1); multiDimOffset[slicedD] = multiDimOffsetParent[d]; } return multiDimOffset; } if (auto mmaLayout = layout.dyn_cast()) { SmallVector mmaColIdx(4); SmallVector mmaRowIdx(2); Value threadId = getThreadId(rewriter, loc); Value warpSize = idx_val(32); Value laneId = urem(threadId, warpSize); Value warpId = udiv(threadId, warpSize); // TODO: fix the bug in MMAEncodingAttr document SmallVector multiDimWarpId(2); multiDimWarpId[0] = urem(warpId, idx_val(mmaLayout.getWarpsPerCTA()[0])); multiDimWarpId[1] = udiv(warpId, idx_val(mmaLayout.getWarpsPerCTA()[0])); Value _1 = idx_val(1); Value _2 = idx_val(2); Value _4 = idx_val(4); Value _8 = idx_val(8); Value _16 = idx_val(16); if (mmaLayout.isAmpere()) { multiDimWarpId[0] = urem(multiDimWarpId[0], idx_val(shape[0] / 16)); multiDimWarpId[1] = urem(multiDimWarpId[1], idx_val(shape[1] / 8)); Value mmaGrpId = udiv(laneId, _4); Value mmaGrpIdP8 = add(mmaGrpId, _8); Value mmaThreadIdInGrp = urem(laneId, _4); Value mmaThreadIdInGrpM2 = mul(mmaThreadIdInGrp, _2); Value mmaThreadIdInGrpM2P1 = add(mmaThreadIdInGrpM2, _1); Value rowWarpOffset = mul(multiDimWarpId[0], _16); mmaRowIdx[0] = add(mmaGrpId, rowWarpOffset); mmaRowIdx[1] = add(mmaGrpIdP8, rowWarpOffset); Value colWarpOffset = mul(multiDimWarpId[1], _8); mmaColIdx[0] = add(mmaThreadIdInGrpM2, colWarpOffset); mmaColIdx[1] = add(mmaThreadIdInGrpM2P1, colWarpOffset); } else if (mmaLayout.isVolta()) { // Volta doesn't follow the pattern here." } else { llvm_unreachable("Unexpected MMALayout version"); } assert(rank == 2); SmallVector multiDimOffset(rank); if (mmaLayout.isAmpere()) { multiDimOffset[0] = elemId < 2 ? mmaRowIdx[0] : mmaRowIdx[1]; multiDimOffset[1] = elemId % 2 == 0 ? 
mmaColIdx[0] : mmaColIdx[1]; multiDimOffset[0] = add( multiDimOffset[0], idx_val(multiDimCTAInRepId[0] * shapePerCTA[0])); multiDimOffset[1] = add( multiDimOffset[1], idx_val(multiDimCTAInRepId[1] * shapePerCTA[1])); } else if (mmaLayout.isVolta()) { auto [isARow, isBRow, isAVec4, isBVec4, mmaId] = mmaLayout.decodeVoltaLayoutStates(); auto coords = DotOpMmaV1ConversionHelper::getMNCoords( threadId, rewriter, mmaLayout.getWarpsPerCTA(), shape, isARow, isBRow, isAVec4, isBVec4); return DotOpMmaV1ConversionHelper::getCoord(elemId, coords); } else { llvm_unreachable("Unexpected MMALayout version"); } return multiDimOffset; } llvm_unreachable("unexpected layout in getMultiDimOffset"); } // shared memory rd/st for blocked or mma layout with data padding void processReplica(Location loc, ConversionPatternRewriter &rewriter, bool stNotRd, RankedTensorType type, ArrayRef numCTAsEachRep, ArrayRef multiDimRepId, unsigned vec, ArrayRef paddedRepShape, ArrayRef outOrd, SmallVector &vals, Value smemBase) const { auto accumNumCTAsEachRep = product(numCTAsEachRep); auto layout = type.getEncoding(); auto blockedLayout = layout.dyn_cast(); auto sliceLayout = layout.dyn_cast(); auto mmaLayout = layout.dyn_cast(); auto rank = type.getRank(); auto sizePerThread = getSizePerThread(layout); auto accumSizePerThread = product(sizePerThread); SmallVector numCTAs(rank); auto shapePerCTA = getShapePerCTA(layout, type.getShape()); auto order = getOrder(layout); for (unsigned d = 0; d < rank; ++d) { numCTAs[d] = ceil(type.getShape()[d], shapePerCTA[d]); } auto elemTy = type.getElementType(); bool isInt1 = elemTy.isInteger(1); bool isPtr = elemTy.isa(); auto llvmElemTyOrig = getTypeConverter()->convertType(elemTy); if (isInt1) elemTy = IntegerType::get(elemTy.getContext(), 8); else if (isPtr) elemTy = IntegerType::get(elemTy.getContext(), 64); auto llvmElemTy = getTypeConverter()->convertType(elemTy); for (unsigned ctaId = 0; ctaId < accumNumCTAsEachRep; ++ctaId) { auto multiDimCTAInRepId = getMultiDimIndex(ctaId, numCTAsEachRep, order); SmallVector multiDimCTAId(rank); for (const auto &it : llvm::enumerate(multiDimCTAInRepId)) { auto d = it.index(); multiDimCTAId[d] = multiDimRepId[d] * numCTAsEachRep[d] + it.value(); } auto linearCTAId = getLinearIndex(multiDimCTAId, numCTAs, order); // TODO: This is actually redundant index calculation, we should // consider of caching the index calculation result in case // of performance issue observed. 
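    // For each vec-sized chunk owned by this thread we compute its
    // multi-dimensional offset inside the padded replica, then either pack
    // the chunk into a vector and store it to shared memory (stNotRd) or
    // load it back and scatter the elements into `vals`.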
for (unsigned elemId = 0; elemId < accumSizePerThread; elemId += vec) { SmallVector multiDimOffset = getMultiDimOffset(layout, loc, rewriter, elemId, type.getShape(), multiDimCTAInRepId, shapePerCTA); Value offset = linearize(rewriter, loc, multiDimOffset, paddedRepShape, outOrd); auto elemPtrTy = ptr_ty(llvmElemTy, 3); Value ptr = gep(elemPtrTy, smemBase, offset); auto vecTy = vec_ty(llvmElemTy, vec); ptr = bitcast(ptr, ptr_ty(vecTy, 3)); if (stNotRd) { Value valVec = undef(vecTy); for (unsigned v = 0; v < vec; ++v) { auto currVal = vals[elemId + linearCTAId * accumSizePerThread + v]; if (isInt1) currVal = zext(llvmElemTy, currVal); else if (isPtr) currVal = ptrtoint(llvmElemTy, currVal); valVec = insert_element(vecTy, valVec, currVal, idx_val(v)); } store(valVec, ptr); } else { Value valVec = load(ptr); for (unsigned v = 0; v < vec; ++v) { Value currVal = extract_element(llvmElemTy, valVec, idx_val(v)); if (isInt1) currVal = icmp_ne(currVal, rewriter.create( loc, i8_ty, rewriter.getI8IntegerAttr(0))); else if (isPtr) currVal = inttoptr(llvmElemTyOrig, currVal); vals[elemId + linearCTAId * accumSizePerThread + v] = currVal; } } } } } // The MMAV1's result is quite different from the exising "Replica" structure, // add a new simple but clear implementation for it to avoid modificating the // logic of the exising one. void processReplicaForMMAV1(Location loc, ConversionPatternRewriter &rewriter, bool stNotRd, RankedTensorType type, ArrayRef multiDimRepId, unsigned vec, ArrayRef paddedRepShape, ArrayRef outOrd, SmallVector &vals, Value smemBase, ArrayRef shape, bool isDestMma = false) const { unsigned accumNumCTAsEachRep = 1; auto layout = type.getEncoding(); MmaEncodingAttr mma = layout.dyn_cast(); auto sliceLayout = layout.dyn_cast(); if (sliceLayout) mma = sliceLayout.getParent().cast(); auto order = getOrder(layout); auto rank = type.getRank(); int accumSizePerThread = vals.size(); SmallVector numCTAs(rank, 1); SmallVector numCTAsEachRep(rank, 1); SmallVector shapePerCTA = getShapePerCTA(layout, shape); auto elemTy = type.getElementType(); int ctaId = 0; auto multiDimCTAInRepId = getMultiDimIndex(ctaId, numCTAsEachRep, order); SmallVector multiDimCTAId(rank); for (const auto &it : llvm::enumerate(multiDimCTAInRepId)) { auto d = it.index(); multiDimCTAId[d] = multiDimRepId[d] * numCTAsEachRep[d] + it.value(); } std::vector, Value>> coord2valT( accumSizePerThread); bool needTrans = outOrd[0] != 0; if (sliceLayout || isDestMma) needTrans = false; vec = needTrans ? 2 : 1; { // We need to transpose the coordinates and values here to enable vec=2 // when store to smem. std::vector, Value>> coord2val( accumSizePerThread); for (unsigned elemId = 0; elemId < accumSizePerThread; ++elemId) { // TODO[Superjomn]: Move the coordinate computation out of loop, it is // duplicate in Volta. 
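      // Pairing every value with its (m, n) coordinate up front is what
      // enables the optional transpose below: after transposing, consecutive
      // entries of coord2valT land in contiguous shared-memory locations and
      // can therefore be written with vec == 2.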
SmallVector multiDimOffset = getMultiDimOffset(layout, loc, rewriter, elemId, type.getShape(), multiDimCTAInRepId, shapePerCTA); coord2val[elemId] = std::make_pair(multiDimOffset, vals[elemId]); } if (needTrans) { auto [isARow, isBRow, isAVec4, isBVec4, mmaId] = mma.decodeVoltaLayoutStates(); DotOpMmaV1ConversionHelper helper(mma); // do transpose int numM = helper.getElemsM(mma.getWarpsPerCTA()[0], shape[0], isARow, isAVec4); int numN = accumSizePerThread / numM; for (int r = 0; r < numM; r++) { for (int c = 0; c < numN; c++) { coord2valT[r * numN + c] = std::move(coord2val[c * numM + r]); } } } else { coord2valT = std::move(coord2val); } } // Now the coord2valT has the transposed and contiguous elements(with // vec=2), the original vals is not needed. for (unsigned elemId = 0; elemId < accumSizePerThread; elemId += vec) { auto coord = coord2valT[elemId].first; Value offset = linearize(rewriter, loc, coord, paddedRepShape, outOrd); auto elemPtrTy = ptr_ty(elemTy, 3); Value ptr = gep(elemPtrTy, smemBase, offset); auto vecTy = vec_ty(elemTy, vec); ptr = bitcast(ptr, ptr_ty(vecTy, 3)); if (stNotRd) { Value valVec = undef(vecTy); for (unsigned v = 0; v < vec; ++v) { auto currVal = coord2valT[elemId + v].second; valVec = insert_element(vecTy, valVec, currVal, idx_val(v)); } store(valVec, ptr); } else { Value valVec = load(ptr); for (unsigned v = 0; v < vec; ++v) { Value currVal = extract_element(elemTy, valVec, idx_val(v)); vals[elemId + v] = currVal; } } } } // blocked/mma -> blocked/mma. // Data padding in shared memory to avoid bank conflict. LogicalResult lowerDistributedToDistributed(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto loc = op.getLoc(); Value src = op.src(); Value dst = op.result(); auto srcTy = src.getType().cast(); auto dstTy = dst.getType().cast(); Attribute srcLayout = srcTy.getEncoding(); Attribute dstLayout = dstTy.getEncoding(); auto llvmElemTy = getTypeConverter()->convertType(dstTy.getElementType()); Value smemBase = getSharedMemoryBase(loc, rewriter, op.getOperation()); auto elemPtrTy = ptr_ty(llvmElemTy, 3); smemBase = bitcast(smemBase, elemPtrTy); auto shape = dstTy.getShape(); unsigned rank = dstTy.getRank(); SmallVector numReplicates(rank); SmallVector inNumCTAsEachRep(rank); SmallVector outNumCTAsEachRep(rank); SmallVector inNumCTAs(rank); SmallVector outNumCTAs(rank); auto srcShapePerCTA = getShapePerCTA(srcLayout, srcTy.getShape()); auto dstShapePerCTA = getShapePerCTA(dstLayout, shape); // For Volta, all the coords for a CTA are calculated. 
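    // The flags below detect MMA v1 (Volta) layouts, including slice layouts
    // whose parent is MMA v1; those take the processReplicaForMMAV1 path
    // further down because their per-thread element order does not match the
    // generic replica scheme.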
bool isSrcMmaV1{}, isDstMmaV1{}; if (auto mmaLayout = srcLayout.dyn_cast()) { isSrcMmaV1 = mmaLayout.isVolta(); } if (auto sliceLayout = srcLayout.dyn_cast()) { isSrcMmaV1 = sliceLayout.getParent().isa() && sliceLayout.getParent().cast().isVolta(); } if (auto mmaLayout = dstLayout.dyn_cast()) { isDstMmaV1 = mmaLayout.isVolta(); } if (auto sliceLayout = dstLayout.dyn_cast()) { isDstMmaV1 = sliceLayout.getParent().isa() && sliceLayout.getParent().cast().isVolta(); } for (unsigned d = 0; d < rank; ++d) { unsigned inPerCTA = std::min(shape[d], srcShapePerCTA[d]); unsigned outPerCTA = std::min(shape[d], dstShapePerCTA[d]); unsigned maxPerCTA = std::max(inPerCTA, outPerCTA); numReplicates[d] = ceil(shape[d], maxPerCTA); inNumCTAsEachRep[d] = maxPerCTA / inPerCTA; outNumCTAsEachRep[d] = maxPerCTA / outPerCTA; assert(maxPerCTA % inPerCTA == 0 && maxPerCTA % outPerCTA == 0); inNumCTAs[d] = ceil(shape[d], inPerCTA); outNumCTAs[d] = ceil(shape[d], outPerCTA); } // Potentially we need to store for multiple CTAs in this replication auto accumNumReplicates = product(numReplicates); // unsigned elems = getElemsPerThread(srcTy); auto vals = getElementsFromStruct(loc, adaptor.src(), rewriter); unsigned inVec = 0; unsigned outVec = 0; auto paddedRepShape = getScratchConfigForCvtLayout(op, inVec, outVec); unsigned outElems = getElemsPerThread(dstTy); auto outOrd = getOrder(dstLayout); SmallVector outVals(outElems); for (unsigned repId = 0; repId < accumNumReplicates; ++repId) { auto multiDimRepId = getMultiDimIndex(repId, numReplicates, outOrd); if (repId != 0) barrier(); if (srcLayout.isa() || srcLayout.isa() || srcLayout.isa()) { if (isSrcMmaV1) processReplicaForMMAV1(loc, rewriter, /*stNotRd*/ true, srcTy, multiDimRepId, inVec, paddedRepShape, outOrd, vals, smemBase, shape); else processReplica(loc, rewriter, /*stNotRd*/ true, srcTy, inNumCTAsEachRep, multiDimRepId, inVec, paddedRepShape, outOrd, vals, smemBase); } else { assert(0 && "ConvertLayout with input layout not implemented"); return failure(); } barrier(); if (dstLayout.isa() || dstLayout.isa() || dstLayout.isa()) { if (isDstMmaV1) processReplicaForMMAV1(loc, rewriter, /*stNotRd*/ false, dstTy, multiDimRepId, outVec, paddedRepShape, outOrd, outVals, smemBase, shape, /*isDestMma=*/true); else processReplica(loc, rewriter, /*stNotRd*/ false, dstTy, outNumCTAsEachRep, multiDimRepId, outVec, paddedRepShape, outOrd, outVals, smemBase); } else { assert(0 && "ConvertLayout with output layout not implemented"); return failure(); } } SmallVector types(outElems, llvmElemTy); auto *ctx = llvmElemTy.getContext(); Type structTy = struct_ty(types); Value result = getStructFromElements(loc, outVals, rewriter, structTy); rewriter.replaceOp(op, result); return success(); } // blocked -> shared. // Swizzling in shared memory to avoid bank conflict. Normally used for // A/B operands of dots. 
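  // This lowering only materializes the shared-memory base pointer, the
  // destination strides and the per-thread source indices; the element-wise
  // (and, presumably, swizzled) stores are delegated to
  // storeDistributedToShared.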
LogicalResult lowerDistributedToShared(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto loc = op.getLoc(); Value src = op.src(); Value dst = op.result(); auto srcTy = src.getType().cast(); auto srcShape = srcTy.getShape(); auto dstTy = dst.getType().cast(); auto dstShape = dstTy.getShape(); assert(srcShape.size() == 2 && "Unexpected rank of ConvertLayout(blocked->shared)"); auto srcLayout = srcTy.getEncoding(); auto dstSharedLayout = dstTy.getEncoding().cast(); auto inOrd = getOrder(srcLayout); auto outOrd = dstSharedLayout.getOrder(); Value smemBase = getSharedMemoryBase(loc, rewriter, dst); auto elemTy = getTypeConverter()->convertType(srcTy.getElementType()); auto elemPtrTy = ptr_ty(getTypeConverter()->convertType(elemTy), 3); smemBase = bitcast(smemBase, elemPtrTy); auto dstStrides = getStridesFromShapeAndOrder(dstShape, outOrd, loc, rewriter); auto srcIndices = emitIndices(loc, rewriter, srcLayout, srcShape); storeDistributedToShared(src, adaptor.src(), dstStrides, srcIndices, dst, smemBase, elemTy, loc, rewriter); auto smemObj = SharedMemoryObject(smemBase, dstShape, outOrd, loc, rewriter); auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter); rewriter.replaceOp(op, retVal); return success(); } // shared -> mma_operand LogicalResult lowerSharedToDotOperand(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto loc = op.getLoc(); Value src = op.src(); Value dst = op.result(); auto dstTensorTy = dst.getType().cast(); auto srcTensorTy = src.getType().cast(); auto dotOperandLayout = dstTensorTy.getEncoding().cast(); auto sharedLayout = srcTensorTy.getEncoding().cast(); bool isOuter{}; int K{}; if (dotOperandLayout.getOpIdx() == 0) // $a K = dstTensorTy.getShape()[sharedLayout.getOrder()[0]]; else // $b K = dstTensorTy.getShape()[sharedLayout.getOrder()[1]]; isOuter = K == 1; Value res; if (auto mmaLayout = dotOperandLayout.getParent().dyn_cast_or_null()) { res = lowerSharedToDotOperandMMA(op, adaptor, rewriter, mmaLayout, dotOperandLayout, isOuter); } else if (auto blockedLayout = dotOperandLayout.getParent() .dyn_cast_or_null()) { auto dotOpLayout = dstTensorTy.getEncoding().cast(); DotOpFMAConversionHelper helper(blockedLayout); auto thread = getThreadId(rewriter, loc); if (dotOpLayout.getOpIdx() == 0) { // $a res = helper.loadA(src, adaptor.src(), blockedLayout, thread, loc, rewriter); } else { // $b res = helper.loadB(src, adaptor.src(), blockedLayout, thread, loc, rewriter); } } else { assert(false && "Unsupported dot operand layout found"); } rewriter.replaceOp(op, res); return success(); } // mma -> dot_operand LogicalResult lowerMmaToDotOperand(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto loc = op.getLoc(); auto srcTy = op.src().getType().cast(); auto dstTy = op.result().getType().cast(); auto srcLayout = srcTy.getEncoding(); auto dstLayout = dstTy.getEncoding(); auto srcMmaLayout = srcLayout.cast(); auto dstDotLayout = dstLayout.cast(); if (isMmaToDotShortcut(srcMmaLayout, dstDotLayout)) { // get source values auto vals = getElementsFromStruct(loc, adaptor.src(), rewriter); unsigned elems = getElemsPerThread(srcTy); Type elemTy = this->getTypeConverter()->convertType(srcTy.getElementType()); // for the destination type, we need to pack values together // so they can be consumed by tensor core operations unsigned vecSize = std::max(32 / elemTy.getIntOrFloatBitWidth(), 1); Type vecTy = vec_ty(elemTy, vecSize); 
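      // e.g. for f16 elements vecSize == 2, so each pair of accumulator
      // values is packed into one <2 x half> vector, matching the 32-bit
      // register granularity the tensor-core dot operands use.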
SmallVector types(elems / vecSize, vecTy); SmallVector vecVals; for (unsigned i = 0; i < elems; i += vecSize) { Value packed = rewriter.create(loc, vecTy); for (unsigned j = 0; j < vecSize; j++) packed = insert_element(vecTy, packed, vals[i + j], i32_val(j)); vecVals.push_back(packed); } // This needs to be ordered the same way that // ldmatrix.x4 would order it // TODO: this needs to be refactor so we don't // implicitly depends on how emitOffsetsForMMAV2 // is implemented SmallVector reorderedVals; for (unsigned i = 0; i < vecVals.size(); i += 4) { reorderedVals.push_back(vecVals[i]); reorderedVals.push_back(vecVals[i + 2]); reorderedVals.push_back(vecVals[i + 1]); reorderedVals.push_back(vecVals[i + 3]); } // return composeValuesToDotOperandLayoutStruct(ha, numRepM, numRepK); Type structTy = LLVM::LLVMStructType::getLiteral(this->getContext(), types); Value view = getStructFromElements(loc, reorderedVals, rewriter, structTy); rewriter.replaceOp(op, view); return success(); } return failure(); } // shared -> dot_operand if the result layout is mma Value lowerSharedToDotOperandMMA( triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, const MmaEncodingAttr &mmaLayout, const DotOperandEncodingAttr &dotOperandLayout, bool isOuter) const { auto loc = op.getLoc(); Value src = op.src(); Value dst = op.result(); bool isHMMA = supportMMA(dst, mmaLayout.getVersionMajor()); auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.src(), rewriter); Value res; if (!isOuter && mmaLayout.isAmpere() && isHMMA) { // tensor core v2 MMA16816ConversionHelper mmaHelper(src.getType(), mmaLayout, getThreadId(rewriter, loc), rewriter, getTypeConverter(), op.getLoc()); if (dotOperandLayout.getOpIdx() == 0) { // operand $a res = mmaHelper.loadA(src, smemObj); } else if (dotOperandLayout.getOpIdx() == 1) { // operand $b res = mmaHelper.loadB(src, smemObj); } } else if (!isOuter && mmaLayout.isVolta() && isHMMA) { // tensor core v1 DotOpMmaV1ConversionHelper helper(mmaLayout); bool isMMAv1Row = dotOperandLayout.getIsMMAv1Row().cast().getValue(); auto srcSharedLayout = src.getType() .cast() .getEncoding() .cast(); // Can only convert [1, 0] to row or [0, 1] to col for now if ((srcSharedLayout.getOrder()[0] == 1 && !isMMAv1Row) || (srcSharedLayout.getOrder()[0] == 0 && isMMAv1Row)) { llvm::errs() << "Unsupported Shared -> DotOperand[MMAv1] conversion\n"; return Value(); } if (dotOperandLayout.getOpIdx() == 0) { // operand $a // TODO[Superjomn]: transA is not available here. bool transA = false; res = helper.loadA(src, smemObj, getThreadId(rewriter, loc), loc, rewriter); } else if (dotOperandLayout.getOpIdx() == 1) { // operand $b // TODO[Superjomn]: transB is not available here. 
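      // As for $a above, the transpose flag is not plumbed through
      // ConvertLayoutOp, so the MMA v1 loader currently assumes a
      // non-transposed operand.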
bool transB = false; res = helper.loadB(src, smemObj, getThreadId(rewriter, loc), loc, rewriter); } } else { assert(false && "Unsupported mma layout found"); } return res; } }; void populateConvertLayoutOpToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit) { patterns.add(typeConverter, allocation, smem, indexCacheInfo, benefit); } triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.h000066400000000000000000000011231440023377100250360ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_CONVERT_LAYOUT_OP_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_CONVERT_LAYOUT_OP_H #include "TritonGPUToLLVMBase.h" using namespace mlir; using namespace mlir::triton; using ::mlir::triton::gpu::DotOperandEncodingAttr; void populateConvertLayoutOpToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit); #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/DotOpHelpers.cpp000066400000000000000000001454141440023377100236420ustar00rootroot00000000000000#include "DotOpHelpers.h" namespace mlir { namespace LLVM { int DotOpMmaV1ConversionHelper::numElemsPerThreadA(ArrayRef shape, bool isARow, bool isAVec4, int vec) const { int numM = getNumM(shape[0], isARow, isAVec4); int NK = shape[1]; // Here we mimic the logic in loadA, the result cannot be calculated // directly. llvm::DenseSet> visited; auto ld = [&](int m, int k) { visited.insert({m, k}); if (vec > 4) { if (isARow) visited.insert({m, k + 4}); else visited.insert({m + 1, k}); } }; for (unsigned k = 0; k < NK; k += 4) for (unsigned m = 0; m < numM / 2; ++m) if (!visited.count({m, k})) ld(m, k); return visited.size() * 2; } int DotOpMmaV1ConversionHelper::numElemsPerThreadB(ArrayRef shape, bool isBRow, bool isBVec4, int vec) const { unsigned numN = getNumN(shape[1], isBRow, isBVec4); int NK = shape[0]; // Here we mimic the logic in loadA, the result cannot be calculated // directly. llvm::DenseSet> visited; int elemsPerLd = vec > 4 ? 4 : 2; auto ld = [&](int n, int k) { visited.insert({n, k}); if (vec > 4) { if (isBRow) visited.insert({n + 1, k}); else visited.insert({n, k + 4}); } }; for (unsigned k = 0; k < NK; k += 4) for (unsigned n = 0; n < numN / 2; ++n) { if (!visited.count({n, k})) ld(n, k); } return visited.size() * 2; } Value DotOpMmaV1ConversionHelper::loadA( Value tensor, const SharedMemoryObject &smemObj, Value thread, Location loc, ConversionPatternRewriter &rewriter) const { auto *ctx = rewriter.getContext(); auto tensorTy = tensor.getType().cast(); auto sharedLayout = tensorTy.getEncoding().cast(); auto shape = tensorTy.getShape(); auto order = sharedLayout.getOrder(); Value cSwizzleOffset = smemObj.getCSwizzleOffset(order[0]); Value smemBase = smemObj.getBaseBeforeSwizzle(order[0], loc, rewriter); bool isARow = order[0] != 0; auto [isARow_, _0, isAVec4, _1, _2] = mmaLayout.decodeVoltaLayoutStates(); AParam param(isARow_, isAVec4); auto [offsetAM, offsetAK, _3, _4] = computeOffsets( thread, isARow, false, fpw, param.spw, param.rep, rewriter, loc); int vecA = sharedLayout.getVec(); auto strides = smemObj.strides; Value strideAM = isARow ? strides[0] : i32_val(1); Value strideAK = isARow ? 
i32_val(1) : strides[1]; Value strideA0 = isARow ? strideAK : strideAM; Value strideA1 = isARow ? strideAM : strideAK; int strideRepM = wpt[0] * fpw[0] * 8; int strideRepK = 1; // swizzling int perPhaseA = sharedLayout.getPerPhase(); int maxPhaseA = sharedLayout.getMaxPhase(); int stepA0 = isARow ? strideRepK : strideRepM; int numPtrA = std::max(2 * perPhaseA * maxPhaseA / stepA0, 1); int NK = shape[1]; // pre-compute pointer lanes Value offA0 = isARow ? offsetAK : offsetAM; Value offA1 = isARow ? offsetAM : offsetAK; Value phaseA = urem(udiv(offA1, i32_val(perPhaseA)), i32_val(maxPhaseA)); offA0 = add(offA0, cSwizzleOffset); SmallVector offA(numPtrA); for (int i = 0; i < numPtrA; i++) { Value offA0I = add(offA0, i32_val(i * (isARow ? 4 : strideRepM))); offA0I = udiv(offA0I, i32_val(vecA)); offA0I = xor_(offA0I, phaseA); offA0I = mul(offA0I, i32_val(vecA)); offA[i] = add(mul(offA0I, strideA0), mul(offA1, strideA1)); } Type elemX2Ty = vec_ty(f16_ty, 2); Type elemPtrTy = ptr_ty(f16_ty); if (tensorTy.getElementType().isBF16()) { elemX2Ty = vec_ty(i16_ty, 2); elemPtrTy = ptr_ty(i16_ty); } // prepare arguments SmallVector ptrA(numPtrA); std::map, std::pair> has; for (int i = 0; i < numPtrA; i++) ptrA[i] = gep(ptr_ty(f16_ty), smemBase, offA[i]); auto ld = [&](decltype(has) &vals, int m, int k, Value val0, Value val1) { vals[{m, k}] = {val0, val1}; }; auto loadA = [&](int m, int k) { int offidx = (isARow ? k / 4 : m) % numPtrA; Value thePtrA = gep(elemPtrTy, smemBase, offA[offidx]); int stepAM = isARow ? m : m / numPtrA * numPtrA; int stepAK = isARow ? k / (numPtrA * vecA) * (numPtrA * vecA) : k; Value offset = add(mul(i32_val(stepAM * strideRepM), strideAM), mul(i32_val(stepAK), strideAK)); Value pa = gep(elemPtrTy, thePtrA, offset); Type aPtrTy = ptr_ty(vec_ty(i32_ty, std::max(vecA / 2, 1)), 3); Value ha = load(bitcast(pa, aPtrTy)); // record lds that needs to be moved Value ha00 = bitcast(extract_element(ha, i32_val(0)), elemX2Ty); Value ha01 = bitcast(extract_element(ha, i32_val(1)), elemX2Ty); ld(has, m, k, ha00, ha01); if (vecA > 4) { Value ha10 = bitcast(extract_element(ha, i32_val(2)), elemX2Ty); Value ha11 = bitcast(extract_element(ha, i32_val(3)), elemX2Ty); if (isARow) ld(has, m, k + 4, ha10, ha11); else ld(has, m + 1, k, ha10, ha11); } }; unsigned numM = getNumM(shape[0], isARow, isAVec4); for (unsigned k = 0; k < NK; k += 4) for (unsigned m = 0; m < numM / 2; ++m) if (!has.count({m, k})) loadA(m, k); SmallVector elems; elems.reserve(has.size() * 2); for (auto item : has) { // has is a map, the key should be ordered. 
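    // Each (m, k) entry contributes two packed half2 values, so the result
    // struct ends up with 2 * has.size() elements -- the same count that
    // numElemsPerThreadA() derives by replaying this load pattern.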
elems.push_back(item.second.first); elems.push_back(item.second.second); } Type resTy = struct_ty(SmallVector(elems.size(), elemX2Ty)); Value res = getStructFromElements(loc, elems, rewriter, resTy); return res; } Value DotOpMmaV1ConversionHelper::loadB( Value tensor, const SharedMemoryObject &smemObj, Value thread, Location loc, ConversionPatternRewriter &rewriter) const { // smem auto strides = smemObj.strides; auto *ctx = rewriter.getContext(); auto tensorTy = tensor.getType().cast(); auto sharedLayout = tensorTy.getEncoding().cast(); auto shape = tensorTy.getShape(); auto order = sharedLayout.getOrder(); Value smem = smemObj.getBaseBeforeSwizzle(order[0], loc, rewriter); bool isBRow = order[0] != 0; // is row-major in shared memory layout // isBRow_ indicates whether B is row-major in DotOperand layout auto [_0, isBRow_, _1, isBVec4, _2] = mmaLayout.decodeVoltaLayoutStates(); assert(isBRow == isBRow_ && "B need smem isRow"); BParam param(isBRow_, isBVec4); int vecB = sharedLayout.getVec(); Value strideBN = isBRow ? i32_val(1) : strides[1]; Value strideBK = isBRow ? strides[0] : i32_val(1); Value strideB0 = isBRow ? strideBN : strideBK; Value strideB1 = isBRow ? strideBK : strideBN; int strideRepN = wpt[1] * fpw[1] * 8; int strideRepK = 1; auto [_3, _4, offsetBN, offsetBK] = computeOffsets( thread, false, isBRow, fpw, param.spw, param.rep, rewriter, loc); // swizzling int perPhaseB = sharedLayout.getPerPhase(); int maxPhaseB = sharedLayout.getMaxPhase(); int stepB0 = isBRow ? strideRepN : strideRepK; int numPtrB = std::max(2 * perPhaseB * maxPhaseB / stepB0, 1); int NK = shape[0]; Value offB0 = isBRow ? offsetBN : offsetBK; Value offB1 = isBRow ? offsetBK : offsetBN; Value phaseB = urem(udiv(offB1, i32_val(perPhaseB)), i32_val(maxPhaseB)); Value cSwizzleOffset = smemObj.getCSwizzleOffset(order[0]); offB0 = add(offB0, cSwizzleOffset); SmallVector offB(numPtrB); for (int i = 0; i < numPtrB; ++i) { Value offB0I = add(offB0, i32_val(i * (isBRow ? strideRepN : 4))); offB0I = udiv(offB0I, i32_val(vecB)); offB0I = xor_(offB0I, phaseB); offB0I = mul(offB0I, i32_val(vecB)); offB[i] = add(mul(offB0I, strideB0), mul(offB1, strideB1)); } Type elemPtrTy = ptr_ty(f16_ty); Type elemX2Ty = vec_ty(f16_ty, 2); if (tensorTy.getElementType().isBF16()) { elemPtrTy = ptr_ty(i16_ty); elemX2Ty = vec_ty(i16_ty, 2); } SmallVector ptrB(numPtrB); ValueTable hbs; for (int i = 0; i < numPtrB; ++i) ptrB[i] = gep(ptr_ty(f16_ty), smem, offB[i]); auto ld = [&](decltype(hbs) &vals, int m, int k, Value val0, Value val1) { vals[{m, k}] = {val0, val1}; }; auto loadB = [&](int n, int K) { int offidx = (isBRow ? n : K / 4) % numPtrB; Value thePtrB = ptrB[offidx]; int stepBN = isBRow ? n / numPtrB * numPtrB : n; int stepBK = isBRow ? 
K : K / (numPtrB * vecB) * (numPtrB * vecB); Value offset = add(mul(i32_val(stepBN * strideRepN), strideBN), mul(i32_val(stepBK), strideBK)); Value pb = gep(elemPtrTy, thePtrB, offset); Value hb = load(bitcast(pb, ptr_ty(vec_ty(i32_ty, std::max(vecB / 2, 1)), 3))); // record lds that needs to be moved Value hb00 = bitcast(extract_element(hb, i32_val(0)), elemX2Ty); Value hb01 = bitcast(extract_element(hb, i32_val(1)), elemX2Ty); ld(hbs, n, K, hb00, hb01); if (vecB > 4) { Value hb10 = bitcast(extract_element(hb, i32_val(2)), elemX2Ty); Value hb11 = bitcast(extract_element(hb, i32_val(3)), elemX2Ty); if (isBRow) ld(hbs, n + 1, K, hb10, hb11); else ld(hbs, n, K + 4, hb10, hb11); } }; unsigned numN = getNumN(shape[1], isBRow, isBVec4); for (unsigned k = 0; k < NK; k += 4) for (unsigned n = 0; n < numN / 2; ++n) { if (!hbs.count({n, k})) loadB(n, k); } SmallVector elems; for (auto &item : hbs) { // has is a map, the key should be ordered. elems.push_back(item.second.first); elems.push_back(item.second.second); } Type resTy = struct_ty(SmallVector(elems.size(), elemX2Ty)); Value res = getStructFromElements(loc, elems, rewriter, resTy); return res; } std::tuple DotOpMmaV1ConversionHelper::computeOffsets(Value threadId, bool isARow, bool isBRow, ArrayRef fpw, ArrayRef spw, ArrayRef rep, ConversionPatternRewriter &rewriter, Location loc) const { auto *ctx = rewriter.getContext(); Value _1 = i32_val(1); Value _3 = i32_val(3); Value _4 = i32_val(4); Value _16 = i32_val(16); Value _32 = i32_val(32); Value lane = urem(threadId, _32); Value warp = udiv(threadId, _32); // warp offset Value warp0 = urem(warp, i32_val(wpt[0])); Value warp12 = udiv(warp, i32_val(wpt[0])); Value warp1 = urem(warp12, i32_val(wpt[1])); Value warpMOff = mul(warp0, i32_val(spw[0])); Value warpNOff = mul(warp1, i32_val(spw[1])); // Quad offset Value quadMOff = mul(udiv(and_(lane, _16), _4), i32_val(fpw[0])); Value quadNOff = mul(udiv(and_(lane, _16), _4), i32_val(fpw[1])); // Pair offset Value pairMOff = udiv(urem(lane, _16), _4); pairMOff = urem(pairMOff, i32_val(fpw[0])); pairMOff = mul(pairMOff, _4); Value pairNOff = udiv(urem(lane, _16), _4); pairNOff = udiv(pairNOff, i32_val(fpw[0])); pairNOff = urem(pairNOff, i32_val(fpw[1])); pairNOff = mul(pairNOff, _4); // scale pairMOff = mul(pairMOff, i32_val(rep[0] / 2)); quadMOff = mul(quadMOff, i32_val(rep[0] / 2)); pairNOff = mul(pairNOff, i32_val(rep[1] / 2)); quadNOff = mul(quadNOff, i32_val(rep[1] / 2)); // Quad pair offset Value laneMOff = add(pairMOff, quadMOff); Value laneNOff = add(pairNOff, quadNOff); // A offset Value offsetAM = add(warpMOff, laneMOff); Value offsetAK = and_(lane, _3); // B offset Value offsetBN = add(warpNOff, laneNOff); Value offsetBK = and_(lane, _3); // i indices Value offsetCM = add(and_(lane, _1), offsetAM); if (isARow) { offsetAM = add(offsetAM, urem(threadId, _4)); offsetAK = i32_val(0); } if (!isBRow) { offsetBN = add(offsetBN, urem(threadId, _4)); offsetBK = i32_val(0); } return std::make_tuple(offsetAM, offsetAK, offsetBN, offsetBK); } DotOpMmaV1ConversionHelper::ValueTable DotOpMmaV1ConversionHelper::extractLoadedOperand( Value llStruct, int NK, ConversionPatternRewriter &rewriter) const { ValueTable rcds; SmallVector elems = getElementsFromStruct(llStruct.getLoc(), llStruct, rewriter); int offset = 0; for (int i = 0; offset < elems.size(); ++i) { for (int k = 0; k < NK; k += 4) { rcds[{i, k}] = std::make_pair(elems[offset], elems[offset + 1]); offset += 2; } } return rcds; } SmallVector DotOpMmaV1ConversionHelper::getMNCoords(Value thread, 
ConversionPatternRewriter &rewriter, ArrayRef wpt, ArrayRef shape, bool isARow, bool isBRow, bool isAVec4, bool isBVec4) { auto *ctx = thread.getContext(); auto loc = UnknownLoc::get(ctx); Value _1 = i32_val(1); Value _2 = i32_val(2); Value _4 = i32_val(4); Value _16 = i32_val(16); Value _32 = i32_val(32); Value _fpw0 = i32_val(fpw[0]); Value _fpw1 = i32_val(fpw[1]); DotOpMmaV1ConversionHelper::AParam aParam(isARow, isAVec4); DotOpMmaV1ConversionHelper::BParam bParam(isBRow, isBVec4); SmallVector rep({aParam.rep[0], bParam.rep[1]}); SmallVector spw({aParam.spw[0], bParam.spw[1]}); SmallVector shapePerCTA({spw[0] * wpt[0], spw[1] * wpt[1]}); Value lane = urem(thread, _32); Value warp = udiv(thread, _32); Value warp0 = urem(warp, i32_val(wpt[0])); Value warp12 = udiv(warp, i32_val(wpt[0])); Value warp1 = urem(warp12, i32_val(wpt[1])); // warp offset Value offWarpM = mul(warp0, i32_val(spw[0])); Value offWarpN = mul(warp1, i32_val(spw[1])); // quad offset Value offQuadM = mul(udiv(and_(lane, _16), _4), _fpw0); Value offQuadN = mul(udiv(and_(lane, _16), _4), _fpw1); // pair offset Value offPairM = udiv(urem(lane, _16), _4); offPairM = urem(offPairM, _fpw0); offPairM = mul(offPairM, _4); Value offPairN = udiv(urem(lane, _16), _4); offPairN = udiv(offPairN, _fpw0); offPairN = urem(offPairN, _fpw1); offPairN = mul(offPairN, _4); // sclare offPairM = mul(offPairM, i32_val(rep[0] / 2)); offQuadM = mul(offQuadM, i32_val(rep[0] / 2)); offPairN = mul(offPairN, i32_val(rep[1] / 2)); offQuadN = mul(offQuadN, i32_val(rep[1] / 2)); // quad pair offset Value offLaneM = add(offPairM, offQuadM); Value offLaneN = add(offPairN, offQuadN); // a, b offset Value offsetAM = add(offWarpM, offLaneM); Value offsetBN = add(offWarpN, offLaneN); // m indices Value offsetCM = add(and_(lane, _1), offsetAM); SmallVector idxM; for (unsigned m = 0; m < shape[0]; m += shapePerCTA[0]) for (unsigned mm = 0; mm < rep[0]; ++mm) idxM.push_back(add(offsetCM, i32_val(m + mm * 2))); // n indices Value offsetCN = add((and_(lane, _2)), (add(offWarpN, offPairN))); SmallVector idxN; for (int n = 0; n < shape[1]; n += shapePerCTA[1]) { for (int nn = 0; nn < rep[1]; ++nn) { idxN.push_back(add( offsetCN, i32_val(n + nn / 2 * 4 + (nn % 2) * 2 * fpw[1] * rep[1]))); idxN.push_back( add(offsetCN, i32_val(n + nn / 2 * 4 + (nn % 2) * 2 * fpw[1] * rep[1] + 1))); } } SmallVector> axes({idxM, idxN}); // product the axis M and axis N to get coords, ported from // generator::init_idx method from triton2.0 // TODO[Superjomn]: check the order. SmallVector coords; for (Value x1 : axes[1]) { // N for (Value x0 : axes[0]) { // M SmallVector idx(2); idx[0] = x0; // M idx[1] = x1; // N coords.push_back(std::move(idx)); } } return coords; // {M,N} in row-major } void DotOpMmaV1ConversionHelper::AParam::build(bool isARow) { int packSize0 = (isARow || isAVec4) ? 1 : 2; int repM = 2 * packSize0; int repK = 1; int spwM = fpw[0] * 4 * repM; rep.assign({repM, 0, repK}); spw.assign({spwM, 0, 1}); vec = 2 * rep[0]; } void DotOpMmaV1ConversionHelper::BParam::build(bool isBRow) { int packSize1 = (isBRow && !isBVec4) ? 
2 : 1; rep.assign({0, 2 * packSize1, 1}); spw.assign({0, fpw[1] * 4 * rep[1], 1}); vec = 2 * rep[1]; } std::tuple DotOpMmaV2ConversionHelper::getRepMN(const RankedTensorType &tensorTy) { auto mmaLayout = tensorTy.getEncoding().cast(); auto wpt = mmaLayout.getWarpsPerCTA(); int M = tensorTy.getShape()[0]; int N = tensorTy.getShape()[1]; auto [instrM, instrN] = getInstrShapeMN(); int repM = std::max(M / (wpt[0] * instrM), 1); int repN = std::max(N / (wpt[1] * instrN), 1); return {repM, repN}; } Type DotOpMmaV2ConversionHelper::getShemPtrTy() const { switch (mmaType) { case TensorCoreType::FP32_FP16_FP16_FP32: return ptr_ty(type::f16Ty(ctx), 3); case TensorCoreType::FP32_BF16_BF16_FP32: return ptr_ty(type::i16Ty(ctx), 3); case TensorCoreType::FP32_TF32_TF32_FP32: return ptr_ty(type::f32Ty(ctx), 3); case TensorCoreType::INT32_INT8_INT8_INT32: return ptr_ty(type::i8Ty(ctx), 3); default: llvm::report_fatal_error("mma16816 data type not supported"); } return Type{}; } Type DotOpMmaV2ConversionHelper::getMatType() const { // floating point types Type fp32x1Ty = vec_ty(type::f32Ty(ctx), 1); Type fp16x2Ty = vec_ty(type::f16Ty(ctx), 2); Type i16x2Ty = vec_ty(type::i16Ty(ctx), 2); Type fp16x2Pack4Ty = LLVM::LLVMStructType::getLiteral(ctx, SmallVector(4, fp16x2Ty)); // LLVM 14.0 does not support bf16 type, so we use i16 instead. Type bf16x2Pack4Ty = LLVM::LLVMStructType::getLiteral(ctx, SmallVector(4, i16x2Ty)); Type fp32Pack4Ty = LLVM::LLVMStructType::getLiteral(ctx, SmallVector(4, fp32x1Ty)); // integer types Type i8x4Ty = vec_ty(type::i8Ty(ctx), 4); Type i8x4Pack4Ty = LLVM::LLVMStructType::getLiteral(ctx, SmallVector(4, i8x4Ty)); switch (mmaType) { case TensorCoreType::FP32_FP16_FP16_FP32: return fp16x2Pack4Ty; case TensorCoreType::FP32_BF16_BF16_FP32: return bf16x2Pack4Ty; case TensorCoreType::FP32_TF32_TF32_FP32: return fp32Pack4Ty; case TensorCoreType::INT32_INT8_INT8_INT32: return i8x4Pack4Ty; default: llvm::report_fatal_error("Unsupported mma type found"); } return Type{}; } Type DotOpMmaV2ConversionHelper::getLoadElemTy() { switch (mmaType) { case TensorCoreType::FP32_FP16_FP16_FP32: return vec_ty(type::f16Ty(ctx), 2); case TensorCoreType::FP32_BF16_BF16_FP32: return vec_ty(type::bf16Ty(ctx), 2); case TensorCoreType::FP32_TF32_TF32_FP32: return type::f32Ty(ctx); case TensorCoreType::INT32_INT8_INT8_INT32: return type::i32Ty(ctx); default: llvm::report_fatal_error("Unsupported mma type found"); } return Type{}; } Type DotOpMmaV2ConversionHelper::getMmaRetType() const { Type fp32Ty = type::f32Ty(ctx); Type i32Ty = type::i32Ty(ctx); Type fp32x4Ty = LLVM::LLVMStructType::getLiteral(ctx, SmallVector(4, fp32Ty)); Type i32x4Ty = LLVM::LLVMStructType::getLiteral(ctx, SmallVector(4, i32Ty)); switch (mmaType) { case TensorCoreType::FP32_FP16_FP16_FP32: return fp32x4Ty; case TensorCoreType::FP32_BF16_BF16_FP32: return fp32x4Ty; case TensorCoreType::FP32_TF32_TF32_FP32: return fp32x4Ty; case TensorCoreType::INT32_INT8_INT8_INT32: return i32x4Ty; default: llvm::report_fatal_error("Unsupported mma type found"); } return Type{}; } DotOpMmaV2ConversionHelper::TensorCoreType DotOpMmaV2ConversionHelper::getTensorCoreTypeFromOperand(Type operandTy) { auto tensorTy = operandTy.cast(); auto elemTy = tensorTy.getElementType(); if (elemTy.isF16()) return TensorCoreType::FP32_FP16_FP16_FP32; if (elemTy.isF32()) return TensorCoreType::FP32_TF32_TF32_FP32; if (elemTy.isBF16()) return TensorCoreType::FP32_BF16_BF16_FP32; if (elemTy.isInteger(8)) return TensorCoreType::INT32_INT8_INT8_INT32; return 
TensorCoreType::NOT_APPLICABLE; } DotOpMmaV2ConversionHelper::TensorCoreType DotOpMmaV2ConversionHelper::getMmaType(triton::DotOp op) { Value A = op.a(); Value B = op.b(); auto aTy = A.getType().cast(); auto bTy = B.getType().cast(); // d = a*b + c auto dTy = op.d().getType().cast(); if (dTy.getElementType().isF32()) { if (aTy.getElementType().isF16() && bTy.getElementType().isF16()) return TensorCoreType::FP32_FP16_FP16_FP32; if (aTy.getElementType().isBF16() && bTy.getElementType().isBF16()) return TensorCoreType::FP32_BF16_BF16_FP32; if (aTy.getElementType().isF32() && bTy.getElementType().isF32() && op.allowTF32()) return TensorCoreType::FP32_TF32_TF32_FP32; } else if (dTy.getElementType().isInteger(32)) { if (aTy.getElementType().isInteger(8) && bTy.getElementType().isInteger(8)) return TensorCoreType::INT32_INT8_INT8_INT32; } return TensorCoreType::NOT_APPLICABLE; } SmallVector MMA16816SmemLoader::computeLdmatrixMatOffs(Value warpId, Value lane, Value cSwizzleOffset) { // 4x4 matrices Value c = urem(lane, i32_val(8)); Value s = udiv(lane, i32_val(8)); // sub-warp-id // Decompose s => s_0, s_1, that is the coordinate in 2x2 matrices in a // warp Value s0 = urem(s, i32_val(2)); Value s1 = udiv(s, i32_val(2)); // We use different orders for a and b for better performance. Value kMatArr = kOrder == 1 ? s1 : s0; Value nkMatArr = kOrder == 1 ? s0 : s1; // matrix coordinate inside a CTA, the matrix layout is [2x2wpt] for A and // [2wptx2] for B. e.g. Setting wpt=3, The data layout for A(kOrder=1) is // |0 0 1 1 2 2| -> 0,1,2 are the warpids // |0 0 1 1 2 2| // // for B(kOrder=0) is // |0 0| -> 0,1,2 are the warpids // |1 1| // |2 2| // |0 0| // |1 1| // |2 2| // Note, for each warp, it handles a 2x2 matrices, that is the coordinate // address (s0,s1) annotates. Value matOff[2]; matOff[kOrder ^ 1] = add(mul(warpId, i32_val(warpOffStride)), // warp offset mul(nkMatArr, i32_val(matArrStride))); // matrix offset inside a warp matOff[kOrder] = kMatArr; // Physical offset (before swizzling) Value cMatOff = matOff[order[0]]; Value sMatOff = matOff[order[1]]; Value cSwizzleMatOff = udiv(cSwizzleOffset, i32_val(cMatShape)); cMatOff = add(cMatOff, cSwizzleMatOff); // row offset inside a matrix, each matrix has 8 rows. Value sOffInMat = c; SmallVector offs(numPtrs); Value phase = urem(udiv(sOffInMat, i32_val(perPhase)), i32_val(maxPhase)); Value sOff = add(sOffInMat, mul(sMatOff, i32_val(sMatShape))); for (int i = 0; i < numPtrs; ++i) { Value cMatOffI = add(cMatOff, i32_val(i * pLoadStrideInMat)); cMatOffI = xor_(cMatOffI, phase); offs[i] = add(mul(cMatOffI, i32_val(cMatShape)), mul(sOff, sStride)); } return offs; } SmallVector MMA16816SmemLoader::computeB32MatOffs(Value warpOff, Value lane, Value cSwizzleOffset) { assert(needTrans && "Only used in transpose mode."); // Load tf32 matrices with lds32 Value cOffInMat = udiv(lane, i32_val(4)); Value sOffInMat = urem(lane, i32_val(4)); Value phase = urem(udiv(sOffInMat, i32_val(perPhase)), i32_val(maxPhase)); SmallVector offs(numPtrs); for (int mat = 0; mat < 4; ++mat) { // Load 4 mats each time int kMatArrInt = kOrder == 1 ? mat / 2 : mat % 2; int nkMatArrInt = kOrder == 1 ? 
mat % 2 : mat / 2; if (kMatArrInt > 0) // we don't need pointers for k continue; Value kMatArr = i32_val(kMatArrInt); Value nkMatArr = i32_val(nkMatArrInt); Value cMatOff = add(mul(warpOff, i32_val(warpOffStride)), mul(nkMatArr, i32_val(matArrStride))); Value cSwizzleMatOff = udiv(cSwizzleOffset, i32_val(cMatShape)); cMatOff = add(cMatOff, cSwizzleMatOff); Value sMatOff = kMatArr; Value sOff = add(sOffInMat, mul(sMatOff, i32_val(sMatShape))); // FIXME: (kOrder == 1?) is really dirty hack for (int i = 0; i < numPtrs / 2; ++i) { Value cMatOffI = add(cMatOff, i32_val(i * pLoadStrideInMat * (kOrder == 1 ? 1 : 2))); cMatOffI = xor_(cMatOffI, phase); Value cOff = add(cOffInMat, mul(cMatOffI, i32_val(cMatShape))); cOff = urem(cOff, i32_val(tileShape[order[0]])); sOff = urem(sOff, i32_val(tileShape[order[1]])); offs[2 * i + nkMatArrInt] = add(cOff, mul(sOff, sStride)); } } return offs; } SmallVector MMA16816SmemLoader::computeB8MatOffs(Value warpOff, Value lane, Value cSwizzleOffset) { assert(needTrans && "Only used in transpose mode."); Value cOffInMat = udiv(lane, i32_val(4)); Value sOffInMat = mul(urem(lane, i32_val(4)), i32_val(4)); // each thread load 4 cols SmallVector offs(numPtrs); for (int mat = 0; mat < 4; ++mat) { int kMatArrInt = kOrder == 1 ? mat / 2 : mat % 2; int nkMatArrInt = kOrder == 1 ? mat % 2 : mat / 2; if (kMatArrInt > 0) // we don't need pointers for k continue; Value kMatArr = i32_val(kMatArrInt); Value nkMatArr = i32_val(nkMatArrInt); Value cMatOff = add(mul(warpOff, i32_val(warpOffStride)), mul(nkMatArr, i32_val(matArrStride))); Value sMatOff = kMatArr; for (int loadx4Off = 0; loadx4Off < numPtrs / 8; ++loadx4Off) { for (int elemOff = 0; elemOff < 4; ++elemOff) { int ptrOff = loadx4Off * 8 + nkMatArrInt * 4 + elemOff; Value cMatOffI = add(cMatOff, i32_val(loadx4Off * pLoadStrideInMat * (kOrder == 1 ? 1 : 2))); Value sOffInMatElem = add(sOffInMat, i32_val(elemOff)); // disable swizzling ... Value cOff = add(cOffInMat, mul(cMatOffI, i32_val(cMatShape))); Value sOff = add(sOffInMatElem, mul(sMatOff, i32_val(sMatShape))); // To prevent out-of-bound access when tile is too small. cOff = urem(cOff, i32_val(tileShape[order[0]])); sOff = urem(sOff, i32_val(tileShape[order[1]])); offs[ptrOff] = add(cOff, mul(sOff, sStride)); } } } return offs; } std::tuple MMA16816SmemLoader::loadX4(int mat0, int mat1, ArrayRef offs, ArrayRef ptrs, Type matTy, Type shemPtrTy) const { assert(mat0 % 2 == 0 && mat1 % 2 == 0 && "smem matrix load must be aligned"); int matIdx[2] = {mat0, mat1}; int ptrIdx{-1}; if (canUseLdmatrix) ptrIdx = matIdx[order[0]] / (instrShape[order[0]] / matShape[order[0]]); else if (elemBytes == 4 && needTrans) ptrIdx = matIdx[order[0]]; else if (elemBytes == 1 && needTrans) ptrIdx = matIdx[order[0]] * 4; else llvm::report_fatal_error("unsupported mma type found"); // The main difference with the original triton code is we removed the // prefetch-related logic here for the upstream optimizer phase should // take care with it, and that is transparent in dot conversion. auto getPtr = [&](int idx) { return ptrs[idx]; }; Value ptr = getPtr(ptrIdx); // The struct should have exactly the same element types. auto resTy = matTy.cast(); Type elemTy = matTy.cast().getBody()[0]; // For some reasons, LLVM's NVPTX backend inserts unnecessary (?) integer // instructions to pack & unpack sub-word integers. 
A workaround is to // store the results of ldmatrix in i32 if (auto vecElemTy = elemTy.dyn_cast()) { Type elemElemTy = vecElemTy.getElementType(); if (auto intTy = elemElemTy.dyn_cast()) { if (intTy.getWidth() <= 16) { elemTy = rewriter.getI32Type(); resTy = LLVM::LLVMStructType::getLiteral(ctx, SmallVector(4, elemTy)); } } } if (canUseLdmatrix) { Value sOffset = mul(i32_val(matIdx[order[1]] * sMatStride * sMatShape), sStride); Value sOffsetPtr = gep(shemPtrTy, ptr, sOffset); PTXBuilder builder; // ldmatrix.m8n8.x4 returns 4x2xfp16(that is 4xb32) elements for a // thread. auto resArgs = builder.newListOperand(4, "=r"); auto addrArg = builder.newAddrOperand(sOffsetPtr, "r"); auto ldmatrix = builder.create("ldmatrix.sync.aligned.m8n8.x4") ->o("trans", needTrans /*predicate*/) .o("shared.b16"); ldmatrix(resArgs, addrArg); // The result type is 4xi32, each i32 is composed of 2xf16 // elements (adjacent two columns in a row) or a single f32 element. Value resV4 = builder.launch(rewriter, loc, resTy); return {extract_val(elemTy, resV4, i32_arr_attr(0)), extract_val(elemTy, resV4, i32_arr_attr(1)), extract_val(elemTy, resV4, i32_arr_attr(2)), extract_val(elemTy, resV4, i32_arr_attr(3))}; } else if (elemBytes == 4 && needTrans) { // Use lds.32 to load tf32 matrices Value ptr2 = getPtr(ptrIdx + 1); assert(sMatStride == 1); int sOffsetElem = matIdx[order[1]] * (sMatStride * sMatShape); Value sOffsetElemVal = mul(i32_val(sOffsetElem), sStride); int sOffsetArrElem = sMatStride * sMatShape; Value sOffsetArrElemVal = add(sOffsetElemVal, mul(i32_val(sOffsetArrElem), sStride)); Value elems[4]; if (kOrder == 1) { elems[0] = load(gep(shemPtrTy, ptr, sOffsetElemVal)); elems[1] = load(gep(shemPtrTy, ptr2, sOffsetElemVal)); elems[2] = load(gep(shemPtrTy, ptr, sOffsetArrElemVal)); elems[3] = load(gep(shemPtrTy, ptr2, sOffsetArrElemVal)); } else { elems[0] = load(gep(shemPtrTy, ptr, sOffsetElemVal)); elems[2] = load(gep(shemPtrTy, ptr2, sOffsetElemVal)); elems[1] = load(gep(shemPtrTy, ptr, sOffsetArrElemVal)); elems[3] = load(gep(shemPtrTy, ptr2, sOffsetArrElemVal)); } std::array retElems; retElems.fill(undef(elemTy)); for (auto i = 0; i < 4; ++i) { retElems[i] = insert_element(elemTy, retElems[i], elems[i], i32_val(0)); } return {retElems[0], retElems[1], retElems[2], retElems[3]}; } else if (elemBytes == 1 && needTrans) { // work with int8 // Can't use i32 here. 
Use LLVM's VectorType elemTy = matTy.cast().getBody()[0]; std::array, 2> ptrs; ptrs[0] = { getPtr(ptrIdx), getPtr(ptrIdx + 1), getPtr(ptrIdx + 2), getPtr(ptrIdx + 3), }; ptrs[1] = { getPtr(ptrIdx + 4), getPtr(ptrIdx + 5), getPtr(ptrIdx + 6), getPtr(ptrIdx + 7), }; assert(sMatStride == 1); int sOffsetElem = matIdx[order[1]] * (sMatStride * sMatShape); Value sOffsetElemVal = mul(i32_val(sOffsetElem), sStride); int sOffsetArrElem = 1 * (sMatStride * sMatShape); Value sOffsetArrElemVal = add(sOffsetElemVal, mul(i32_val(sOffsetArrElem), sStride)); std::array i8v4Elems; i8v4Elems.fill(undef(elemTy)); Value i8Elems[4][4]; if (kOrder == 1) { for (int i = 0; i < 2; ++i) for (int j = 0; j < 4; ++j) i8Elems[i][j] = load(gep(shemPtrTy, ptrs[i][j], sOffsetElemVal)); for (int i = 2; i < 4; ++i) for (int j = 0; j < 4; ++j) i8Elems[i][j] = load(gep(shemPtrTy, ptrs[i - 2][j], sOffsetArrElemVal)); for (int m = 0; m < 4; ++m) { for (int e = 0; e < 4; ++e) i8v4Elems[m] = insert_element(i8v4Elems[m].getType(), i8v4Elems[m], i8Elems[m][e], i32_val(e)); } } else { // k first for (int j = 0; j < 4; ++j) i8Elems[0][j] = load(gep(shemPtrTy, ptrs[0][j], sOffsetElemVal)); for (int j = 0; j < 4; ++j) i8Elems[2][j] = load(gep(shemPtrTy, ptrs[1][j], sOffsetElemVal)); for (int j = 0; j < 4; ++j) i8Elems[1][j] = load(gep(shemPtrTy, ptrs[0][j], sOffsetArrElemVal)); for (int j = 0; j < 4; ++j) i8Elems[3][j] = load(gep(shemPtrTy, ptrs[1][j], sOffsetArrElemVal)); for (int m = 0; m < 4; ++m) { for (int e = 0; e < 4; ++e) i8v4Elems[m] = insert_element(i8v4Elems[m].getType(), i8v4Elems[m], i8Elems[m][e], i32_val(e)); } } return {bitcast(i8v4Elems[0], i32_ty), bitcast(i8v4Elems[1], i32_ty), bitcast(i8v4Elems[2], i32_ty), bitcast(i8v4Elems[3], i32_ty)}; } assert(false && "Invalid smem load"); return {Value{}, Value{}, Value{}, Value{}}; } MMA16816SmemLoader::MMA16816SmemLoader( int wpt, ArrayRef order, uint32_t kOrder, ArrayRef smemStrides, ArrayRef tileShape, ArrayRef instrShape, ArrayRef matShape, int perPhase, int maxPhase, int elemBytes, ConversionPatternRewriter &rewriter, TypeConverter *typeConverter, const Location &loc) : order(order.begin(), order.end()), kOrder(kOrder), tileShape(tileShape.begin(), tileShape.end()), instrShape(instrShape.begin(), instrShape.end()), matShape(matShape.begin(), matShape.end()), perPhase(perPhase), maxPhase(maxPhase), elemBytes(elemBytes), rewriter(rewriter), loc(loc), ctx(rewriter.getContext()) { cMatShape = matShape[order[0]]; sMatShape = matShape[order[1]]; sStride = smemStrides[order[1]]; // rule: k must be the fast-changing axis. needTrans = kOrder != order[0]; canUseLdmatrix = elemBytes == 2 || (!needTrans); // b16 if (canUseLdmatrix) { // Each CTA, the warps is arranged as [1xwpt] if not transposed, // otherwise [wptx1], and each warp will perform a mma. numPtrs = tileShape[order[0]] / (needTrans ? wpt : 1) / instrShape[order[0]]; } else { numPtrs = tileShape[order[0]] / wpt / matShape[order[0]]; } numPtrs = std::max(numPtrs, 2); // Special rule for i8/u8, 4 ptrs for each matrix if (!canUseLdmatrix && elemBytes == 1) numPtrs *= 4; int loadStrideInMat[2]; loadStrideInMat[kOrder] = 2; // instrShape[kOrder] / matShape[kOrder], always 2 loadStrideInMat[kOrder ^ 1] = wpt * (instrShape[kOrder ^ 1] / matShape[kOrder ^ 1]); pLoadStrideInMat = loadStrideInMat[order[0]]; sMatStride = loadStrideInMat[order[1]] / (instrShape[order[1]] / matShape[order[1]]); // Each matArr contains warpOffStride matrices. matArrStride = kOrder == 1 ? 
1 : wpt; warpOffStride = instrShape[kOrder ^ 1] / matShape[kOrder ^ 1]; } Value MMA16816ConversionHelper::loadA(Value tensor, const SharedMemoryObject &smemObj) const { auto aTensorTy = tensor.getType().cast(); SmallVector shape(aTensorTy.getShape().begin(), aTensorTy.getShape().end()); ValueTable ha; std::function loadFn; auto [matShapeM, matShapeN, matShapeK] = getMmaMatShape(aTensorTy); auto [mmaInstrM, mmaInstrN, mmaInstrK] = getMmaInstrShape(aTensorTy); int numRepM = getNumRepM(aTensorTy, shape[0]); int numRepK = getNumRepK(aTensorTy, shape[1]); if (aTensorTy.getEncoding().isa()) { Value warpM = getWarpM(shape[0]); // load from smem // we use ldmatrix.x4 so each warp processes 16x16 elements. int wpt = std::min(mmaLayout.getWarpsPerCTA()[0], shape[0] / 16); loadFn = getLoadMatrixFn(tensor, smemObj, mmaLayout, wpt /*wpt*/, 1 /*kOrder*/, {mmaInstrM, mmaInstrK} /*instrShape*/, {matShapeM, matShapeK} /*matShape*/, warpM /*warpId*/, ha /*vals*/, true /*isA*/); } else if (aTensorTy.getEncoding().isa()) { // load from registers, used in gemm fuse // TODO(Superjomn) Port the logic. assert(false && "Loading A from register is not supported yet."); } else { assert(false && "A's layout is not supported."); } // step1. Perform loading. for (int m = 0; m < numRepM; ++m) for (int k = 0; k < numRepK; ++k) loadFn(2 * m, 2 * k); // step2. Format the values to LLVM::Struct to passing to mma codegen. return composeValuesToDotOperandLayoutStruct(ha, numRepM, numRepK); } Value MMA16816ConversionHelper::loadB(Value tensor, const SharedMemoryObject &smemObj) { ValueTable hb; auto tensorTy = tensor.getType().cast(); SmallVector shape(tensorTy.getShape().begin(), tensorTy.getShape().end()); // TODO[Superjomn]: transB cannot be accessed in ConvertLayoutOp. bool transB = false; if (transB) { std::swap(shape[0], shape[1]); } auto [matShapeM, matShapeN, matShapeK] = getMmaMatShape(tensorTy); auto [mmaInstrM, mmaInstrN, mmaInstrK] = getMmaInstrShape(tensorTy); int numRepK = getNumRepK(tensorTy, shape[0]); int numRepN = getNumRepN(tensorTy, shape[1]); Value warpN = getWarpN(shape[1]); // we use ldmatrix.x4 so each warp processes 16x16 elements. int wpt = std::min(mmaLayout.getWarpsPerCTA()[1], shape[1] / 16); auto loadFn = getLoadMatrixFn(tensor, smemObj, mmaLayout, wpt /*wpt*/, 0 /*kOrder*/, {mmaInstrK, mmaInstrN} /*instrShape*/, {matShapeK, matShapeN} /*matShape*/, warpN /*warpId*/, hb /*vals*/, false /*isA*/); for (int n = 0; n < std::max(numRepN / 2, 1); ++n) { for (int k = 0; k < numRepK; ++k) loadFn(2 * n, 2 * k); } Value result = composeValuesToDotOperandLayoutStruct( hb, std::max(numRepN / 2, 1), numRepK); return result; } Value MMA16816ConversionHelper::loadC(Value tensor, Value llTensor) const { auto tensorTy = tensor.getType().cast(); auto [repM, repN] = DotOpMmaV2ConversionHelper::getRepMN(tensorTy); size_t fcSize = 4 * repM * repN; assert(tensorTy.getEncoding().isa() && "Currently, we only support $c with a mma layout."); // Load a normal C tensor with mma layout, that should be a // LLVM::struct with fcSize elements. 
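// Worked example (hypothetical shapes, assuming getRepMN follows the same
// rule as getNumRepM/getNumRepN): a 128x128 $c tile with warpsPerCTA = {2, 4}
// and the m16n8 accumulator shape gives
//   repM = max(128 / (2 * 16), 1) = 4,  repN = max(128 / (4 * 8), 1) = 4,
// so fcSize = 4 * repM * repN = 64 accumulator values per thread.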
auto structTy = llTensor.getType().cast(); assert(structTy.getBody().size() == fcSize && "DotOp's $c operand should pass the same number of values as $d in " "mma layout."); return llTensor; } LogicalResult MMA16816ConversionHelper::convertDot(Value a, Value b, Value c, Value d, Value loadedA, Value loadedB, Value loadedC, DotOp op, DotOpAdaptor adaptor) const { helper.deduceMmaType(op); auto aTensorTy = a.getType().cast(); auto dTensorTy = d.getType().cast(); SmallVector aShape(aTensorTy.getShape().begin(), aTensorTy.getShape().end()); auto dShape = dTensorTy.getShape(); // shape / shape_per_cta int numRepM = getNumRepM(aTensorTy, dShape[0]); int numRepN = getNumRepN(aTensorTy, dShape[1]); int numRepK = getNumRepK(aTensorTy, aShape[1]); ValueTable ha = getValuesFromDotOperandLayoutStruct(loadedA, numRepM, numRepK); ValueTable hb = getValuesFromDotOperandLayoutStruct( loadedB, std::max(numRepN / 2, 1), numRepK); auto fc = getElementsFromStruct(loc, loadedC, rewriter); auto callMma = [&](unsigned m, unsigned n, unsigned k) { unsigned colsPerThread = numRepN * 2; PTXBuilder builder; auto &mma = *builder.create(helper.getMmaInstr().str()); // using =r for float32 works but leads to less readable ptx. bool isIntMMA = dTensorTy.getElementType().isInteger(32); auto retArgs = builder.newListOperand(4, isIntMMA ? "=r" : "=f"); auto aArgs = builder.newListOperand({ {ha[{m, k}], "r"}, {ha[{m + 1, k}], "r"}, {ha[{m, k + 1}], "r"}, {ha[{m + 1, k + 1}], "r"}, }); auto bArgs = builder.newListOperand({{hb[{n, k}], "r"}, {hb[{n, k + 1}], "r"}}); auto cArgs = builder.newListOperand(); for (int i = 0; i < 4; ++i) { cArgs->listAppend(builder.newOperand(fc[m * colsPerThread + 4 * n + i], std::to_string(i))); // reuse the output registers } mma(retArgs, aArgs, bArgs, cArgs); Value mmaOut = builder.launch(rewriter, loc, helper.getMmaRetType()); Type elemTy = mmaOut.getType().cast().getBody()[0]; for (int i = 0; i < 4; ++i) fc[m * colsPerThread + 4 * n + i] = extract_val(elemTy, mmaOut, i32_arr_attr(i)); }; for (int k = 0; k < numRepK; ++k) for (int m = 0; m < numRepM; ++m) for (int n = 0; n < numRepN; ++n) callMma(2 * m, n, 2 * k); Type resElemTy = dTensorTy.getElementType(); for (auto &elem : fc) { elem = bitcast(elem, resElemTy); } // replace with new packed result Type structTy = LLVM::LLVMStructType::getLiteral( ctx, SmallVector(fc.size(), resElemTy)); Value res = getStructFromElements(loc, fc, rewriter, structTy); rewriter.replaceOp(op, res); return success(); } std::function MMA16816ConversionHelper::getLoadMatrixFn( Value tensor, const SharedMemoryObject &smemObj, MmaEncodingAttr mmaLayout, int wpt, uint32_t kOrder, SmallVector instrShape, SmallVector matShape, Value warpId, MMA16816ConversionHelper::ValueTable &vals, bool isA) const { auto tensorTy = tensor.getType().cast(); // We assumes that the input operand of Dot should be from shared layout. // TODO(Superjomn) Consider other layouts if needed later. auto sharedLayout = tensorTy.getEncoding().cast(); const int perPhase = sharedLayout.getPerPhase(); const int maxPhase = sharedLayout.getMaxPhase(); const int elemBytes = tensorTy.getElementTypeBitWidth() / 8; auto order = sharedLayout.getOrder(); // the original register_lds2, but discard the prefetch logic. auto ld2 = [](ValueTable &vals, int mn, int k, Value val) { vals[{mn, k}] = val; }; // (a, b) is the coordinate. 
auto load = [=, &vals, &ld2](int a, int b) { MMA16816SmemLoader loader( wpt, sharedLayout.getOrder(), kOrder, smemObj.strides, tensorTy.getShape() /*tileShape*/, instrShape, matShape, perPhase, maxPhase, elemBytes, rewriter, typeConverter, loc); Value cSwizzleOffset = smemObj.getCSwizzleOffset(order[0]); SmallVector offs = loader.computeOffsets(warpId, lane, cSwizzleOffset); const int numPtrs = loader.getNumPtrs(); SmallVector ptrs(numPtrs); Value smemBase = smemObj.getBaseBeforeSwizzle(order[0], loc, rewriter); Type smemPtrTy = helper.getShemPtrTy(); for (int i = 0; i < numPtrs; ++i) { ptrs[i] = bitcast(gep(smemPtrTy, smemBase, ValueRange({offs[i]})), smemPtrTy); } auto [ha0, ha1, ha2, ha3] = loader.loadX4( (kOrder == 1) ? a : b /*mat0*/, (kOrder == 1) ? b : a /*mat1*/, offs, ptrs, helper.getMatType(), helper.getShemPtrTy()); if (isA) { ld2(vals, a, b, ha0); ld2(vals, a + 1, b, ha1); ld2(vals, a, b + 1, ha2); ld2(vals, a + 1, b + 1, ha3); } else { ld2(vals, a, b, ha0); ld2(vals, a + 1, b, ha2); ld2(vals, a, b + 1, ha1); ld2(vals, a + 1, b + 1, ha3); } }; return load; } Value MMA16816ConversionHelper::composeValuesToDotOperandLayoutStruct( const MMA16816ConversionHelper::ValueTable &vals, int n0, int n1) const { std::vector elems; for (int m = 0; m < n0; ++m) for (int k = 0; k < n1; ++k) { elems.push_back(vals.at({2 * m, 2 * k})); elems.push_back(vals.at({2 * m, 2 * k + 1})); elems.push_back(vals.at({2 * m + 1, 2 * k})); elems.push_back(vals.at({2 * m + 1, 2 * k + 1})); } assert(!elems.empty()); Type elemTy = elems[0].getType(); Type structTy = LLVM::LLVMStructType::getLiteral( ctx, SmallVector(elems.size(), elemTy)); auto result = getStructFromElements(loc, elems, rewriter, structTy); return result; } MMA16816ConversionHelper::ValueTable MMA16816ConversionHelper::getValuesFromDotOperandLayoutStruct(Value value, int n0, int n1) const { auto elems = getElementsFromStruct(loc, value, rewriter); int offset{}; ValueTable vals; for (int i = 0; i < n0; ++i) { for (int j = 0; j < n1; j++) { vals[{2 * i, 2 * j}] = elems[offset++]; vals[{2 * i, 2 * j + 1}] = elems[offset++]; vals[{2 * i + 1, 2 * j}] = elems[offset++]; vals[{2 * i + 1, 2 * j + 1}] = elems[offset++]; } } return vals; } SmallVector DotOpFMAConversionHelper::getThreadIds( Value threadId, ArrayRef shapePerCTA, ArrayRef sizePerThread, ArrayRef order, ConversionPatternRewriter &rewriter, Location loc) const { int dim = order.size(); SmallVector threadIds(dim); for (unsigned k = 0; k < dim - 1; k++) { Value dimK = i32_val(shapePerCTA[order[k]] / sizePerThread[order[k]]); Value rem = urem(threadId, dimK); threadId = udiv(threadId, dimK); threadIds[order[k]] = rem; } Value dimK = i32_val(shapePerCTA[order[dim - 1]]); threadIds[order[dim - 1]] = urem(threadId, dimK); return threadIds; } Value DotOpFMAConversionHelper::loadA( Value A, Value llA, BlockedEncodingAttr dLayout, Value thread, Location loc, ConversionPatternRewriter &rewriter) const { auto aTensorTy = A.getType().cast(); auto aLayout = aTensorTy.getEncoding().cast(); auto aShape = aTensorTy.getShape(); auto aOrder = aLayout.getOrder(); auto order = dLayout.getOrder(); bool isARow = aOrder[0] == 1; auto aSmem = getSharedMemoryObjectFromStruct(loc, llA, rewriter); Value strideAM = aSmem.strides[0]; Value strideAK = aSmem.strides[1]; Value strideA0 = isARow ? strideAK : strideAM; Value strideA1 = isARow ? 
strideAM : strideAK; int aNumPtr = 8; int K = aShape[1]; int M = aShape[0]; auto shapePerCTA = getShapePerCTA(dLayout); auto sizePerThread = getSizePerThread(dLayout); Value _0 = i32_val(0); Value mContig = i32_val(sizePerThread[order[1]]); // threadId in blocked layout auto threadIds = getThreadIds(thread, shapePerCTA, sizePerThread, order, rewriter, loc); Value threadIdM = threadIds[0]; Value offA0 = isARow ? _0 : mul(threadIdM, mContig); Value offA1 = isARow ? mul(threadIdM, mContig) : _0; SmallVector aOff(aNumPtr); for (int i = 0; i < aNumPtr; ++i) { aOff[i] = add(mul(offA0, strideA0), mul(offA1, strideA1)); } auto elemTy = A.getType().cast().getElementType(); Type ptrTy = ptr_ty(elemTy); SmallVector aPtrs(aNumPtr); for (int i = 0; i < aNumPtr; ++i) aPtrs[i] = gep(ptrTy, aSmem.base, aOff[i]); SmallVector vas; int mShapePerCTA = getShapePerCTAForMN(dLayout, true /*isM*/); int mSizePerThread = getSizePerThreadForMN(dLayout, true /*isM*/); for (unsigned k = 0; k < K; ++k) for (unsigned m = 0; m < M; m += mShapePerCTA) for (unsigned mm = 0; mm < mSizePerThread; ++mm) { Value offset = add(mul(i32_val(m + mm), strideAM), mul(i32_val(k), strideAK)); Value pa = gep(ptrTy, aPtrs[0], offset); Value va = load(pa); vas.emplace_back(va); } return getStructFromValueTable(vas, rewriter, loc, elemTy); } Value DotOpFMAConversionHelper::loadB( Value B, Value llB, BlockedEncodingAttr dLayout, Value thread, Location loc, ConversionPatternRewriter &rewriter) const { auto bTensorTy = B.getType().cast(); auto bLayout = bTensorTy.getEncoding().cast(); auto bShape = bTensorTy.getShape(); auto bOrder = bLayout.getOrder(); auto order = dLayout.getOrder(); bool isBRow = bOrder[0] == 1; auto bSmem = getSharedMemoryObjectFromStruct(loc, llB, rewriter); Value strideBN = bSmem.strides[1]; Value strideBK = bSmem.strides[0]; Value strideB0 = isBRow ? strideBN : strideBK; Value strideB1 = isBRow ? strideBK : strideBN; int bNumPtr = 8; int K = bShape[0]; int N = bShape[1]; auto shapePerCTA = getShapePerCTA(dLayout); auto sizePerThread = getSizePerThread(dLayout); Value _0 = i32_val(0); Value nContig = i32_val(sizePerThread[order[0]]); // threadId in blocked layout auto threadIds = getThreadIds(thread, shapePerCTA, sizePerThread, order, rewriter, loc); Value threadIdN = threadIds[1]; Value offB0 = isBRow ? mul(threadIdN, nContig) : _0; Value offB1 = isBRow ? 
_0 : mul(threadIdN, nContig); SmallVector bOff(bNumPtr); for (int i = 0; i < bNumPtr; ++i) { bOff[i] = add(mul(offB0, strideB0), mul(offB1, strideB1)); } auto elemTy = B.getType().cast().getElementType(); Type ptrTy = ptr_ty(elemTy); SmallVector bPtrs(bNumPtr); for (int i = 0; i < bNumPtr; ++i) bPtrs[i] = gep(ptrTy, bSmem.base, bOff[i]); SmallVector vbs; int nShapePerCTA = getShapePerCTAForMN(dLayout, false /*isM*/); int nSizePerThread = getSizePerThreadForMN(dLayout, false /*isM*/); for (unsigned k = 0; k < K; ++k) for (unsigned n = 0; n < N; n += nShapePerCTA) for (unsigned nn = 0; nn < nSizePerThread; ++nn) { Value offset = add(mul(i32_val(n + nn), strideBN), mul(i32_val(k), strideBK)); Value pb = gep(ptrTy, bPtrs[0], offset); Value vb = load(pb); vbs.emplace_back(vb); } return getStructFromValueTable(vbs, rewriter, loc, elemTy); } DotOpFMAConversionHelper::ValueTable DotOpFMAConversionHelper::getValueTableFromStruct( Value val, int K, int n0, int shapePerCTA, int sizePerThread, ConversionPatternRewriter &rewriter, Location loc) const { ValueTable res; auto elems = getElementsFromStruct(loc, val, rewriter); int index = 0; for (unsigned k = 0; k < K; ++k) { for (unsigned m = 0; m < n0; m += shapePerCTA) for (unsigned mm = 0; mm < sizePerThread; ++mm) { res[{m + mm, k}] = elems[index++]; } } return res; } Value DotOpFMAConversionHelper::getStructFromValueTable( ArrayRef vals, ConversionPatternRewriter &rewriter, Location loc, Type elemTy) const { SmallVector elemTypes(vals.size(), elemTy); SmallVector elems; elems.reserve(vals.size()); for (auto &val : vals) { elems.push_back(val); } Type structTy = struct_ty(elemTypes); return getStructFromElements(loc, elems, rewriter, structTy); } int DotOpFMAConversionHelper::getNumElemsPerThread( ArrayRef shape, DotOperandEncodingAttr dotOpLayout) { auto blockedLayout = dotOpLayout.getParent().cast(); auto shapePerCTA = getShapePerCTA(blockedLayout); auto sizePerThread = getSizePerThread(blockedLayout); // TODO[Superjomn]: we assume the k aixs is fixed for $a and $b here, fix it // if not. int K = dotOpLayout.getOpIdx() == 0 ? shape[1] : shape[0]; int otherDim = dotOpLayout.getOpIdx() == 1 ? 
shape[1] : shape[0]; bool isM = dotOpLayout.getOpIdx() == 0; int shapePerCTAMN = getShapePerCTAForMN(blockedLayout, isM); int sizePerThreadMN = getSizePerThreadForMN(blockedLayout, isM); return K * std::max(otherDim / shapePerCTAMN, 1) * sizePerThreadMN; } } // namespace LLVM } // namespace mlir triton-2.0.0/lib/Conversion/TritonGPUToLLVM/DotOpHelpers.h000066400000000000000000000547031440023377100233070ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_DOT_OP_HELPERS_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_DOT_OP_HELPERS_H #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Transforms/DialectConversion.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Utility.h" #include "triton/Conversion/MLIRTypes.h" #include "triton/Conversion/TritonGPUToLLVM/PTXAsmFormat.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" #include "Utility.h" namespace mlir { namespace LLVM { using namespace mlir::triton; using ::mlir::triton::gpu::BlockedEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::MmaEncodingAttr; using ::mlir::triton::gpu::SharedEncodingAttr; // Helper for conversion of DotOp with mma, that is sm<80 struct DotOpMmaV1ConversionHelper { MmaEncodingAttr mmaLayout; ArrayRef wpt; static constexpr std::array fpw{{2, 2, 1}}; using ValueTable = std::map, std::pair>; explicit DotOpMmaV1ConversionHelper(MmaEncodingAttr mmaLayout) : mmaLayout(mmaLayout), wpt(mmaLayout.getWarpsPerCTA()) {} // Help to share some variables across multiple functions for A. // TODO[Superjomn]: refactor and restrict this to only use in DotOp // conversion. struct AParam { SmallVector rep; SmallVector spw; bool isAVec4{}; int vec{}; // This could only used in DotOp, not in // loadA/loadB/TypeConverter AParam(bool isARow, bool isAVec4) : isAVec4(isAVec4) { build(isARow); } private: void build(bool isARow); }; // Help to share some variables across multiple functions for A. // TODO[Superjomn]: refactor and restrict this to only use in DotOp // conversion. 
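// Sketch of how AParam/BParam are consumed (see getNumM/getNumN below): the
// constructor derives rep[] and spw[] from the operand's row-ness and
// vec4-ness, and the per-warp repetition count is then rep * dim / (spw * wpt);
// the vec field is only meaningful inside the DotOp conversion itself.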
struct BParam { SmallVector rep; SmallVector spw; bool isBVec4{}; int vec{}; // This could only used in DotOp, not in // loadA/loadB/TypeConverter BParam(bool isBRow, bool isBVec4) : isBVec4(isBVec4) { build(isBRow); } private: void build(bool isBRow); }; int getRepM(int M) const { return std::max(M / (wpt[0] * instrShape[0]), 1); } int getRepN(int N) const { return std::max(N / (wpt[1] * instrShape[1]), 1); } static ArrayRef getMmaInstrShape() { return instrShape; } static Type getMmaRetType(TensorType operand) { auto *ctx = operand.getContext(); Type fp32Ty = type::f32Ty(ctx); // f16*f16+f32->f32 return struct_ty(SmallVector{8, fp32Ty}); } static Type getMatType(TensorType operand) { auto *ctx = operand.getContext(); Type fp16Ty = type::f16Ty(ctx); Type vecTy = vec_ty(fp16Ty, 2); return struct_ty(SmallVector{vecTy}); } // Get the number of fp16x2 elements for $a. unsigned getNumM(int M, bool isARow, bool isAVec4) const { AParam param(isARow, isAVec4); unsigned numM = param.rep[0] * M / (param.spw[0] * wpt[0]); return numM; } // Get the number of fp16x2 elements for $b. unsigned getNumN(int N, bool isBRow, bool isBVec4) const { BParam param(isBRow, isBVec4); unsigned numN = param.rep[1] * N / (param.spw[1] * wpt[1]); return numN; } int numElemsPerThreadA(ArrayRef shape, bool isARow, bool isAVec4, int vec) const; int numElemsPerThreadB(ArrayRef shape, bool isBRow, bool isBVec4, int vec) const; // Loading $a from smem to registers, returns a LLVM::Struct. Value loadA(Value tensor, const SharedMemoryObject &smemObj, Value thread, Location loc, ConversionPatternRewriter &rewriter) const; // Loading $b from smem to registers, returns a LLVM::Struct. Value loadB(Value tensor, const SharedMemoryObject &smemObj, Value thread, Location loc, ConversionPatternRewriter &rewriter) const; static ArrayRef getOrder() { return mmaOrder; } // Compute the offset of the matrix to load. // Returns offsetAM, offsetAK, offsetBN, offsetBK. // NOTE, the information M(from $a) and N(from $b) couldn't be retrieved at // the same time in the usage in convert_layout[shared->dot_op], we leave // the noexist info to be 0 and only use the desired argument from the // composed result. In this way we want to retain the original code // structure in convert_mma884 method for easier debugging. std::tuple computeOffsets(Value threadId, bool isARow, bool isBRow, ArrayRef fpw, ArrayRef spw, ArrayRef rep, ConversionPatternRewriter &rewriter, Location loc) const; // Extract values belong to $a or $b from a LLVMStruct, the shape is n0xn1. DotOpMmaV1ConversionHelper::ValueTable extractLoadedOperand(Value llStruct, int NK, ConversionPatternRewriter &rewriter) const; // Get the number of elements of this thread in M axis. The N axis could be // further deduced with the accSize / elemsM. \param wpt: the wpt in M axis // \param M: the shape in M axis int getElemsM(int wpt, int M, bool isARow, bool isAVec4) { DotOpMmaV1ConversionHelper::AParam param(isARow, isAVec4); int shapePerCTAM = param.spw[0] * wpt; return M / shapePerCTAM * param.rep[0]; } using CoordTy = SmallVector; // Get the coordinates(m,n) of the elements emit by a thread in accumulator. 
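// Illustrative note: the vector returned by getMNCoords lines up with the
// accumulator values owned by one thread, so getCoord(elemId, coords) yields
// the (m, n) position in $d of that thread's elemId-th value.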
static SmallVector getMNCoords(Value thread, ConversionPatternRewriter &rewriter, ArrayRef wpt, ArrayRef shape, bool isARow, bool isBRow, bool isAVec4, bool isBVec4); // \param elemId the offset of the element in a thread static CoordTy getCoord(int elemId, ArrayRef coords) { return coords[elemId]; } private: static constexpr unsigned instrShape[] = {16, 16, 4}; static constexpr unsigned mmaOrder[] = {0, 1}; }; // Helper for conversion of DotOp with mma, that is sm>=80 struct DotOpMmaV2ConversionHelper { enum class TensorCoreType : uint8_t { // floating-point tensor core instr FP32_FP16_FP16_FP32 = 0, // default FP32_BF16_BF16_FP32, FP32_TF32_TF32_FP32, // integer tensor core instr INT32_INT1_INT1_INT32, // Not implemented INT32_INT4_INT4_INT32, // Not implemented INT32_INT8_INT8_INT32, // Not implemented // NOT_APPLICABLE, }; MmaEncodingAttr mmaLayout; MLIRContext *ctx{}; explicit DotOpMmaV2ConversionHelper(MmaEncodingAttr mmaLayout) : mmaLayout(mmaLayout) { ctx = mmaLayout.getContext(); } void deduceMmaType(DotOp op) const { mmaType = getMmaType(op); } void deduceMmaType(Type operandTy) const { mmaType = getTensorCoreTypeFromOperand(operandTy); } // Get the M and N of mma instruction shape. static std::tuple getInstrShapeMN() { // According to DotOpConversionHelper::mmaInstrShape, all the M,N are // {16,8} return {16, 8}; } static std::tuple getRepMN(const RankedTensorType &tensorTy); Type getShemPtrTy() const; // The type of matrix that loaded by either a ldmatrix or composed lds. Type getMatType() const; Type getLoadElemTy(); Type getMmaRetType() const; ArrayRef getMmaInstrShape() const { assert(mmaType != TensorCoreType::NOT_APPLICABLE && "Unknown mma type found."); return mmaInstrShape.at(mmaType); } static ArrayRef getMmaInstrShape(TensorCoreType tensorCoreType) { assert(tensorCoreType != TensorCoreType::NOT_APPLICABLE && "Unknown mma type found."); return mmaInstrShape.at(tensorCoreType); } ArrayRef getMmaMatShape() const { assert(mmaType != TensorCoreType::NOT_APPLICABLE && "Unknown mma type found."); return mmaMatShape.at(mmaType); } // Deduce the TensorCoreType from either $a or $b's type. static TensorCoreType getTensorCoreTypeFromOperand(Type operandTy); int getVec() const { assert(mmaType != TensorCoreType::NOT_APPLICABLE && "Unknown mma type found."); return mmaInstrVec.at(mmaType); } StringRef getMmaInstr() const { assert(mmaType != TensorCoreType::NOT_APPLICABLE && "Unknown mma type found."); return mmaInstrPtx.at(mmaType); } static TensorCoreType getMmaType(triton::DotOp op); private: mutable TensorCoreType mmaType{TensorCoreType::NOT_APPLICABLE}; // Used on nvidia GPUs mma layout .version == 2 // Refer to // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-storage // for more details. inline static const std::map> mmaInstrShape = { {TensorCoreType::FP32_FP16_FP16_FP32, {16, 8, 16}}, {TensorCoreType::FP32_BF16_BF16_FP32, {16, 8, 16}}, {TensorCoreType::FP32_TF32_TF32_FP32, {16, 8, 8}}, {TensorCoreType::INT32_INT1_INT1_INT32, {16, 8, 256}}, {TensorCoreType::INT32_INT4_INT4_INT32, {16, 8, 64}}, {TensorCoreType::INT32_INT8_INT8_INT32, {16, 8, 32}}, }; // shape of matrices loaded by ldmatrix (m-n-k, for mxk & kxn matrices) // Refer to // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix // for more details. 
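// Example, reading this table together with mmaInstrShape above:
// FP32_FP16_FP16_FP32 uses the m16n8k16 instruction with 8x8 ldmatrix tiles,
// so the 16x16 $a fragment of one instruction spans 2x2 such tiles and a
// single ldmatrix.x4 fetches all four; the integer variants keep the 8x8
// m/n tiles and only widen the k tile (64/32/16 for b1/s4/s8).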
inline static const std::map> mmaMatShape = { {TensorCoreType::FP32_FP16_FP16_FP32, {8, 8, 8}}, {TensorCoreType::FP32_BF16_BF16_FP32, {8, 8, 8}}, {TensorCoreType::FP32_TF32_TF32_FP32, {8, 8, 4}}, {TensorCoreType::INT32_INT1_INT1_INT32, {8, 8, 64}}, {TensorCoreType::INT32_INT4_INT4_INT32, {8, 8, 32}}, {TensorCoreType::INT32_INT8_INT8_INT32, {8, 8, 16}}, }; // Supported mma instruction in PTX. // Refer to // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-mma // for more details. inline static const std::map mmaInstrPtx = { {TensorCoreType::FP32_FP16_FP16_FP32, "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"}, {TensorCoreType::FP32_BF16_BF16_FP32, "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32"}, {TensorCoreType::FP32_TF32_TF32_FP32, "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32"}, {TensorCoreType::INT32_INT1_INT1_INT32, "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc"}, {TensorCoreType::INT32_INT4_INT4_INT32, "mma.sync.aligned.m16n8k64.row.col.satfinite.s32.s4.s4.s32"}, {TensorCoreType::INT32_INT8_INT8_INT32, "mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32"}, }; // vector length per ldmatrix (16*8/element_size_in_bits) inline static const std::map mmaInstrVec = { {TensorCoreType::FP32_FP16_FP16_FP32, 8}, {TensorCoreType::FP32_BF16_BF16_FP32, 8}, {TensorCoreType::FP32_TF32_TF32_FP32, 4}, {TensorCoreType::INT32_INT1_INT1_INT32, 128}, {TensorCoreType::INT32_INT4_INT4_INT32, 32}, {TensorCoreType::INT32_INT8_INT8_INT32, 16}, }; }; // Data loader for mma.16816 instruction. class MMA16816SmemLoader { public: MMA16816SmemLoader(int wpt, ArrayRef order, uint32_t kOrder, ArrayRef smemStrides, ArrayRef tileShape, ArrayRef instrShape, ArrayRef matShape, int perPhase, int maxPhase, int elemBytes, ConversionPatternRewriter &rewriter, TypeConverter *typeConverter, const Location &loc); // lane = thread % 32 // warpOff = (thread/32) % wpt(0) llvm::SmallVector computeOffsets(Value warpOff, Value lane, Value cSwizzleOffset) { if (canUseLdmatrix) return computeLdmatrixMatOffs(warpOff, lane, cSwizzleOffset); else if (elemBytes == 4 && needTrans) return computeB32MatOffs(warpOff, lane, cSwizzleOffset); else if (elemBytes == 1 && needTrans) return computeB8MatOffs(warpOff, lane, cSwizzleOffset); else llvm::report_fatal_error("Invalid smem load config"); return {}; } int getNumPtrs() const { return numPtrs; } // Compute the offset to the matrix this thread(indexed by warpOff and lane) // mapped to. SmallVector computeLdmatrixMatOffs(Value warpId, Value lane, Value cSwizzleOffset); // Compute 32-bit matrix offsets. SmallVector computeB32MatOffs(Value warpOff, Value lane, Value cSwizzleOffset); // compute 8-bit matrix offset. SmallVector computeB8MatOffs(Value warpOff, Value lane, Value cSwizzleOffset); // Load 4 matrices and returns 4 vec<2> elements. std::tuple loadX4(int mat0, int mat1, ArrayRef offs, ArrayRef ptrs, Type matTy, Type shemPtrTy) const; private: SmallVector order; int kOrder; SmallVector tileShape; SmallVector instrShape; SmallVector matShape; int perPhase; int maxPhase; int elemBytes; ConversionPatternRewriter &rewriter; const Location &loc; MLIRContext *ctx{}; int cMatShape; int sMatShape; Value sStride; bool needTrans; bool canUseLdmatrix; int numPtrs; int pLoadStrideInMat; int sMatStride; int matArrStride; int warpOffStride; }; // This class helps to adapt the existing DotOpConversion to the latest // DotOpOperand layout design. It decouples the exising implementation to two // parts: // 1. 
loading the specific operand matrix(for $a, $b, $c) from smem // 2. passing the loaded value and perform the mma codegen struct MMA16816ConversionHelper { MmaEncodingAttr mmaLayout; ArrayRef wpt; SmallVector properWpt; Value thread, lane, warp; DotOpMmaV2ConversionHelper helper; ConversionPatternRewriter &rewriter; TypeConverter *typeConverter; Location loc; MLIRContext *ctx{}; using ValueTable = std::map, Value>; // dotOperand: type of either one operand of dotOp. MMA16816ConversionHelper(Type dotOperand, MmaEncodingAttr mmaLayout, Value thread, ConversionPatternRewriter &rewriter, TypeConverter *typeConverter, Location loc) : mmaLayout(mmaLayout), thread(thread), helper(mmaLayout), rewriter(rewriter), typeConverter(typeConverter), loc(loc), ctx(mmaLayout.getContext()), wpt(mmaLayout.getWarpsPerCTA()) { helper.deduceMmaType(dotOperand); Value _32 = i32_val(32); lane = urem(thread, _32); warp = udiv(thread, _32); } // Get a warpId for M axis. Value getWarpM(int M) const { auto matShape = helper.getMmaMatShape(); return urem(urem(warp, i32_val(wpt[0])), i32_val(M / matShape[0])); } // Get a warpId for N axis. Value getWarpN(int N) const { auto matShape = helper.getMmaMatShape(); Value warpMN = udiv(warp, i32_val(wpt[0])); return urem(urem(warpMN, i32_val(wpt[1])), i32_val(N / matShape[1])); } // Get the mmaInstrShape deducing either from $a or $b. std::tuple getMmaInstrShape(Type operand) const { helper.deduceMmaType(operand); auto mmaInstrShape = helper.getMmaInstrShape(); int mmaInstrM = mmaInstrShape[0]; int mmaInstrN = mmaInstrShape[1]; int mmaInstrK = mmaInstrShape[2]; return std::make_tuple(mmaInstrM, mmaInstrN, mmaInstrK); } // Get the mmaMatShape deducing either from $a or $b. std::tuple getMmaMatShape(Type operand) const { helper.deduceMmaType(operand); auto matShape = helper.getMmaMatShape(); int matShapeM = matShape[0]; int matShapeN = matShape[1]; int matShapeK = matShape[2]; return std::make_tuple(matShapeM, matShapeN, matShapeK); } // \param operand is either $a or $b's type. inline int getNumRepM(Type operand, int M) const { return getNumRepM(operand, M, wpt[0]); } // \param operand is either $a or $b's type. inline int getNumRepN(Type operand, int N) const { return getNumRepN(operand, N, wpt[1]); } // \param operand is either $a or $b's type. inline int getNumRepK(Type operand, int K) const { return getNumRepK_(operand, K); } static int getNumRepM(Type operand, int M, int wpt) { auto tensorCoreType = DotOpMmaV2ConversionHelper::getTensorCoreTypeFromOperand(operand); int mmaInstrM = DotOpMmaV2ConversionHelper::getMmaInstrShape(tensorCoreType)[0]; return std::max(M / (wpt * mmaInstrM), 1); } static int getNumRepN(Type operand, int N, int wpt) { auto tensorCoreType = DotOpMmaV2ConversionHelper::getTensorCoreTypeFromOperand(operand); int mmaInstrN = DotOpMmaV2ConversionHelper::getMmaInstrShape(tensorCoreType)[1]; return std::max(N / (wpt * mmaInstrN), 1); } static int getNumRepK_(Type operand, int K) { auto tensorCoreType = DotOpMmaV2ConversionHelper::getTensorCoreTypeFromOperand(operand); int mmaInstrK = DotOpMmaV2ConversionHelper::getMmaInstrShape(tensorCoreType)[2]; return std::max(K / mmaInstrK, 1); } // Get number of elements per thread for $a operand. static size_t getANumElemsPerThread(RankedTensorType operand, int wpt) { auto shape = operand.getShape(); int repM = getNumRepM(operand, shape[0], wpt); int repK = getNumRepK_(operand, shape[1]); return 4 * repM * repK; } // Get number of elements per thread for $b operand. 
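// Worked example for the two helpers (getANumElemsPerThread above, the $b
// variant below), using hypothetical operand shapes: a 64x32 f16 $a with
// wpt = 2 gives repM = max(64 / (2 * 16), 1) = 2 and repK = max(32 / 16, 1) = 2,
// so 4 * 2 * 2 = 16 packed values per thread. The $b variant uses
// max(repN / 2, 1) instead, presumably because one 16x16 ldmatrix load spans
// two n8 instruction tiles (cf. loadB).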
static size_t getBNumElemsPerThread(RankedTensorType operand, int wpt) { auto shape = operand.getShape(); int repK = getNumRepK_(operand, shape[0]); int repN = getNumRepN(operand, shape[1], wpt); return 4 * std::max(repN / 2, 1) * repK; } // Loading $a from smem to registers, returns a LLVM::Struct. Value loadA(Value tensor, const SharedMemoryObject &smemObj) const; // Loading $b from smem to registers, returns a LLVM::Struct. Value loadB(Value tensor, const SharedMemoryObject &smemObj); // Loading $c to registers, returns a Value. Value loadC(Value tensor, Value llTensor) const; // Conduct the Dot conversion. // \param a, \param b, \param c and \param d are DotOp operands. // \param loadedA, \param loadedB, \param loadedC, all of them are result of // loading. LogicalResult convertDot(Value a, Value b, Value c, Value d, Value loadedA, Value loadedB, Value loadedC, DotOp op, DotOpAdaptor adaptor) const; private: std::function getLoadMatrixFn(Value tensor, const SharedMemoryObject &smemObj, MmaEncodingAttr mmaLayout, int wpt, uint32_t kOrder, SmallVector instrShape, SmallVector matShape, Value warpId, ValueTable &vals, bool isA) const; // Compose a map of Values to a LLVM::Struct. // The layout is a list of Value with coordinate of (i,j), the order is as // the follows: // [ // (0,0), (0,1), (1,0), (1,1), # i=0, j=0 // (0,2), (0,3), (1,2), (1,3), # i=0, j=1 // (0,4), (0,5), (1,4), (1,5), # i=0, j=2 // ... // (2,0), (2,1), (3,0), (3,1), # i=1, j=0 // (2,2), (2,3), (3,2), (3,3), # i=1, j=1 // (2,4), (2,5), (3,4), (3,5), # i=1, j=2 // ... // ] // i \in [0, n0) and j \in [0, n1) // There should be \param n0 * \param n1 elements in the output Struct. Value composeValuesToDotOperandLayoutStruct(const ValueTable &vals, int n0, int n1) const; ValueTable getValuesFromDotOperandLayoutStruct(Value value, int n0, int n1) const; }; // Helper for conversion of FMA DotOp. struct DotOpFMAConversionHelper { Attribute layout; MLIRContext *ctx{}; using ValueTable = std::map, Value>; explicit DotOpFMAConversionHelper(Attribute layout) : layout(layout), ctx(layout.getContext()) {} SmallVector getThreadIds(Value threadId, ArrayRef shapePerCTA, ArrayRef sizePerThread, ArrayRef order, ConversionPatternRewriter &rewriter, Location loc) const; Value loadA(Value A, Value llA, BlockedEncodingAttr dLayout, Value thread, Location loc, ConversionPatternRewriter &rewriter) const; Value loadB(Value B, Value llB, BlockedEncodingAttr dLayout, Value thread, Location loc, ConversionPatternRewriter &rewriter) const; ValueTable getValueTableFromStruct(Value val, int K, int n0, int shapePerCTA, int sizePerThread, ConversionPatternRewriter &rewriter, Location loc) const; Value getStructFromValueTable(ArrayRef vals, ConversionPatternRewriter &rewriter, Location loc, Type elemTy) const; // get number of elements per thread for $a or $b. static int getNumElemsPerThread(ArrayRef shape, DotOperandEncodingAttr dotOpLayout); // Get shapePerCTA for M or N axis. static int getShapePerCTAForMN(BlockedEncodingAttr layout, bool isM) { auto order = layout.getOrder(); auto shapePerCTA = getShapePerCTA(layout); int mShapePerCTA = order[0] == 1 ? shapePerCTA[order[1]] : shapePerCTA[order[0]]; int nShapePerCTA = order[0] == 0 ? shapePerCTA[order[1]] : shapePerCTA[order[0]]; return isM ? mShapePerCTA : nShapePerCTA; } // Get sizePerThread for M or N axis. static int getSizePerThreadForMN(BlockedEncodingAttr layout, bool isM) { auto order = layout.getOrder(); auto sizePerThread = getSizePerThread(layout); int mSizePerThread = order[0] == 1 ? 
sizePerThread[order[1]] : sizePerThread[order[0]]; int nSizePerThread = order[0] == 0 ? sizePerThread[order[1]] : sizePerThread[order[0]]; return isM ? mSizePerThread : nSizePerThread; } }; } // namespace LLVM } // namespace mlir #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM.cpp000066400000000000000000000256511440023377100233150ustar00rootroot00000000000000#include "DotOpToLLVM.h" #include "DotOpHelpers.h" #include "Utility.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::DotOpFMAConversionHelper; using ::mlir::LLVM::DotOpMmaV1ConversionHelper; using ::mlir::LLVM::getElementsFromStruct; using ::mlir::LLVM::getStructFromElements; using ::mlir::LLVM::MMA16816ConversionHelper; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::MmaEncodingAttr; struct DotOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::DotOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::DotOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // D = A * B + C Value A = op.a(); Value D = op.getResult(); // Here we assume the DotOp's operands always comes from shared memory. auto AShape = A.getType().cast().getShape(); size_t reduceAxis = 1; unsigned K = AShape[reduceAxis]; bool isOuter = K == 1; MmaEncodingAttr mmaLayout = D.getType() .cast() .getEncoding() .dyn_cast(); if (!isOuter && mmaLayout && supportMMA(op, mmaLayout.getVersionMajor())) { if (mmaLayout.isVolta()) return convertMMA884(op, adaptor, rewriter); if (mmaLayout.isAmpere()) return convertMMA16816(op, adaptor, rewriter); llvm::report_fatal_error( "Unsupported MMA kind found when converting DotOp to LLVM."); } if (D.getType() .cast() .getEncoding() .isa()) return convertFMADot(op, adaptor, rewriter); llvm::report_fatal_error( "Unsupported DotOp found when converting TritonGPU to LLVM."); } private: // Convert to mma.m16n8k16 LogicalResult convertMMA16816(triton::DotOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto loc = op.getLoc(); auto mmaLayout = op.getResult() .getType() .cast() .getEncoding() .cast(); Value A = op.a(); Value B = op.b(); Value C = op.c(); MMA16816ConversionHelper mmaHelper(A.getType(), mmaLayout, getThreadId(rewriter, loc), rewriter, getTypeConverter(), loc); auto ATensorTy = A.getType().cast(); auto BTensorTy = B.getType().cast(); assert(ATensorTy.getEncoding().isa() && BTensorTy.getEncoding().isa() && "Both $a and %b should be DotOperand layout."); Value loadedA, loadedB, loadedC; loadedA = adaptor.a(); loadedB = adaptor.b(); loadedC = mmaHelper.loadC(op.c(), adaptor.c()); return mmaHelper.convertDot(A, B, C, op.d(), loadedA, loadedB, loadedC, op, adaptor); } /// Convert to mma.m8n8k4 LogicalResult convertMMA884(triton::DotOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto *ctx = op.getContext(); auto loc = op.getLoc(); Value A = op.a(); Value B = op.b(); Value D = op.getResult(); auto mmaLayout = D.getType() .cast() .getEncoding() .cast(); auto ALayout = A.getType() .cast() .getEncoding() .cast(); auto BLayout = B.getType() .cast() .getEncoding() .cast(); auto ATensorTy = A.getType().cast(); auto BTensorTy = B.getType().cast(); auto DTensorTy = D.getType().cast(); auto AShape = ATensorTy.getShape(); auto BShape = BTensorTy.getShape(); auto DShape = DTensorTy.getShape(); auto wpt = mmaLayout.getWarpsPerCTA(); bool isARow = ALayout.getIsMMAv1Row().cast().getValue(); bool isBRow = 
BLayout.getIsMMAv1Row().cast().getValue(); auto [isARow_, isBRow_, isAVec4_, isBVec4_, mmaId] = mmaLayout.decodeVoltaLayoutStates(); assert(isARow == isARow_); assert(isBRow == isBRow_); DotOpMmaV1ConversionHelper helper(mmaLayout); unsigned numM = helper.getNumM(AShape[0], isARow, isAVec4_); unsigned numN = helper.getNumN(BShape[1], isBRow, isBVec4_); unsigned NK = AShape[1]; auto has = helper.extractLoadedOperand(adaptor.a(), NK, rewriter); auto hbs = helper.extractLoadedOperand(adaptor.b(), NK, rewriter); // Initialize accumulators with external values, the acc holds the // accumulator value that is shared between the MMA instructions inside a // DotOp, we can call the order of the values the accumulator-internal // order. SmallVector acc = getElementsFromStruct(loc, adaptor.c(), rewriter); size_t resSize = acc.size(); // The resVals holds the final result of the DotOp. // NOTE The current order of resVals is different from acc, we call it the // accumulator-external order. and SmallVector resVals(resSize); auto getIdx = [&](int m, int n) { std::vector idx{{ (m * 2 + 0) + (n * 4 + 0) * numM, // row0 (m * 2 + 0) + (n * 4 + 1) * numM, (m * 2 + 1) + (n * 4 + 0) * numM, // row1 (m * 2 + 1) + (n * 4 + 1) * numM, (m * 2 + 0) + (n * 4 + 2) * numM, // row2 (m * 2 + 0) + (n * 4 + 3) * numM, (m * 2 + 1) + (n * 4 + 2) * numM, // row3 (m * 2 + 1) + (n * 4 + 3) * numM, }}; return idx; }; auto callMMA = [&](unsigned m, unsigned n, unsigned k) { auto ha = has.at({m, k}); auto hb = hbs.at({n, k}); PTXBuilder builder; auto idx = getIdx(m, n); // note: using "=f" for float leads to cleaner PTX bool isIntMMA = DTensorTy.getElementType().isInteger(32); auto *resOprs = builder.newListOperand(8, isIntMMA ? "=r" : "=f"); auto *AOprs = builder.newListOperand({ {ha.first, "r"}, {ha.second, "r"}, }); auto *BOprs = builder.newListOperand({ {hb.first, "r"}, {hb.second, "r"}, }); auto *COprs = builder.newListOperand(); for (int i = 0; i < 8; ++i) COprs->listAppend(builder.newOperand(acc[idx[i]], std::to_string(i))); auto mma = builder.create("mma.sync.aligned.m8n8k4") ->o(isARow ? "row" : "col") .o(isBRow ? 
"row" : "col") .o("f32.f16.f16.f32"); mma(resOprs, AOprs, BOprs, COprs); Value res = builder.launch(rewriter, loc, helper.getMmaRetType(ATensorTy)); for (auto i = 0; i < 8; i++) { Value elem = extract_val(f32_ty, res, i32_arr_attr(i)); acc[idx[i]] = elem; } }; for (unsigned k = 0; k < NK; k += 4) for (unsigned m = 0; m < numM / 2; ++m) for (unsigned n = 0; n < numN / 2; ++n) { callMMA(m, n, k); } // res holds the same layout of acc for (size_t i = 0; i < acc.size(); ++i) { resVals[i] = acc[i]; } Type structTy = LLVM::LLVMStructType::getLiteral( ctx, SmallVector(resSize, type::f32Ty(ctx))); Value res = getStructFromElements(loc, resVals, rewriter, structTy); rewriter.replaceOp(op, res); return success(); } LogicalResult convertFMADot(triton::DotOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto *ctx = rewriter.getContext(); auto loc = op.getLoc(); auto threadId = getThreadId(rewriter, loc); auto A = op.a(); auto B = op.b(); auto C = op.c(); auto D = op.getResult(); auto aTensorTy = A.getType().cast(); auto bTensorTy = B.getType().cast(); auto cTensorTy = C.getType().cast(); auto dTensorTy = D.getType().cast(); auto aShape = aTensorTy.getShape(); auto bShape = bTensorTy.getShape(); auto cShape = cTensorTy.getShape(); BlockedEncodingAttr dLayout = dTensorTy.getEncoding().cast(); auto order = dLayout.getOrder(); auto cc = getElementsFromStruct(loc, adaptor.c(), rewriter); DotOpFMAConversionHelper helper(dLayout); Value llA = adaptor.a(); Value llB = adaptor.b(); auto sizePerThread = getSizePerThread(dLayout); auto shapePerCTA = getShapePerCTA(dLayout); int K = aShape[1]; int M = aShape[0]; int N = bShape[1]; int mShapePerCTA = order[0] == 1 ? shapePerCTA[order[1]] : shapePerCTA[order[0]]; int mSizePerThread = order[0] == 1 ? sizePerThread[order[1]] : sizePerThread[order[0]]; int nShapePerCTA = order[0] == 0 ? shapePerCTA[order[1]] : shapePerCTA[order[0]]; int nSizePerThread = order[0] == 0 ? sizePerThread[order[1]] : sizePerThread[order[0]]; auto has = helper.getValueTableFromStruct(llA, K, M, mShapePerCTA, mSizePerThread, rewriter, loc); auto hbs = helper.getValueTableFromStruct(llB, K, N, nShapePerCTA, nSizePerThread, rewriter, loc); SmallVector ret = cc; bool isCRow = order[0] == 1; for (unsigned k = 0; k < K; k++) { for (unsigned m = 0; m < M; m += mShapePerCTA) for (unsigned n = 0; n < N; n += nShapePerCTA) for (unsigned mm = 0; mm < mSizePerThread; ++mm) for (unsigned nn = 0; nn < nSizePerThread; ++nn) { int mIdx = m / mShapePerCTA * mSizePerThread + mm; int nIdx = n / nShapePerCTA * nSizePerThread + nn; int z = isCRow ? 
mIdx * N / nShapePerCTA * mSizePerThread + nIdx : nIdx * M / mShapePerCTA * nSizePerThread + mIdx; ret[z] = rewriter.create( loc, has[{m + mm, k}], hbs[{n + nn, k}], ret[z]); } } auto res = getStructFromElements( loc, ret, rewriter, struct_ty(SmallVector(ret.size(), ret[0].getType()))); rewriter.replaceOp(op, res); return success(); } }; void populateDotOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, PatternBenefit benefit) { patterns.add(typeConverter, allocation, smem, benefit); } triton-2.0.0/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM.h000066400000000000000000000010511440023377100227460ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_DOT_OP_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_DOT_OP_H #include "TritonGPUToLLVMBase.h" using namespace mlir; using namespace mlir::triton; void populateDotOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, PatternBenefit benefit); #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp000066400000000000000000001120521440023377100250400ustar00rootroot00000000000000#include "ElementwiseOpToLLVM.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::getElementsFromStruct; using ::mlir::LLVM::getStructFromElements; using ::mlir::triton::gpu::getElemsPerThread; struct FpToFpOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::FpToFpOp>::ConvertTritonGPUOpToLLVMPattern; static SmallVector convertFp8x4ToFp16x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto ctx = rewriter.getContext(); auto fp8x4VecTy = vec_ty(i8_ty, 4); Value fp8x4Vec = undef(fp8x4VecTy); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v0, i32_val(0)); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v1, i32_val(1)); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v2, i32_val(2)); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v3, i32_val(3)); fp8x4Vec = bitcast(fp8x4Vec, i32_ty); PTXBuilder builder; auto *ptxAsm = "{ \n" ".reg .b32 a<2>, b<2>; \n" "prmt.b32 a0, 0, $2, 0x5040; \n" "prmt.b32 a1, 0, $2, 0x7060; \n" "lop3.b32 b0, a0, 0x7fff7fff, 0, 0xc0; \n" "lop3.b32 b1, a1, 0x7fff7fff, 0, 0xc0; \n" "shr.b32 b0, b0, 1; \n" "shr.b32 b1, b1, 1; \n" "lop3.b32 $0, b0, 0x80008000, a0, 0xf8; \n" "lop3.b32 $1, b1, 0x80008000, a1, 0xf8; \n" "}"; auto &call = *builder.create(ptxAsm); auto *o0 = builder.newOperand("=r"); auto *o1 = builder.newOperand("=r"); auto *i = builder.newOperand(fp8x4Vec, "r"); call({o0, o1, i}, /*onlyAttachMLIRArgs=*/true); auto fp16x2VecTy = vec_ty(f16_ty, 2); auto fp16x2x2StructTy = struct_ty(SmallVector{fp16x2VecTy, fp16x2VecTy}); auto fp16x2x2Struct = builder.launch(rewriter, loc, fp16x2x2StructTy, false); auto fp16x2Vec0 = extract_val(fp16x2VecTy, fp16x2x2Struct, i32_arr_attr(0)); auto fp16x2Vec1 = extract_val(fp16x2VecTy, fp16x2x2Struct, i32_arr_attr(1)); return {extract_element(f16_ty, fp16x2Vec0, i32_val(0)), extract_element(f16_ty, fp16x2Vec0, i32_val(1)), extract_element(f16_ty, fp16x2Vec1, i32_val(0)), extract_element(f16_ty, fp16x2Vec1, i32_val(1))}; } static SmallVector convertFp16x4ToFp8x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto ctx = 
rewriter.getContext(); auto fp16x2VecTy = vec_ty(f16_ty, 2); Value fp16x2Vec0 = undef(fp16x2VecTy); Value fp16x2Vec1 = undef(fp16x2VecTy); fp16x2Vec0 = insert_element(fp16x2VecTy, fp16x2Vec0, v0, i32_val(0)); fp16x2Vec0 = insert_element(fp16x2VecTy, fp16x2Vec0, v1, i32_val(1)); fp16x2Vec1 = insert_element(fp16x2VecTy, fp16x2Vec1, v2, i32_val(0)); fp16x2Vec1 = insert_element(fp16x2VecTy, fp16x2Vec1, v3, i32_val(1)); fp16x2Vec0 = bitcast(fp16x2Vec0, i32_ty); fp16x2Vec1 = bitcast(fp16x2Vec1, i32_ty); PTXBuilder builder; auto *ptxAsm = "{ \n" ".reg .b32 a<2>, b<2>; \n" "shl.b32 a0, $1, 1; \n" "shl.b32 a1, $2, 1; \n" "lop3.b32 a0, a0, 0x7fff7fff, 0, 0xc0; \n" "lop3.b32 a1, a1, 0x7fff7fff, 0, 0xc0; \n" "add.u32 a0, a0, 0x00800080; \n" "add.u32 a1, a1, 0x00800080; \n" "lop3.b32 b0, $1, 0x80008000, a0, 0xea; \n" "lop3.b32 b1, $2, 0x80008000, a1, 0xea; \n" "prmt.b32 $0, b0, b1, 0x7531; \n" "}"; auto &call = *builder.create(ptxAsm); auto *o = builder.newOperand("=r"); auto *i0 = builder.newOperand(fp16x2Vec0, "r"); auto *i1 = builder.newOperand(fp16x2Vec1, "r"); call({o, i0, i1}, /*onlyAttachMLIRArgs=*/true); auto fp8x4VecTy = vec_ty(i8_ty, 4); auto fp8x4Vec = builder.launch(rewriter, loc, fp8x4VecTy, false); return {extract_element(i8_ty, fp8x4Vec, i32_val(0)), extract_element(i8_ty, fp8x4Vec, i32_val(1)), extract_element(i8_ty, fp8x4Vec, i32_val(2)), extract_element(i8_ty, fp8x4Vec, i32_val(3))}; } static SmallVector convertFp8x4ToBf16x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto ctx = rewriter.getContext(); auto fp8x4VecTy = vec_ty(i8_ty, 4); Value fp8x4Vec = undef(fp8x4VecTy); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v0, i32_val(0)); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v1, i32_val(1)); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v2, i32_val(2)); fp8x4Vec = insert_element(fp8x4VecTy, fp8x4Vec, v3, i32_val(3)); fp8x4Vec = bitcast(fp8x4Vec, i32_ty); PTXBuilder builder; auto *ptxAsm = "{ \n" ".reg .b32 a<2>, sign<2>, nosign<2>, b<2>; \n" "prmt.b32 a0, 0, $2, 0x5040; \n" "prmt.b32 a1, 0, $2, 0x7060; \n" "and.b32 sign0, a0, 0x80008000; \n" "and.b32 sign1, a1, 0x80008000; \n" "and.b32 nosign0, a0, 0x7fff7fff; \n" "and.b32 nosign1, a1, 0x7fff7fff; \n" "shr.b32 nosign0, nosign0, 4; \n" "shr.b32 nosign1, nosign1, 4; \n" "add.u32 nosign0, nosign0, 0x38003800; \n" "add.u32 nosign1, nosign1, 0x38003800; \n" "or.b32 $0, sign0, nosign0; \n" "or.b32 $1, sign1, nosign1; \n" "}"; auto &call = *builder.create(ptxAsm); auto *o0 = builder.newOperand("=r"); auto *o1 = builder.newOperand("=r"); auto *i = builder.newOperand(fp8x4Vec, "r"); call({o0, o1, i}, /* onlyAttachMLIRArgs */ true); auto bf16x2VecTy = vec_ty(i16_ty, 2); auto bf16x2x2StructTy = struct_ty(SmallVector{bf16x2VecTy, bf16x2VecTy}); auto bf16x2x2Struct = builder.launch(rewriter, loc, bf16x2x2StructTy, false); auto bf16x2Vec0 = extract_val(bf16x2VecTy, bf16x2x2Struct, i32_arr_attr(0)); auto bf16x2Vec1 = extract_val(bf16x2VecTy, bf16x2x2Struct, i32_arr_attr(1)); return {extract_element(i16_ty, bf16x2Vec0, i32_val(0)), extract_element(i16_ty, bf16x2Vec0, i32_val(1)), extract_element(i16_ty, bf16x2Vec1, i32_val(0)), extract_element(i16_ty, bf16x2Vec1, i32_val(1))}; } static SmallVector convertBf16x4ToFp8x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto ctx = rewriter.getContext(); auto bf16x2VecTy = vec_ty(i16_ty, 2); Value bf16x2Vec0 = undef(bf16x2VecTy); Value bf16x2Vec1 = 
undef(bf16x2VecTy); bf16x2Vec0 = insert_element(bf16x2VecTy, bf16x2Vec0, v0, i32_val(0)); bf16x2Vec0 = insert_element(bf16x2VecTy, bf16x2Vec0, v1, i32_val(1)); bf16x2Vec1 = insert_element(bf16x2VecTy, bf16x2Vec1, v2, i32_val(0)); bf16x2Vec1 = insert_element(bf16x2VecTy, bf16x2Vec1, v3, i32_val(1)); bf16x2Vec0 = bitcast(bf16x2Vec0, i32_ty); bf16x2Vec1 = bitcast(bf16x2Vec1, i32_ty); PTXBuilder builder; auto *ptxAsm = "{ \n" ".reg .u32 sign, sign<2>, nosign, nosign<2>; \n" ".reg .u32 fp8_min, fp8_max, rn_, zero; \n" "mov.u32 fp8_min, 0x38003800; \n" "mov.u32 fp8_max, 0x3ff03ff0; \n" "mov.u32 rn_, 0x80008; \n" "mov.u32 zero, 0; \n" "and.b32 sign0, $1, 0x80008000; \n" "and.b32 sign1, $2, 0x80008000; \n" "prmt.b32 sign, sign0, sign1, 0x7531; \n" "and.b32 nosign0, $1, 0x7fff7fff; \n" "and.b32 nosign1, $2, 0x7fff7fff; \n" ".reg .u32 nosign_0_<2>, nosign_1_<2>; \n" "and.b32 nosign_0_0, nosign0, 0xffff0000; \n" "max.u32 nosign_0_0, nosign_0_0, 0x38000000; \n" "min.u32 nosign_0_0, nosign_0_0, 0x3ff00000; \n" "and.b32 nosign_0_1, nosign0, 0x0000ffff; \n" "max.u32 nosign_0_1, nosign_0_1, 0x3800; \n" "min.u32 nosign_0_1, nosign_0_1, 0x3ff0; \n" "or.b32 nosign0, nosign_0_0, nosign_0_1; \n" "and.b32 nosign_1_0, nosign1, 0xffff0000; \n" "max.u32 nosign_1_0, nosign_1_0, 0x38000000; \n" "min.u32 nosign_1_0, nosign_1_0, 0x3ff00000; \n" "and.b32 nosign_1_1, nosign1, 0x0000ffff; \n" "max.u32 nosign_1_1, nosign_1_1, 0x3800; \n" "min.u32 nosign_1_1, nosign_1_1, 0x3ff0; \n" "or.b32 nosign1, nosign_1_0, nosign_1_1; \n" "add.u32 nosign0, nosign0, rn_; \n" "add.u32 nosign1, nosign1, rn_; \n" "sub.u32 nosign0, nosign0, 0x38003800; \n" "sub.u32 nosign1, nosign1, 0x38003800; \n" "shr.u32 nosign0, nosign0, 4; \n" "shr.u32 nosign1, nosign1, 4; \n" "prmt.b32 nosign, nosign0, nosign1, 0x6420; \n" "or.b32 $0, nosign, sign; \n" "}"; auto &call = *builder.create(ptxAsm); auto *o = builder.newOperand("=r"); auto *i0 = builder.newOperand(bf16x2Vec0, "r"); auto *i1 = builder.newOperand(bf16x2Vec1, "r"); call({o, i0, i1}, /*onlyAttachMLIRArgs=*/true); auto fp8x4VecTy = vec_ty(i8_ty, 4); auto fp8x4Vec = builder.launch(rewriter, loc, fp8x4VecTy, false); return {extract_element(i8_ty, fp8x4Vec, i32_val(0)), extract_element(i8_ty, fp8x4Vec, i32_val(1)), extract_element(i8_ty, fp8x4Vec, i32_val(2)), extract_element(i8_ty, fp8x4Vec, i32_val(3))}; } static SmallVector convertFp8x4ToFp32x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto fp16Values = convertFp8x4ToFp16x4(loc, rewriter, v0, v1, v2, v3); return {rewriter.create(loc, f32_ty, fp16Values[0]), rewriter.create(loc, f32_ty, fp16Values[1]), rewriter.create(loc, f32_ty, fp16Values[2]), rewriter.create(loc, f32_ty, fp16Values[3])}; } static SmallVector convertFp32x4ToFp8x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto c0 = rewriter.create(loc, f16_ty, v0); auto c1 = rewriter.create(loc, f16_ty, v1); auto c2 = rewriter.create(loc, f16_ty, v2); auto c3 = rewriter.create(loc, f16_ty, v3); return convertFp16x4ToFp8x4(loc, rewriter, c0, c1, c2, c3); } static SmallVector convertFp8x4ToFp64x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto fp16Values = convertFp8x4ToFp16x4(loc, rewriter, v0, v1, v2, v3); return {rewriter.create(loc, f64_ty, fp16Values[0]), rewriter.create(loc, f64_ty, fp16Values[1]), rewriter.create(loc, f64_ty, fp16Values[2]), 
rewriter.create(loc, f64_ty, fp16Values[3])}; } static SmallVector convertFp64x4ToFp8x4(Location loc, ConversionPatternRewriter &rewriter, const Value &v0, const Value &v1, const Value &v2, const Value &v3) { auto c0 = rewriter.create(loc, f16_ty, v0); auto c1 = rewriter.create(loc, f16_ty, v1); auto c2 = rewriter.create(loc, f16_ty, v2); auto c3 = rewriter.create(loc, f16_ty, v3); return convertFp16x4ToFp8x4(loc, rewriter, c0, c1, c2, c3); } static Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter, const Value &v) { PTXBuilder builder; auto &cvt = *builder.create("cvt.rn.f32.bf16"); auto res = builder.newOperand("=r"); auto operand = builder.newOperand(v, "h"); cvt(res, operand); return builder.launch(rewriter, loc, f32_ty, false); } static Value convertFp32ToBf16(Location loc, ConversionPatternRewriter &rewriter, const Value &v) { PTXBuilder builder; auto &cvt = *builder.create("cvt.rn.bf16.f32"); auto res = builder.newOperand("=h"); auto operand = builder.newOperand(v, "r"); cvt(res, operand); // TODO: This is a hack to get the right type. We should be able to invoke // the type converter return builder.launch(rewriter, loc, i16_ty, false); } LogicalResult matchAndRewrite(triton::FpToFpOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto srcTensorType = op.from().getType().cast(); auto dstTensorType = op.result().getType().cast(); auto srcEltType = srcTensorType.getElementType(); auto dstEltType = dstTensorType.getElementType(); auto loc = op->getLoc(); auto elems = getElemsPerThread(dstTensorType); SmallVector resultVals; // Select convertor if (srcEltType.isa() || dstEltType.isa()) { std::function(Location, ConversionPatternRewriter &, const Value &, const Value &, const Value &, const Value &)> convertor; if (srcEltType.isa() && dstEltType.isF16()) { convertor = convertFp8x4ToFp16x4; } else if (srcEltType.isF16() && dstEltType.isa()) { convertor = convertFp16x4ToFp8x4; } else if (srcEltType.isa() && dstEltType.isBF16()) { convertor = convertFp8x4ToBf16x4; } else if (srcEltType.isBF16() && dstEltType.isa()) { convertor = convertBf16x4ToFp8x4; } else if (srcEltType.isa() && dstEltType.isF32()) { convertor = convertFp8x4ToFp32x4; } else if (srcEltType.isF32() && dstEltType.isa()) { convertor = convertFp32x4ToFp8x4; } else if (srcEltType.isa() && dstEltType.isF64()) { convertor = convertFp8x4ToFp64x4; } else if (srcEltType.isF64() && dstEltType.isa()) { convertor = convertFp64x4ToFp8x4; } else { assert(false && "unsupported fp8 casting"); } // Vectorized casting assert(elems % 4 == 0 && "FP8 casting only support tensors with 4-aligned sizes"); auto elements = getElementsFromStruct(loc, adaptor.from(), rewriter); for (size_t i = 0; i < elems; i += 4) { auto converted = convertor(loc, rewriter, elements[i], elements[i + 1], elements[i + 2], elements[i + 3]); resultVals.append(converted); } } else if (srcEltType.isBF16() && dstEltType.isF32()) { resultVals.emplace_back(convertBf16ToFp32(loc, rewriter, adaptor.from())); } else if (srcEltType.isF32() && dstEltType.isBF16()) { resultVals.emplace_back(convertFp32ToBf16(loc, rewriter, adaptor.from())); } else { assert(false && "unsupported type casting"); } assert(resultVals.size() == elems); auto convertedDstTensorType = this->getTypeConverter()->convertType(dstTensorType); auto result = getStructFromElements(loc, resultVals, rewriter, convertedDstTensorType); rewriter.replaceOp(op, result); return success(); } }; template class ElementwiseOpConversionBase : public 
ConvertTritonGPUOpToLLVMPattern { public: using OpAdaptor = typename SourceOp::Adaptor; explicit ElementwiseOpConversionBase(LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) : ConvertTritonGPUOpToLLVMPattern(typeConverter, benefit) {} LogicalResult matchAndRewrite(SourceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto resultTy = op.getType(); Location loc = op->getLoc(); unsigned elems = getElemsPerThread(resultTy); auto resultElementTy = getElementTypeOrSelf(resultTy); Type elemTy = this->getTypeConverter()->convertType(resultElementTy); SmallVector types(elems, elemTy); Type structTy = this->getTypeConverter()->convertType(resultTy); auto *concreteThis = static_cast(this); auto operands = getOperands(rewriter, adaptor, elems, loc); SmallVector resultVals(elems); for (unsigned i = 0; i < elems; ++i) { resultVals[i] = concreteThis->createDestOp(op, adaptor, rewriter, elemTy, operands[i], loc); if (!bool(resultVals[i])) return failure(); } Value view = getStructFromElements(loc, resultVals, rewriter, structTy); rewriter.replaceOp(op, view); return success(); } protected: SmallVector> getOperands(ConversionPatternRewriter &rewriter, OpAdaptor adaptor, const unsigned elems, Location loc) const { SmallVector> operands(elems); for (auto operand : adaptor.getOperands()) { auto sub_operands = getElementsFromStruct(loc, operand, rewriter); for (size_t i = 0; i < elems; ++i) { operands[i].push_back(sub_operands[i]); } } return operands; } }; template struct ElementwiseOpConversion : public ElementwiseOpConversionBase< SourceOp, ElementwiseOpConversion> { using Base = ElementwiseOpConversionBase>; using Base::Base; using OpAdaptor = typename Base::OpAdaptor; explicit ElementwiseOpConversion(LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) : ElementwiseOpConversionBase( typeConverter, benefit) {} // An interface to support variant DestOp builder. DestOp createDestOp(SourceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { return rewriter.create(loc, elemTy, operands, adaptor.getAttributes().getValue()); } }; struct CmpIOpConversion : public ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; // An interface to support variant DestOp builder. LLVM::ICmpOp createDestOp(triton::gpu::CmpIOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { return rewriter.create( loc, elemTy, ArithCmpIPredicateToLLVM(op.predicate()), operands[0], operands[1]); } static LLVM::ICmpPredicate ArithCmpIPredicateToLLVM(arith::CmpIPredicate predicate) { switch (predicate) { #define __PRED_ENUM(item__) \ case arith::CmpIPredicate::item__: \ return LLVM::ICmpPredicate::item__ __PRED_ENUM(eq); __PRED_ENUM(ne); __PRED_ENUM(sgt); __PRED_ENUM(sge); __PRED_ENUM(slt); __PRED_ENUM(sle); __PRED_ENUM(ugt); __PRED_ENUM(uge); __PRED_ENUM(ult); __PRED_ENUM(ule); #undef __PRED_ENUM } llvm_unreachable("Unknown arith::CmpIPredicate"); } }; struct CmpFOpConversion : public ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; // An interface to support variant DestOp builder. 
static LLVM::FCmpOp createDestOp(triton::gpu::CmpFOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) { return rewriter.create( loc, elemTy, ArithCmpFPredicateToLLVM(op.predicate()), operands[0], operands[1]); } static LLVM::FCmpPredicate ArithCmpFPredicateToLLVM(arith::CmpFPredicate predicate) { switch (predicate) { #define __PRED_ENUM(item__, item1__) \ case arith::CmpFPredicate::item__: \ return LLVM::FCmpPredicate::item1__ __PRED_ENUM(OEQ, oeq); __PRED_ENUM(ONE, one); __PRED_ENUM(OGT, ogt); __PRED_ENUM(OGE, oge); __PRED_ENUM(OLT, olt); __PRED_ENUM(OLE, ole); __PRED_ENUM(ORD, ord); __PRED_ENUM(UEQ, ueq); __PRED_ENUM(UGT, ugt); __PRED_ENUM(UGE, uge); __PRED_ENUM(ULT, ult); __PRED_ENUM(ULE, ule); __PRED_ENUM(UNE, une); __PRED_ENUM(UNO, uno); __PRED_ENUM(AlwaysTrue, _true); __PRED_ENUM(AlwaysFalse, _false); #undef __PRED_ENUM } llvm_unreachable("Unknown arith::CmpFPredicate"); } }; struct ExtElemwiseOpConversion : public ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(triton::ExtElemwiseOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { StringRef funcName = op.symbol(); if (funcName.empty()) llvm::errs() << "ExtElemwiseOpConversion"; Type funcType = getFunctionType(elemTy, operands); LLVM::LLVMFuncOp funcOp = appendOrGetFuncOp(rewriter, op, funcName, funcType); return rewriter.create(loc, funcOp, operands).getResult(0); } private: Type getFunctionType(Type resultType, ValueRange operands) const { SmallVector operandTypes(operands.getTypes()); return LLVM::LLVMFunctionType::get(resultType, operandTypes); } LLVM::LLVMFuncOp appendOrGetFuncOp(ConversionPatternRewriter &rewriter, triton::ExtElemwiseOp op, StringRef funcName, Type funcType) const { using LLVM::LLVMFuncOp; auto funcAttr = StringAttr::get(op->getContext(), funcName); Operation *funcOp = SymbolTable::lookupNearestSymbolFrom(op, funcAttr); if (funcOp) return cast(*funcOp); mlir::OpBuilder b(op->getParentOfType()); auto ret = b.create(op->getLoc(), funcName, funcType); ret.getOperation()->setAttr( "libname", StringAttr::get(op->getContext(), op.libname())); ret.getOperation()->setAttr( "libpath", StringAttr::get(op->getContext(), op.libpath())); return ret; } }; struct FDivOpConversion : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::DivFOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { PTXBuilder ptxBuilder; auto &fdiv = *ptxBuilder.create("div"); unsigned bitwidth = elemTy.getIntOrFloatBitWidth(); if (32 == bitwidth) { fdiv.o("full").o("f32"); } else if (64 == bitwidth) { fdiv.o("rn").o("f64"); } else { assert(0 && bitwidth && "not supported"); } auto res = ptxBuilder.newOperand(bitwidth == 32 ? "=r" : "=l"); auto lhs = ptxBuilder.newOperand(operands[0], bitwidth == 32 ? "r" : "l"); auto rhs = ptxBuilder.newOperand(operands[1], bitwidth == 32 ? 
"r" : "l"); fdiv(res, lhs, rhs); Value ret = ptxBuilder.launch(rewriter, loc, elemTy, false); return ret; } }; struct FMulOpConversion : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::MulFOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { auto lhsElemTy = getElementType(op.getLhs()); auto rhsElemTy = getElementType(op.getRhs()); if (lhsElemTy.isBF16() && rhsElemTy.isBF16()) { PTXBuilder builder; auto ptxAsm = " { .reg .b16 c; \n" " mov.b16 c, 0x8000U; \n" // 0.0 " fma.rn.bf16 $0, $1, $2, c; } \n"; auto &fMul = *builder.create(ptxAsm); auto res = builder.newOperand("=h"); auto lhs = builder.newOperand(operands[0], "h"); auto rhs = builder.newOperand(operands[1], "h"); fMul({res, lhs, rhs}, /*onlyAttachMLIRArgs=*/true); return builder.launch(rewriter, loc, i16_ty, false); } else { return rewriter.create(loc, elemTy, operands[0], operands[1]); } } }; struct FAddOpConversion : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::AddFOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { auto lhsElemTy = getElementType(op.getLhs()); auto rhsElemTy = getElementType(op.getRhs()); if (lhsElemTy.isBF16() && rhsElemTy.isBF16()) { PTXBuilder builder; auto ptxAsm = "{ .reg .b16 c; \n" " mov.b16 c, 0x3f80U; \n" // 1.0 " fma.rn.bf16 $0, $1, c, $2; } \n"; auto &fAdd = *builder.create(ptxAsm); auto res = builder.newOperand("=h"); auto lhs = builder.newOperand(operands[0], "h"); auto rhs = builder.newOperand(operands[1], "h"); fAdd({res, lhs, rhs}, /*onlyAttachMLIRArgs=*/true); return builder.launch(rewriter, loc, i16_ty, false); } else { return rewriter.create(loc, elemTy, operands[0], operands[1]); } } }; struct FSubOpConversion : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::SubFOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { auto lhsElemTy = getElementType(op.getLhs()); auto rhsElemTy = getElementType(op.getRhs()); if (lhsElemTy.isBF16() && rhsElemTy.isBF16()) { PTXBuilder builder; auto ptxAsm = " { .reg .b16 c; \n" " mov.b16 c, 0xbf80U; \n" // -1.0 " fma.rn.bf16 $0, $2, c, $1;} \n"; auto &fSub = *builder.create(ptxAsm); auto res = builder.newOperand("=h"); auto lhs = builder.newOperand(operands[0], "h"); auto rhs = builder.newOperand(operands[1], "h"); fSub({res, lhs, rhs}, /*onlyAttachMLIRArgs=*/true); return builder.launch(rewriter, loc, i16_ty, false); } else { return rewriter.create(loc, elemTy, operands[0], operands[1]); } } }; struct SIToFPOpConversion : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::SIToFPOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { auto outElemTy = getElementType(op.getOut()); if (outElemTy.isBF16()) { auto value = rewriter.create(loc, f32_ty, operands[0]); return FpToFpOpConversion::convertFp32ToBf16(loc, rewriter, value); } else { return rewriter.create(loc, elemTy, operands[0]); } } }; struct FPToSIOpConversion : ElementwiseOpConversionBase { using 
Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::FPToSIOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { auto inElemTy = getElementType(op.getIn()); if (inElemTy.isBF16()) { auto value = FpToFpOpConversion::convertBf16ToFp32(loc, rewriter, operands[0]); return rewriter.create(loc, elemTy, value); } else { return rewriter.create(loc, elemTy, operands[0]); } } }; struct ExtFOpConversion : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::ExtFOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { auto inElemTy = getElementType(op.getIn()); if (inElemTy.isBF16()) { auto outElemTy = getElementType(op.getOut()); assert(outElemTy.isF32() && "unsupported conversion"); return FpToFpOpConversion::convertBf16ToFp32(loc, rewriter, operands[0]); } else { return rewriter.create(loc, elemTy, operands[0]); } } }; struct TruncFOpConversion : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::arith::TruncFOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { auto outElemTy = getElementType(op.getOut()); if (outElemTy.isBF16()) { auto inElemTy = getElementType(op.getIn()); assert(inElemTy.isF32() && "unsupported conversion"); return FpToFpOpConversion::convertFp32ToBf16(loc, rewriter, operands[0]); } else { return rewriter.create(loc, elemTy, operands[0]); } } }; struct ExpOpConversionApprox : ElementwiseOpConversionBase { using Base = ElementwiseOpConversionBase; using Base::Base; using Adaptor = typename Base::OpAdaptor; Value createDestOp(mlir::math::ExpOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Type elemTy, ValueRange operands, Location loc) const { // For FP64 input, call __nv_expf for higher-precision calculation if (elemTy.getIntOrFloatBitWidth() == 64) return {}; const double log2e = 1.4426950408889634; Value prod = fmul(f32_ty, operands[0], f32_val(log2e)); PTXBuilder ptxBuilder; auto &exp2 = ptxBuilder.create("ex2")->o("approx").o("f32"); auto output = ptxBuilder.newOperand("=f"); auto input = ptxBuilder.newOperand(prod, "f"); exp2(output, input); return ptxBuilder.launch(rewriter, loc, f32_ty, false); } }; void populateElementwiseOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, PatternBenefit benefit) { #define POPULATE_TERNARY_OP(SRC_OP, DST_OP) \ patterns.add>(typeConverter, benefit); POPULATE_TERNARY_OP(triton::gpu::SelectOp, LLVM::SelectOp) #undef POPULATE_TERNARY_OP #define POPULATE_BINARY_OP(SRC_OP, DST_OP) \ patterns.add>(typeConverter, benefit); POPULATE_BINARY_OP(arith::SubIOp, LLVM::SubOp) // - POPULATE_BINARY_OP(arith::AddIOp, LLVM::AddOp) // + POPULATE_BINARY_OP(arith::MulIOp, LLVM::MulOp) // * POPULATE_BINARY_OP(arith::DivSIOp, LLVM::SDivOp) POPULATE_BINARY_OP(arith::DivUIOp, LLVM::UDivOp) POPULATE_BINARY_OP(arith::RemFOp, LLVM::FRemOp) // % POPULATE_BINARY_OP(arith::RemSIOp, LLVM::SRemOp) POPULATE_BINARY_OP(arith::RemUIOp, LLVM::URemOp) POPULATE_BINARY_OP(arith::AndIOp, LLVM::AndOp) // & POPULATE_BINARY_OP(arith::OrIOp, LLVM::OrOp) // | 
POPULATE_BINARY_OP(arith::XOrIOp, LLVM::XOrOp) // ^ POPULATE_BINARY_OP(arith::ShLIOp, LLVM::ShlOp) // << POPULATE_BINARY_OP(arith::ShRSIOp, LLVM::AShrOp) // >> POPULATE_BINARY_OP(arith::ShRUIOp, LLVM::LShrOp) // >> #undef POPULATE_BINARY_OP #define POPULATE_UNARY_OP(SRC_OP, DST_OP) \ patterns.add>(typeConverter, benefit); POPULATE_UNARY_OP(arith::TruncIOp, LLVM::TruncOp) POPULATE_UNARY_OP(arith::ExtSIOp, LLVM::SExtOp) POPULATE_UNARY_OP(arith::ExtUIOp, LLVM::ZExtOp) POPULATE_UNARY_OP(arith::FPToUIOp, LLVM::FPToUIOp) POPULATE_UNARY_OP(arith::UIToFPOp, LLVM::UIToFPOp) POPULATE_UNARY_OP(math::LogOp, math::LogOp) POPULATE_UNARY_OP(math::CosOp, math::CosOp) POPULATE_UNARY_OP(math::SinOp, math::SinOp) POPULATE_UNARY_OP(math::SqrtOp, math::SqrtOp) POPULATE_UNARY_OP(math::ExpOp, math::ExpOp) POPULATE_UNARY_OP(triton::BitcastOp, LLVM::BitcastOp) POPULATE_UNARY_OP(triton::IntToPtrOp, LLVM::IntToPtrOp) POPULATE_UNARY_OP(triton::PtrToIntOp, LLVM::PtrToIntOp) #undef POPULATE_UNARY_OP patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); // ExpOpConversionApprox will try using ex2.approx if the input type is FP32. // For FP64 input type, ExpOpConversionApprox will return failure and // ElementwiseOpConversion defined below will call // __nv_expf for higher-precision calculation patterns.add(typeConverter, benefit); } triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.h000066400000000000000000000012001440023377100244750ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_ELEMENTWISE_OP_H #define TRITON_CONVERSION_TRITONGPU_TO_ELEMENTWISE_OP_H #include "TritonGPUToLLVMBase.h" using namespace mlir; using namespace mlir::triton; void populateElementwiseOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, PatternBenefit benefit); #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/LoadStoreOpToLLVM.cpp000066400000000000000000001007431440023377100244570ustar00rootroot00000000000000#include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" #include "ConvertLayoutOpToLLVM.h" #include "LoadStoreOpToLLVM.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::getElementsFromStruct; using ::mlir::LLVM::getSharedMemoryObjectFromStruct; using ::mlir::LLVM::getStructFromElements; using ::mlir::triton::gpu::getElemsPerThread; using ::mlir::triton::gpu::SharedEncodingAttr; // Contains some helper functions for both Load and Store conversions. struct LoadStoreConversionBase { explicit LoadStoreConversionBase(AxisInfoAnalysis &axisAnalysisPass) : axisAnalysisPass(axisAnalysisPass) {} // Get corresponding LLVM element values of \param value. 
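// If the converted value is an LLVM struct (one field per element owned by
// the current thread), it is unpacked into individual scalars; a plain
// non-struct value is simply returned as a single-element vector.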
static SmallVector getLLVMElems(Value value, Value llValue, ConversionPatternRewriter &rewriter, Location loc) { if (!value) return {}; if (!llValue.getType().isa()) return {llValue}; // Here, we assume that all inputs should have a blockedLayout auto valueVals = getElementsFromStruct(loc, llValue, rewriter); return valueVals; } unsigned getContiguity(Value ptr) const { auto tensorTy = ptr.getType().dyn_cast(); if (!tensorTy) return 1; return axisAnalysisPass.getPtrContiguity(ptr); } unsigned getVectorSize(Value ptr) const { auto tensorTy = ptr.getType().dyn_cast(); if (!tensorTy) return 1; auto contiguity = getContiguity(ptr); auto pointeeBitWidth = getPointeeBitWidth(tensorTy); // The maximum vector size is 128 bits on NVIDIA GPUs. return std::min(128 / pointeeBitWidth, contiguity); } unsigned getMaskAlignment(Value mask) const { return axisAnalysisPass.getMaskAlignment(mask); } protected: AxisInfoAnalysis &axisAnalysisPass; }; struct LoadOpConversion : public ConvertTritonGPUOpToLLVMPattern, public LoadStoreConversionBase { using ConvertTritonGPUOpToLLVMPattern< triton::LoadOp>::ConvertTritonGPUOpToLLVMPattern; LoadOpConversion(LLVMTypeConverter &converter, AxisInfoAnalysis &axisAnalysisPass, PatternBenefit benefit) : ConvertTritonGPUOpToLLVMPattern(converter, benefit), LoadStoreConversionBase(axisAnalysisPass) {} LogicalResult matchAndRewrite(triton::LoadOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); // original values Value ptr = op.ptr(); Value mask = op.mask(); Value other = op.other(); // adaptor values Value llPtr = adaptor.ptr(); Value llMask = adaptor.mask(); Value llOther = adaptor.other(); // Determine the vectorization size Type valueTy = op.getResult().getType(); Type valueElemTy = typeConverter->convertType(getElementTypeOrSelf(valueTy)); unsigned vec = getVectorSize(ptr); unsigned numElems = getElemsPerThread(ptr.getType()); if (llMask) vec = std::min(vec, getMaskAlignment(mask)); // Get the LLVM values for pointers auto ptrElems = getLLVMElems(ptr, llPtr, rewriter, loc); assert(ptrElems.size() == numElems); // Get the LLVM values for mask SmallVector maskElems; if (llMask) { maskElems = getLLVMElems(mask, llMask, rewriter, loc); assert(maskElems.size() == numElems); } // Get the LLVM values for `other` // TODO: (goostavz) handle when other is const but not splat, which // should be rarely seen bool otherIsSplatConstInt = false; DenseElementsAttr constAttr; int64_t splatVal = 0; if (other && valueElemTy.isa() && matchPattern(other, m_Constant(&constAttr)) && constAttr.isSplat()) { otherIsSplatConstInt = true; splatVal = constAttr.getSplatValue().getSExtValue(); } auto otherElems = getLLVMElems(other, llOther, rewriter, loc); // vectorized iteration through all the pointer/mask/other elements const int valueElemNbits = std::max(8u, valueElemTy.getIntOrFloatBitWidth()); const int numVecs = numElems / vec; SmallVector loadedVals; for (size_t vecStart = 0; vecStart < numElems; vecStart += vec) { // TODO: optimization when ptr is GEP with constant offset size_t in_off = 0; const size_t maxWordWidth = std::max(32, valueElemNbits); const size_t totalWidth = valueElemNbits * vec; const size_t width = std::min(totalWidth, maxWordWidth); const size_t nWords = std::max(1, totalWidth / width); const size_t wordNElems = width / valueElemNbits; assert(wordNElems * nWords * numVecs == numElems); // TODO(Superjomn) Add cache policy fields to StoreOp. // TODO(Superjomn) Deal with cache policy here. 
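// Each iteration of the loop below loads `vec` consecutive elements with one
// predicated instruction. The elements are grouped into `nWords` words no
// wider than a 32-bit register (64-bit only for 64-bit element types), so a
// 4 x f32 vector load is emitted roughly as
//   @p ld.global.v4.b32 { r0, r1, r2, r3 }, [ ptr + 0 ];
// (illustrative only; the actual operand numbering is assigned by
// PTXBuilder). When the predicate is false, the `other` values are moved into
// the destination registers instead via predicated mov instructions.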
const bool hasL2EvictPolicy = false; PTXBuilder ptxBuilder; Value pred = mask ? maskElems[vecStart] : int_val(1, 1); const std::string readConstraint = (width == 64) ? "l" : ((width == 32) ? "r" : "c"); const std::string writeConstraint = (width == 64) ? "=l" : ((width == 32) ? "=r" : "=c"); // prepare asm operands auto *dstsOpr = ptxBuilder.newListOperand(); for (size_t wordIdx = 0; wordIdx < nWords; ++wordIdx) { auto *opr = ptxBuilder.newOperand(writeConstraint); // =r operations dstsOpr->listAppend(opr); } auto *addrOpr = ptxBuilder.newAddrOperand(ptrElems[vecStart], "l", in_off); // Define the instruction opcode auto &ld = ptxBuilder.create<>("ld") ->o("volatile", op.isVolatile()) .global() .o("ca", op.cache() == triton::CacheModifier::CA) .o("cg", op.cache() == triton::CacheModifier::CG) .o("L1::evict_first", op.evict() == triton::EvictionPolicy::EVICT_FIRST) .o("L1::evict_last", op.evict() == triton::EvictionPolicy::EVICT_LAST) .o("L1::cache_hint", hasL2EvictPolicy) .v(nWords) .b(width); PTXBuilder::Operand *evictOpr{}; // Here lack a mlir::Value to bind to this operation, so disabled. // if (has_l2_evict_policy) // evictOpr = ptxBuilder.newOperand(l2Evict, "l"); if (!evictOpr) ld(dstsOpr, addrOpr).predicate(pred, "b"); else ld(dstsOpr, addrOpr, evictOpr).predicate(pred, "b"); if (other) { for (size_t ii = 0; ii < nWords; ++ii) { // PTX doesn't support mov.u8, so we need to use mov.u16 auto movWidth = width < 16 ? 16 : width; PTXInstr &mov = ptxBuilder.create<>("mov")->o("u" + std::to_string(movWidth)); size_t size = width / valueElemNbits; auto vecTy = LLVM::getFixedVectorType(valueElemTy, size); Value v = undef(vecTy); for (size_t s = 0; s < size; ++s) { Value falseVal = otherElems[vecStart + ii * size + s]; Value sVal = createIndexAttrConstant( rewriter, loc, this->getTypeConverter()->getIndexType(), s); v = insert_element(vecTy, v, falseVal, sVal); } v = bitcast(v, IntegerType::get(getContext(), width)); PTXInstr::Operand *opr{}; if (otherIsSplatConstInt) opr = ptxBuilder.newConstantOperand(splatVal); else opr = ptxBuilder.newOperand(v, readConstraint); mov(dstsOpr->listGet(ii), opr).predicateNot(pred, "b"); } } // Create inline ASM signature SmallVector retTys(nWords, IntegerType::get(getContext(), width)); Type retTy = retTys.size() > 1 ? 
LLVM::LLVMStructType::getLiteral(getContext(), retTys) : retTys[0]; // TODO: if (has_l2_evict_policy) // auto asmDialectAttr = // LLVM::AsmDialectAttr::get(rewriter.getContext(), // LLVM::AsmDialect::AD_ATT); Value ret = ptxBuilder.launch(rewriter, loc, retTy); // Extract and store return values SmallVector rets; for (unsigned int ii = 0; ii < nWords; ++ii) { Value curr; if (retTy.isa()) { curr = extract_val(IntegerType::get(getContext(), width), ret, i64_arr_attr(ii)); } else { curr = ret; } curr = bitcast(curr, LLVM::getFixedVectorType(valueElemTy, width / valueElemNbits)); rets.push_back(curr); } int tmp = width / valueElemNbits; for (size_t ii = 0; ii < vec; ++ii) { Value vecIdx = createIndexAttrConstant( rewriter, loc, this->getTypeConverter()->getIndexType(), ii % tmp); Value loaded = extract_element(valueElemTy, rets[ii / tmp], vecIdx); loadedVals.push_back(loaded); } } // end vec Type llvmResultStructTy = getTypeConverter()->convertType(valueTy); Value resultStruct = getStructFromElements(loc, loadedVals, rewriter, llvmResultStructTy); rewriter.replaceOp(op, {resultStruct}); return success(); } }; struct StoreOpConversion : public ConvertTritonGPUOpToLLVMPattern, public LoadStoreConversionBase { using ConvertTritonGPUOpToLLVMPattern< triton::StoreOp>::ConvertTritonGPUOpToLLVMPattern; StoreOpConversion(LLVMTypeConverter &converter, AxisInfoAnalysis &axisAnalysisPass, PatternBenefit benefit) : ConvertTritonGPUOpToLLVMPattern(converter, benefit), LoadStoreConversionBase(axisAnalysisPass) {} LogicalResult matchAndRewrite(triton::StoreOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value ptr = op.ptr(); Value mask = op.mask(); Value value = op.value(); Value llPtr = adaptor.ptr(); Value llMask = adaptor.mask(); Value llValue = adaptor.value(); auto loc = op->getLoc(); MLIRContext *ctx = rewriter.getContext(); auto valueTy = value.getType(); Type valueElemTy = typeConverter->convertType(getElementTypeOrSelf(valueTy)); unsigned vec = getVectorSize(ptr); unsigned numElems = getElemsPerThread(ptr.getType()); auto ptrElems = getLLVMElems(ptr, llPtr, rewriter, loc); auto valueElems = getLLVMElems(value, llValue, rewriter, loc); assert(ptrElems.size() == valueElems.size()); // Determine the vectorization size SmallVector maskElems; if (llMask) { maskElems = getLLVMElems(mask, llMask, rewriter, loc); assert(valueElems.size() == maskElems.size()); unsigned maskAlign = getMaskAlignment(mask); vec = std::min(vec, maskAlign); } const size_t dtsize = std::max(1, valueElemTy.getIntOrFloatBitWidth() / 8); const size_t valueElemNbits = dtsize * 8; const int numVecs = numElems / vec; for (size_t vecStart = 0; vecStart < numElems; vecStart += vec) { // TODO: optimization when ptr is AddPtr with constant offset size_t in_off = 0; const size_t maxWordWidth = std::max(32, valueElemNbits); const size_t totalWidth = valueElemNbits * vec; const size_t width = std::min(totalWidth, maxWordWidth); const size_t nWords = std::max(1, totalWidth / width); const size_t wordNElems = width / valueElemNbits; assert(wordNElems * nWords * numVecs == numElems); // TODO(Superjomn) Add cache policy fields to StoreOp. // TODO(Superjomn) Deal with cache policy here. 
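// Mirror of the load path: the `vec` elements covered by this iteration are
// packed into `nWords` integer words (i1 values are first widened to i8),
// and a single predicated st.global.vN.bM instruction is emitted for the
// whole group.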
Type valArgTy = IntegerType::get(ctx, width); auto wordTy = vec_ty(valueElemTy, wordNElems); SmallVector> asmArgs; for (size_t wordIdx = 0; wordIdx < nWords; ++wordIdx) { // llWord is a width-len composition Value llWord = undef(wordTy); // Insert each value element to the composition for (size_t elemIdx = 0; elemIdx < wordNElems; ++elemIdx) { const size_t elemOffset = vecStart + wordIdx * wordNElems + elemIdx; assert(elemOffset < valueElems.size()); Value elem = valueElems[elemOffset]; if (elem.getType().isInteger(1)) elem = rewriter.create(loc, type::i8Ty(ctx), elem); elem = bitcast(elem, valueElemTy); Type u32Ty = typeConverter->convertType(type::u32Ty(ctx)); llWord = insert_element(wordTy, llWord, elem, i32_val(elemIdx)); } llWord = bitcast(llWord, valArgTy); std::string constraint = (width == 64) ? "l" : ((width == 32) ? "r" : "c"); asmArgs.emplace_back(llWord, constraint); } // Prepare the PTX inline asm. PTXBuilder ptxBuilder; auto *asmArgList = ptxBuilder.newListOperand(asmArgs); Value maskVal = llMask ? maskElems[vecStart] : int_val(1, 1); auto *asmAddr = ptxBuilder.newAddrOperand(ptrElems[vecStart], "l", in_off); auto &ptxStoreInstr = ptxBuilder.create<>("st")->global().v(nWords).b(width); ptxStoreInstr(asmAddr, asmArgList).predicate(maskVal, "b"); Type boolTy = getTypeConverter()->convertType(rewriter.getIntegerType(1)); llvm::SmallVector argTys({boolTy, ptr.getType()}); argTys.insert(argTys.end(), nWords, valArgTy); auto asmReturnTy = void_ty(ctx); ptxBuilder.launch(rewriter, loc, asmReturnTy); } rewriter.eraseOp(op); return success(); } }; struct AtomicCASOpConversion : public ConvertTritonGPUOpToLLVMPattern, public LoadStoreConversionBase { using ConvertTritonGPUOpToLLVMPattern< triton::AtomicCASOp>::ConvertTritonGPUOpToLLVMPattern; AtomicCASOpConversion(LLVMTypeConverter &converter, const Allocation *allocation, Value smem, AxisInfoAnalysis &axisAnalysisPass, PatternBenefit benefit) : ConvertTritonGPUOpToLLVMPattern( converter, allocation, smem, benefit), LoadStoreConversionBase(axisAnalysisPass) {} LogicalResult matchAndRewrite(triton::AtomicCASOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); MLIRContext *ctx = rewriter.getContext(); Value ptr = op.ptr(); Value llPtr = adaptor.ptr(); Value llCmp = adaptor.cmp(); Value llVal = adaptor.val(); auto ptrElements = getElementsFromStruct(loc, llPtr, rewriter); auto cmpElements = getElementsFromStruct(loc, llCmp, rewriter); auto valElements = getElementsFromStruct(loc, llVal, rewriter); auto valueTy = op.getResult().getType().dyn_cast(); Type valueElemTy = valueTy ? 
getTypeConverter()->convertType(valueTy.getElementType()) : op.getResult().getType(); auto tid = tid_val(); Value pred = icmp_eq(tid, i32_val(0)); PTXBuilder ptxBuilderMemfence; auto memfence = ptxBuilderMemfence.create("membar")->o("gl"); memfence(); auto ASMReturnTy = void_ty(ctx); ptxBuilderMemfence.launch(rewriter, loc, ASMReturnTy); Value atomPtr = getSharedMemoryBase(loc, rewriter, op.getOperation()); atomPtr = bitcast(atomPtr, ptr_ty(valueElemTy, 3)); Value casPtr = ptrElements[0]; Value casCmp = cmpElements[0]; Value casVal = valElements[0]; PTXBuilder ptxBuilderAtomicCAS; auto *dstOpr = ptxBuilderAtomicCAS.newOperand("=r"); auto *ptrOpr = ptxBuilderAtomicCAS.newAddrOperand(casPtr, "l"); auto *cmpOpr = ptxBuilderAtomicCAS.newOperand(casCmp, "r"); auto *valOpr = ptxBuilderAtomicCAS.newOperand(casVal, "r"); auto &atom = *ptxBuilderAtomicCAS.create("atom"); atom.global().o("cas").o("b32"); atom(dstOpr, ptrOpr, cmpOpr, valOpr).predicate(pred); auto old = ptxBuilderAtomicCAS.launch(rewriter, loc, valueElemTy); barrier(); PTXBuilder ptxBuilderStore; auto *dstOprStore = ptxBuilderStore.newAddrOperand(atomPtr, "l"); auto *valOprStore = ptxBuilderStore.newOperand(old, "r"); auto &st = *ptxBuilderStore.create("st"); st.shared().o("b32"); st(dstOprStore, valOprStore).predicate(pred); ptxBuilderStore.launch(rewriter, loc, ASMReturnTy); ptxBuilderMemfence.launch(rewriter, loc, ASMReturnTy); barrier(); Value ret = load(atomPtr); barrier(); rewriter.replaceOp(op, {ret}); return success(); } }; struct AtomicRMWOpConversion : public ConvertTritonGPUOpToLLVMPattern, public LoadStoreConversionBase { using ConvertTritonGPUOpToLLVMPattern< triton::AtomicRMWOp>::ConvertTritonGPUOpToLLVMPattern; AtomicRMWOpConversion(LLVMTypeConverter &converter, const Allocation *allocation, Value smem, AxisInfoAnalysis &axisAnalysisPass, PatternBenefit benefit) : ConvertTritonGPUOpToLLVMPattern( converter, allocation, smem, benefit), LoadStoreConversionBase(axisAnalysisPass) {} LogicalResult matchAndRewrite(triton::AtomicRMWOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); MLIRContext *ctx = rewriter.getContext(); auto atomicRmwAttr = op.atomic_rmw_op(); Value ptr = op.ptr(); Value val = op.val(); Value llPtr = adaptor.ptr(); Value llVal = adaptor.val(); Value llMask = adaptor.mask(); auto valElements = getElementsFromStruct(loc, llVal, rewriter); auto ptrElements = getElementsFromStruct(loc, llPtr, rewriter); auto maskElements = getElementsFromStruct(loc, llMask, rewriter); auto valueTy = op.getResult().getType().dyn_cast(); Type valueElemTy = valueTy ? getTypeConverter()->convertType(valueTy.getElementType()) : op.getResult().getType(); const size_t valueElemNbits = valueElemTy.getIntOrFloatBitWidth(); auto elemsPerThread = getElemsPerThread(val.getType()); // vec = 1 for scalar auto vec = getVectorSize(ptr); Value mask = int_val(1, 1); auto tid = tid_val(); // tensor if (valueTy) { auto valTy = val.getType().cast(); vec = std::min(vec, valTy.getElementType().isF16() ? 
2 : 1); // mask auto shape = valueTy.getShape(); auto numElements = product(shape); mask = and_(mask, icmp_slt(mul(tid, i32_val(elemsPerThread)), i32_val(numElements))); } auto vecTy = vec_ty(valueElemTy, vec); SmallVector resultVals(elemsPerThread); for (size_t i = 0; i < elemsPerThread; i += vec) { Value rmwVal = undef(vecTy); for (int ii = 0; ii < vec; ++ii) { Value iiVal = createIndexAttrConstant( rewriter, loc, getTypeConverter()->getIndexType(), ii); rmwVal = insert_element(vecTy, rmwVal, valElements[i + ii], iiVal); } Value rmwPtr = ptrElements[i]; Value rmwMask = maskElements[i]; rmwMask = and_(rmwMask, mask); std::string sTy; PTXBuilder ptxBuilderAtomicRMW; std::string tyId = valueElemNbits * vec == 64 ? "l" : (valueElemNbits * vec == 32 ? "r" : "h"); auto *dstOpr = ptxBuilderAtomicRMW.newOperand("=" + tyId); auto *ptrOpr = ptxBuilderAtomicRMW.newAddrOperand(rmwPtr, "l"); auto *valOpr = ptxBuilderAtomicRMW.newOperand(rmwVal, tyId); auto &atom = ptxBuilderAtomicRMW.create<>("atom")->global().o("gpu"); auto rmwOp = stringifyRMWOp(atomicRmwAttr).str(); auto sBits = std::to_string(valueElemNbits); switch (atomicRmwAttr) { case RMWOp::AND: sTy = "b" + sBits; break; case RMWOp::OR: sTy = "b" + sBits; break; case RMWOp::XOR: sTy = "b" + sBits; break; case RMWOp::ADD: sTy = "s" + sBits; break; case RMWOp::FADD: rmwOp = "add"; rmwOp += (valueElemNbits == 16 ? ".noftz" : ""); sTy = "f" + sBits; sTy += (vec == 2 && valueElemNbits == 16) ? "x2" : ""; break; case RMWOp::MAX: sTy = "s" + sBits; break; case RMWOp::MIN: sTy = "s" + sBits; break; case RMWOp::UMAX: rmwOp = "max"; sTy = "u" + sBits; break; case RMWOp::UMIN: rmwOp = "min"; sTy = "u" + sBits; break; case RMWOp::XCHG: sTy = "b" + sBits; break; default: return failure(); } atom.o(rmwOp).o(sTy); if (valueTy) { atom(dstOpr, ptrOpr, valOpr).predicate(rmwMask); auto retType = vec == 1 ? valueElemTy : vecTy; auto ret = ptxBuilderAtomicRMW.launch(rewriter, loc, retType); for (int ii = 0; ii < vec; ++ii) { resultVals[i + ii] = vec == 1 ? 
ret : extract_element(valueElemTy, ret, idx_val(ii)); } } else { PTXBuilder ptxBuilderMemfence; auto memfenc = ptxBuilderMemfence.create("membar")->o("gl"); memfenc(); auto ASMReturnTy = void_ty(ctx); ptxBuilderMemfence.launch(rewriter, loc, ASMReturnTy); rmwMask = and_(rmwMask, icmp_eq(tid, i32_val(0))); atom(dstOpr, ptrOpr, valOpr).predicate(rmwMask); auto old = ptxBuilderAtomicRMW.launch(rewriter, loc, valueElemTy); Value atomPtr = getSharedMemoryBase(loc, rewriter, op.getOperation()); atomPtr = bitcast(atomPtr, ptr_ty(valueElemTy, 3)); store(old, atomPtr); barrier(); Value ret = load(atomPtr); barrier(); rewriter.replaceOp(op, {ret}); } } if (valueTy) { Type structTy = getTypeConverter()->convertType(valueTy); Value resultStruct = getStructFromElements(loc, resultVals, rewriter, structTy); rewriter.replaceOp(op, {resultStruct}); } return success(); } }; struct InsertSliceOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< tensor::InsertSliceOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(tensor::InsertSliceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // %dst = insert_slice %src into %dst[%offsets] Location loc = op->getLoc(); Value dst = op.dest(); Value src = op.source(); Value res = op.result(); assert(allocation->getBufferId(res) == Allocation::InvalidBufferId && "Only support in-place insert_slice for now"); auto srcTy = src.getType().dyn_cast(); auto srcLayout = srcTy.getEncoding().dyn_cast(); auto srcShape = srcTy.getShape(); assert(srcLayout && "Unexpected srcLayout in InsertSliceOpConversion"); auto dstTy = dst.getType().dyn_cast(); auto dstLayout = dstTy.getEncoding().dyn_cast(); auto llDst = adaptor.dest(); assert(dstLayout && "Unexpected dstLayout in InsertSliceOpConversion"); assert(op.hasUnitStride() && "Only unit stride supported by InsertSliceOpConversion"); // newBase = base + offset // Triton support either static and dynamic offsets auto smemObj = getSharedMemoryObjectFromStruct(loc, llDst, rewriter); SmallVector offsets; SmallVector srcStrides; auto mixedOffsets = op.getMixedOffsets(); for (auto i = 0; i < mixedOffsets.size(); ++i) { if (op.isDynamicOffset(i)) { offsets.emplace_back(adaptor.offsets()[i]); } else { offsets.emplace_back(i32_val(op.getStaticOffset(i))); } // Like insert_slice_async, we only support slice from one dimension, // which has a slice size of 1 if (op.getStaticSize(i) != 1) { srcStrides.emplace_back(smemObj.strides[i]); } } // Compute the offset based on the original strides of the shared memory // object auto offset = dot(rewriter, loc, offsets, smemObj.strides); auto elemTy = getTypeConverter()->convertType(dstTy.getElementType()); auto elemPtrTy = ptr_ty(elemTy, 3); auto smemBase = gep(elemPtrTy, smemObj.base, offset); auto llSrc = adaptor.source(); auto srcIndices = emitIndices(loc, rewriter, srcLayout, srcShape); storeDistributedToShared(src, llSrc, srcStrides, srcIndices, dst, smemBase, elemTy, loc, rewriter); // Barrier is not necessary. // The membar pass knows that it writes to shared memory and will handle it // properly. 
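// The op is rewritten in place: the result aliases the destination
// shared-memory object, so the converted op is simply replaced with llDst.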
rewriter.replaceOp(op, llDst); return success(); } }; struct InsertSliceAsyncOpConversion : public ConvertTritonGPUOpToLLVMPattern, public LoadStoreConversionBase { using ConvertTritonGPUOpToLLVMPattern< triton::gpu::InsertSliceAsyncOp>::ConvertTritonGPUOpToLLVMPattern; InsertSliceAsyncOpConversion( LLVMTypeConverter &converter, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, AxisInfoAnalysis &axisAnalysisPass, PatternBenefit benefit) : ConvertTritonGPUOpToLLVMPattern( converter, allocation, smem, indexCacheInfo, benefit), LoadStoreConversionBase(axisAnalysisPass) {} LogicalResult matchAndRewrite(triton::gpu::InsertSliceAsyncOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // insert_slice_async %src, %dst, %index, %mask, %other auto loc = op.getLoc(); Value src = op.src(); Value dst = op.dst(); Value res = op.result(); Value mask = op.mask(); Value other = op.other(); assert(allocation->getBufferId(res) == Allocation::InvalidBufferId && "Only support in-place insert_slice_async for now"); auto srcTy = src.getType().cast(); auto resTy = dst.getType().cast(); auto resElemTy = getTypeConverter()->convertType(resTy.getElementType()); auto srcBlockedLayout = srcTy.getEncoding().cast(); auto resSharedLayout = resTy.getEncoding().cast(); auto srcShape = srcTy.getShape(); assert(srcShape.size() == 2 && "insert_slice_async: Unexpected rank of %src"); Value llDst = adaptor.dst(); Value llSrc = adaptor.src(); Value llMask = adaptor.mask(); Value llOther = adaptor.other(); Value llIndex = adaptor.index(); // %src auto srcElems = getLLVMElems(src, llSrc, rewriter, loc); // %dst auto dstTy = dst.getType().cast(); auto dstShape = dstTy.getShape(); auto smemObj = getSharedMemoryObjectFromStruct(loc, llDst, rewriter); auto axis = op->getAttrOfType("axis").getInt(); SmallVector offsetVals; SmallVector srcStrides; for (auto i = 0; i < dstShape.size(); ++i) { if (i == axis) { offsetVals.emplace_back(llIndex); } else { offsetVals.emplace_back(i32_val(0)); srcStrides.emplace_back(smemObj.strides[i]); } } // Compute the offset based on the original dimensions of the shared // memory object auto dstOffset = dot(rewriter, loc, offsetVals, smemObj.strides); auto dstPtrTy = ptr_ty(resElemTy, 3); Value dstPtrBase = gep(dstPtrTy, smemObj.base, dstOffset); // %mask SmallVector maskElems; if (llMask) { maskElems = getLLVMElems(mask, llMask, rewriter, loc); assert(srcElems.size() == maskElems.size()); } // %other SmallVector otherElems; if (llOther) { // FIXME(Keren): always assume other is 0 for now // It's not necessary for now because the pipeline pass will skip // generating insert_slice_async if the load op has any "other" tensor. // assert(false && "insert_slice_async: Other value not supported yet"); otherElems = getLLVMElems(other, llOther, rewriter, loc); assert(srcElems.size() == otherElems.size()); } // We don't use getVec() here because we are copying from memory to memory. // If contiguity > vector size, we can have one pointer maintaining the // start of the vector and the other pointer moving to the next vector. 
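// Vectorization of the async copy: `inVec` is the contiguity of the source
// pointer and `outVec` the vector width of the swizzled shared layout; each
// iteration of the loop below copies min(inVec, outVec) elements, split into
// cp.async transfers of at most 16 bytes each.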
unsigned inVec = getContiguity(src); unsigned outVec = resSharedLayout.getVec(); unsigned minVec = std::min(outVec, inVec); unsigned numElems = getElemsPerThread(srcTy); unsigned perPhase = resSharedLayout.getPerPhase(); unsigned maxPhase = resSharedLayout.getMaxPhase(); auto sizePerThread = srcBlockedLayout.getSizePerThread(); auto threadsPerCTA = getThreadsPerCTA(srcBlockedLayout); auto inOrder = srcBlockedLayout.getOrder(); DenseMap sharedPtrs = getSwizzledSharedPtrs(loc, inVec, srcTy, resSharedLayout, resElemTy, smemObj, rewriter, offsetVals, srcStrides); // If perPhase * maxPhase > threadsPerCTA, we will have elements // that share the same tile indices. The index calculation will // be cached. auto numSwizzleRows = std::max( (perPhase * maxPhase) / threadsPerCTA[inOrder[1]], 1); // A sharedLayout encoding has a "vec" parameter. // On the column dimension, if inVec > outVec, it means we have to divide // single vector read into multiple ones auto numVecCols = std::max(inVec / outVec, 1); auto srcIndices = emitIndices(loc, rewriter, srcBlockedLayout, srcShape); for (unsigned elemIdx = 0; elemIdx < numElems; elemIdx += minVec) { // 16 * 8 = 128bits auto maxBitWidth = std::max(128, resElemTy.getIntOrFloatBitWidth()); auto vecBitWidth = resElemTy.getIntOrFloatBitWidth() * minVec; auto bitWidth = std::min(maxBitWidth, vecBitWidth); auto numWords = vecBitWidth / bitWidth; auto numWordElems = bitWidth / resElemTy.getIntOrFloatBitWidth(); // Tune CG and CA here. auto byteWidth = bitWidth / 8; CacheModifier srcCacheModifier = byteWidth == 16 ? CacheModifier::CG : CacheModifier::CA; assert(byteWidth == 16 || byteWidth == 8 || byteWidth == 4); auto resByteWidth = resElemTy.getIntOrFloatBitWidth() / 8; Value basePtr = sharedPtrs[elemIdx]; for (size_t wordIdx = 0; wordIdx < numWords; ++wordIdx) { PTXBuilder ptxBuilder; auto wordElemIdx = wordIdx * numWordElems; auto ©AsyncOp = *ptxBuilder.create(srcCacheModifier); auto *dstOperand = ptxBuilder.newAddrOperand(basePtr, "r", wordElemIdx * resByteWidth); auto *srcOperand = ptxBuilder.newAddrOperand(srcElems[elemIdx + wordElemIdx], "l"); auto *copySize = ptxBuilder.newConstantOperand(byteWidth); auto *srcSize = copySize; if (op.mask()) { // We don't use predicate in this case, setting src-size to 0 // if there's any mask. cp.async will automatically fill the // remaining slots with 0 if cp-size > src-size. // XXX(Keren): Always assume other = 0 for now. 
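// Illustrative example (not emitted verbatim): with byteWidth == 16 and a
// false mask, the generated instruction behaves like
//   cp.async.cg.shared.global [dst], [src], 16, 0;
// i.e. a src-size of 0 makes cp.async zero-fill the whole 16-byte slot.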
auto selectOp = select(maskElems[elemIdx + wordElemIdx], i32_val(byteWidth), i32_val(0)); srcSize = ptxBuilder.newOperand(selectOp, "r"); } copyAsyncOp(dstOperand, srcOperand, copySize, srcSize); ptxBuilder.launch(rewriter, loc, void_ty(getContext())); } } rewriter.replaceOp(op, llDst); return success(); } }; void populateLoadStoreOpToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit) { patterns.add(typeConverter, axisInfoAnalysis, benefit); patterns.add(typeConverter, axisInfoAnalysis, benefit); patterns.add(typeConverter, allocation, smem, axisInfoAnalysis, benefit); patterns.add(typeConverter, allocation, smem, axisInfoAnalysis, benefit); patterns.add(typeConverter, allocation, smem, indexCacheInfo, benefit); patterns.add(typeConverter, allocation, smem, indexCacheInfo, axisInfoAnalysis, benefit); } triton-2.0.0/lib/Conversion/TritonGPUToLLVM/LoadStoreOpToLLVM.h000066400000000000000000000010231440023377100241130ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_LOAD_STORE_OP_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_LOAD_STORE_OP_H #include "TritonGPUToLLVMBase.h" using namespace mlir; using namespace mlir::triton; void populateLoadStoreOpToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit); #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/PTXAsmFormat.cpp000066400000000000000000000136731440023377100235600ustar00rootroot00000000000000#include "triton/Conversion/TritonGPUToLLVM/PTXAsmFormat.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Transforms/DialectConversion.h" #include "llvm/Support/raw_ostream.h" // TODO(Superjomn): unify to llvm::raw_string_ostream #include namespace mlir { namespace triton { // TODO(Superjomn) Move to a global utility file? 
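// Joins the given strings with `delimiter`, e.g.
//   strJoin({"ld", "global", "b32"}, ".") == "ld.global.b32"
// which is how instruction keywords are assembled elsewhere in this file.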
std::string strJoin(llvm::ArrayRef strs, llvm::StringRef delimiter) { std::string osStr; llvm::raw_string_ostream os(osStr); for (size_t i = 0; !strs.empty() && i < strs.size() - 1; ++i) os << strs[i] << delimiter; if (!strs.empty()) os << strs.back(); os.flush(); return osStr; } PTXInstr::Operand * PTXBuilder::newOperand(mlir::Value value, StringRef constraint, std::function formatter) { argArchive.emplace_back(std::make_unique(value, constraint)); auto *opr = argArchive.back().get(); opr->repr = formatter; opr->idx = oprCounter++; return opr; } PTXBuilder::Operand *PTXBuilder::newOperand(StringRef constraint) { // Constraint should be something like "=r" assert(!constraint.empty() && constraint[0] == '='); auto *opr = newOperand(); opr->idx = oprCounter++; opr->constraint = constraint; return opr; } PTXBuilder::Operand *PTXBuilder::newConstantOperand(const std::string &v) { argArchive.emplace_back(std::make_unique()); argArchive.back()->repr = [v](int idx) { return v; }; return argArchive.back().get(); } PTXBuilder::Operand *PTXBuilder::newConstantOperand(int64_t v) { std::stringstream ss; ss << "0x" << std::hex << v; return newConstantOperand(ss.str()); } std::string PTXBuilder::getConstraints() const { auto args = getAllArgs(); llvm::SmallVector argReprs; for (auto arg : args) argReprs.push_back(arg->constraint); return strJoin(argReprs, ","); } llvm::SmallVector PTXBuilder::getAllMLIRArgs() const { llvm::SmallVector res; for (auto &arg : argArchive) { if (!arg->isList() && arg->value) res.push_back(arg->value); } return res; } SmallVector PTXBuilder::getAllArgs() const { llvm::SmallVector res; for (auto &x : argArchive) if (!x->isList()) res.push_back(x.get()); return res; } mlir::Value PTXBuilder::launch(OpBuilder &rewriter, Location loc, Type resTy, bool hasSideEffect, bool isAlignStack, ArrayRef attrs) const { auto *ctx = rewriter.getContext(); auto inlineAsm = rewriter.create( loc, resTy, getAllMLIRArgs(), // operands dump(), // asm_string getConstraints(), // constraints hasSideEffect, // has_side_effects isAlignStack, // is_align_stack LLVM::AsmDialectAttr::get(ctx, LLVM::AsmDialect::AD_ATT), // asm_dialect ArrayAttr::get(ctx, attrs) // operand_attrs ); return inlineAsm.getRes(); } std::string PTXInstr::Operand::dump() const { if (repr) return repr(idx); if (!isList()) return "$" + std::to_string(idx); llvm::SmallVector oprs; for (auto *opr : list) oprs.push_back(opr->dump()); return "{ " + strJoin(oprs, ", ") + " }"; } PTXInstr::Operand *PTXBuilder::newAddrOperand(mlir::Value addr, StringRef constraint, int off) { auto *opr = newOperand(addr, constraint); opr->repr = [off](int idx) -> std::string { std::stringstream ss; ss << "[ $" << idx << " + " << off << " ]"; return ss.str(); }; return opr; } std::string PTXBuilder::dump() const { llvm::SmallVector lines; for (auto &exec : executions) { lines.push_back(exec->dump()); } return strJoin(lines, "\n\t"); } PTXInstrExecution &PTXInstrCommon::call(ArrayRef oprs, bool onlyAttachMLIRArgs) { if (onlyAttachMLIRArgs) { // Nearly impossible to make the $0,$1 in two PTX code snippets to point to // the same MLIR values in onlyAttachMLIRArgs mode. 
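// In onlyAttachMLIRArgs mode the caller supplies a complete asm string whose
// $N placeholders are bound positionally, so the builder must not already
// hold another execution whose operands would shift that numbering.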
assert(builder->executions.empty() && "builder can only hold a single execution when onlyAttachMIIRArgs " "is true."); builder->reorderArgArchive(oprs); } builder->executions.emplace_back( std::make_unique(this, oprs, onlyAttachMLIRArgs)); return *builder->executions.back(); } PTXInstrExecution &PTXInstrCommon::operator()(ArrayRef oprs, bool onlyAttachMLIRArgs) { return call(oprs, onlyAttachMLIRArgs); } std::string PTXInstrExecution::dump() const { std::string osStr; llvm::raw_string_ostream os(osStr); std::string instrRepr = strJoin(instr->instrParts, "."); if (onlyAttachMLIRArgs) return instrRepr; if (pred) { if (!pred->repr) os << "@" << pred->dump() << " "; else os << pred->repr(pred->idx) << " "; } llvm::SmallVector argReprs; for (auto *arg : argsInOrder) { argReprs.push_back(arg->dump()); } std::string argsRepr = strJoin(argReprs, ", "); os << instrRepr << " " << argsRepr << ";"; os.flush(); return osStr; } SmallVector PTXInstrExecution::getArgList() const { SmallVector args; for (auto *arg : argsInOrder) { if (arg->isList()) args.insert(args.end(), arg->list.begin(), arg->list.end()); else args.push_back(arg); } return args; } PTXInstr &PTXInstr::global() { o("global"); return *this; } PTXInstr &PTXInstr::shared() { o("shared"); return *this; } PTXInstr &PTXInstr::v(int vecWidth, bool predicate) { if (vecWidth > 1) { o("v" + std::to_string(vecWidth), predicate); } return *this; } PTXInstr &PTXInstr::b(int width) { o("b" + std::to_string(width)); return *this; } } // namespace triton } // namespace mlir triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp000066400000000000000000000434661440023377100240020ustar00rootroot00000000000000#include "ReduceOpToLLVM.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::getElementsFromStruct; using ::mlir::LLVM::getStructFromElements; using ::mlir::LLVM::shflSync; using ::mlir::LLVM::storeShared; using ::mlir::triton::gpu::getElemsPerThread; using ::mlir::triton::gpu::getOrder; struct ReduceOpConversion : public ConvertTritonGPUOpToLLVMPattern { public: using ConvertTritonGPUOpToLLVMPattern< triton::ReduceOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::ReduceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { if (ReduceOpHelper(op).isFastReduction()) return matchAndRewriteFast(op, adaptor, rewriter); return matchAndRewriteBasic(op, adaptor, rewriter); } private: void accumulate(ConversionPatternRewriter &rewriter, Location loc, RedOp redOp, Value &acc, Value cur, bool isFirst) const { if (isFirst) { acc = cur; return; } switch (redOp) { case RedOp::ADD: acc = add(acc, cur); break; case RedOp::FADD: acc = fadd(acc.getType(), acc, cur); break; case RedOp::MIN: acc = smin(acc, cur); break; case RedOp::MAX: acc = smax(acc, cur); break; case RedOp::UMIN: acc = umin(acc, cur); break; case RedOp::UMAX: acc = umax(acc, cur); break; case RedOp::FMIN: acc = fmin(acc, cur); break; case RedOp::FMAX: acc = fmax(acc, cur); break; case RedOp::XOR: acc = xor_(acc, cur); break; case RedOp::ARGMIN: case RedOp::ARGMAX: case RedOp::ARGUMIN: case RedOp::ARGUMAX: case RedOp::ARGFMIN: case RedOp::ARGFMAX: llvm::report_fatal_error( "This accumulate implementation is not for argmin / argmax"); default: llvm::report_fatal_error("Unsupported reduce op"); } } void accumulateWithIndex(ConversionPatternRewriter &rewriter, Location loc, RedOp redOp, Value &acc, Value &accIndex, Value cur, Value curIndex, bool isFirst) const { if (isFirst) { acc = cur; accIndex = curIndex; return; } switch 
(redOp) { case RedOp::ARGMIN: accIndex = select( icmp_slt(acc, cur), accIndex, select(icmp_sgt(acc, cur), curIndex, smin(accIndex, curIndex))); acc = smin(acc, cur); break; case RedOp::ARGMAX: accIndex = select( icmp_sgt(acc, cur), accIndex, select(icmp_slt(acc, cur), curIndex, smin(accIndex, curIndex))); acc = smax(acc, cur); break; case RedOp::ARGUMIN: accIndex = select( icmp_ult(acc, cur), accIndex, select(icmp_ugt(acc, cur), curIndex, smin(accIndex, curIndex))); acc = umin(acc, cur); break; case RedOp::ARGUMAX: accIndex = select( icmp_ugt(acc, cur), accIndex, select(icmp_ult(acc, cur), curIndex, smin(accIndex, curIndex))); acc = umax(acc, cur); break; case RedOp::ARGFMIN: accIndex = select( fcmp_olt(acc, cur), accIndex, select(fcmp_ogt(acc, cur), curIndex, smin(accIndex, curIndex))); acc = fmin(acc, cur); break; case RedOp::ARGFMAX: accIndex = select( fcmp_ogt(acc, cur), accIndex, select(fcmp_olt(acc, cur), curIndex, smin(accIndex, curIndex))); acc = fmax(acc, cur); break; case RedOp::ADD: case RedOp::FADD: case RedOp::MIN: case RedOp::MAX: case RedOp::UMIN: case RedOp::UMAX: case RedOp::FMIN: case RedOp::FMAX: case RedOp::XOR: llvm::report_fatal_error( "This accumulate implementation is only for argmin / argmax"); default: llvm::report_fatal_error("Unsupported reduce op"); } } // Use shared memory for reduction within warps and across warps LogicalResult matchAndRewriteBasic(triton::ReduceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op->getLoc(); unsigned axis = op.axis(); bool withIndex = triton::ReduceOp::withIndex(op.redOp()); auto srcTy = op.operand().getType().cast(); auto srcLayout = srcTy.getEncoding().cast(); auto srcOrd = srcLayout.getOrder(); auto srcShape = srcTy.getShape(); auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType()); auto llvmIndexTy = getTypeConverter()->getIndexType(); auto elemPtrTy = LLVM::LLVMPointerType::get(llvmElemTy, 3); auto indexPtrTy = LLVM::LLVMPointerType::get(llvmIndexTy, 3); Value smemBase = getSharedMemoryBase(loc, rewriter, op.getOperation()); smemBase = bitcast(smemBase, elemPtrTy); ReduceOpHelper helper(op); auto smemShape = helper.getScratchConfigBasic(); unsigned elems = product(smemShape); Value indexSmemBase = gep(elemPtrTy, smemBase, i32_val(elems)); indexSmemBase = bitcast(indexSmemBase, indexPtrTy); unsigned srcElems = getElemsPerThread(srcTy); auto srcIndices = emitIndices(loc, rewriter, srcLayout, srcShape); auto srcValues = getElementsFromStruct(loc, adaptor.operand(), rewriter); SmallVector> offset = emitOffsetForLayout(srcLayout, srcShape); std::map, Value> accs; std::map, Value> accIndices; std::map, SmallVector> indices; // reduce within threads for (unsigned i = 0; i < srcElems; ++i) { SmallVector key = offset[i]; key[axis] = 0; bool isFirst = accs.find(key) == accs.end(); if (!withIndex) { accumulate(rewriter, loc, op.redOp(), accs[key], srcValues[i], isFirst); } else { Value curIndex = srcIndices[i][axis]; accumulateWithIndex(rewriter, loc, op.redOp(), accs[key], accIndices[key], srcValues[i], curIndex, isFirst); } if (isFirst) indices[key] = srcIndices[i]; } // cached int32 constants std::map ints; ints[0] = i32_val(0); for (int N = smemShape[axis] / 2; N > 0; N >>= 1) ints[N] = i32_val(N); Value sizePerThread = i32_val(srcLayout.getSizePerThread()[axis]); // reduce across threads for (auto it : accs) { const SmallVector &key = it.first; Value acc = it.second; Value accIndex; if (withIndex) accIndex = accIndices[key]; SmallVector writeIdx = indices[key]; 
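// Cross-thread reduction: each partial accumulator is stored to its slot in
// shared memory, then combined through a log2(smemShape[axis]) tree of
// load / accumulate / store steps, with barriers between the rounds.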
writeIdx[axis] = udiv(writeIdx[axis], sizePerThread); Value writeOffset = linearize(rewriter, loc, writeIdx, smemShape, srcOrd); Value writePtr = gep(elemPtrTy, smemBase, writeOffset); Value indexWritePtr = gep(indexPtrTy, indexSmemBase, writeOffset); store(acc, writePtr); if (withIndex) store(accIndex, indexWritePtr); SmallVector readIdx(writeIdx.size(), ints[0]); for (int N = smemShape[axis] / 2; N > 0; N >>= 1) { readIdx[axis] = ints[N]; Value readMask = icmp_slt(writeIdx[axis], ints[N]); Value readOffset = select( readMask, linearize(rewriter, loc, readIdx, smemShape, srcOrd), ints[0]); Value readPtr = gep(elemPtrTy, writePtr, readOffset); barrier(); if (!withIndex) { Value cur = load(readPtr); accumulate(rewriter, loc, op.redOp(), acc, cur, false); barrier(); store(acc, writePtr); } else { Value cur = load(readPtr); Value indexReadPtr = gep(indexPtrTy, indexWritePtr, readOffset); Value curIndex = load(indexReadPtr); accumulateWithIndex(rewriter, loc, op.redOp(), acc, accIndex, cur, curIndex, false); barrier(); store(acc, writePtr); store(accIndex, indexWritePtr); } } } barrier(); // set output values if (auto resultTy = op.getType().dyn_cast()) { // nd-tensor where n >= 1 auto resultLayout = resultTy.getEncoding(); auto resultShape = resultTy.getShape(); unsigned resultElems = getElemsPerThread(resultTy); auto resultIndices = emitIndices(loc, rewriter, resultLayout, resultShape); assert(resultIndices.size() == resultElems); SmallVector resultVals(resultElems); for (unsigned i = 0; i < resultElems; ++i) { SmallVector readIdx = resultIndices[i]; readIdx.insert(readIdx.begin() + axis, ints[0]); Value readOffset = linearize(rewriter, loc, readIdx, smemShape, srcOrd); Value readPtr = gep(elemPtrTy, smemBase, readOffset); Value indexReadPtr = gep(indexPtrTy, indexSmemBase, readOffset); resultVals[i] = withIndex ? load(indexReadPtr) : load(readPtr); } SmallVector resultTypes(resultElems, withIndex ? llvmIndexTy : llvmElemTy); Type structTy = LLVM::LLVMStructType::getLiteral(this->getContext(), resultTypes); Value ret = getStructFromElements(loc, resultVals, rewriter, structTy); rewriter.replaceOp(op, ret); } else { // 0d-tensor -> scalar Value resultVal = withIndex ? 
load(indexSmemBase) : load(smemBase); rewriter.replaceOp(op, resultVal); } return success(); } // Use warp shuffle for reduction within warps and shared memory for data // exchange across warps LogicalResult matchAndRewriteFast(triton::ReduceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op->getLoc(); unsigned axis = adaptor.axis(); bool withIndex = triton::ReduceOp::withIndex(op.redOp()); auto srcTy = op.operand().getType().cast(); auto srcLayout = srcTy.getEncoding(); auto srcShape = srcTy.getShape(); auto srcRank = srcTy.getRank(); auto order = getOrder(srcLayout); auto threadsPerWarp = triton::gpu::getThreadsPerWarp(srcLayout); auto warpsPerCTA = triton::gpu::getWarpsPerCTA(srcLayout); auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType()); auto llvmIndexTy = getTypeConverter()->getIndexType(); auto elemPtrTy = LLVM::LLVMPointerType::get(llvmElemTy, 3); auto indexPtrTy = LLVM::LLVMPointerType::get(llvmIndexTy, 3); Value smemBase = getSharedMemoryBase(loc, rewriter, op.getOperation()); smemBase = bitcast(smemBase, elemPtrTy); ReduceOpHelper helper(op); auto smemShapes = helper.getScratchConfigsFast(); unsigned elems = product(smemShapes[0]); unsigned maxElems = std::max(elems, product(smemShapes[1])); Value indexSmemBase = gep(elemPtrTy, smemBase, i32_val(maxElems)); indexSmemBase = bitcast(indexSmemBase, indexPtrTy); unsigned sizeIntraWarps = helper.getIntraWarpSize(); unsigned sizeInterWarps = helper.getInterWarpSize(); unsigned srcElems = getElemsPerThread(srcTy); auto srcIndices = emitIndices(loc, rewriter, srcLayout, srcShape); auto srcValues = getElementsFromStruct(loc, adaptor.operand(), rewriter); SmallVector> offset = emitOffsetForLayout(srcLayout, srcShape); std::map, Value> accs; std::map, Value> accIndices; std::map, SmallVector> indices; // reduce within threads for (unsigned i = 0; i < srcElems; ++i) { SmallVector key = offset[i]; key[axis] = 0; bool isFirst = accs.find(key) == accs.end(); if (!withIndex) { accumulate(rewriter, loc, op.redOp(), accs[key], srcValues[i], isFirst); } else { Value curIndex = srcIndices[i][axis]; accumulateWithIndex(rewriter, loc, op.redOp(), accs[key], accIndices[key], srcValues[i], curIndex, isFirst); } if (isFirst) indices[key] = srcIndices[i]; } Value threadId = getThreadId(rewriter, loc); Value warpSize = i32_val(32); Value warpId = udiv(threadId, warpSize); Value laneId = urem(threadId, warpSize); SmallVector multiDimLaneId = delinearize(rewriter, loc, laneId, threadsPerWarp, order); SmallVector multiDimWarpId = delinearize(rewriter, loc, warpId, warpsPerCTA, order); Value laneIdAxis = multiDimLaneId[axis]; Value warpIdAxis = multiDimWarpId[axis]; Value zero = i32_val(0); Value laneZero = icmp_eq(laneIdAxis, zero); Value warpZero = icmp_eq(warpIdAxis, zero); for (auto it : accs) { const SmallVector &key = it.first; Value acc = it.second; Value accIndex; if (withIndex) accIndex = accIndices[key]; // Reduce within warps for (unsigned N = sizeIntraWarps / 2; N > 0; N >>= 1) { Value shfl = shflSync(loc, rewriter, acc, N); if (!withIndex) { accumulate(rewriter, loc, op.redOp(), acc, shfl, false); } else { Value shflIndex = shflSync(loc, rewriter, accIndex, N); accumulateWithIndex(rewriter, loc, op.redOp(), acc, accIndex, shfl, shflIndex, false); } } SmallVector writeIdx = indices[key]; writeIdx[axis] = (sizeInterWarps == 1) ? 
zero : warpIdAxis; Value writeOffset = linearize(rewriter, loc, writeIdx, smemShapes[0], order); Value writePtr = gep(elemPtrTy, smemBase, writeOffset); storeShared(rewriter, loc, writePtr, acc, laneZero); if (withIndex) { Value indexWritePtr = gep(indexPtrTy, indexSmemBase, writeOffset); storeShared(rewriter, loc, indexWritePtr, accIndex, laneZero); } } barrier(); // The second round of shuffle reduction // now the problem size: sizeInterWarps, s1, s2, .. , sn // where sizeInterWarps is 2^m // // Each thread needs to process: // elemsPerThread = sizeInterWarps * s1 * s2 .. Sn / numThreads unsigned numThreads = product(triton::gpu::getWarpsPerCTA(srcLayout)) * 32; unsigned elemsPerThread = std::max(elems / numThreads, 1); Value readOffset = threadId; for (unsigned round = 0; round < elemsPerThread; ++round) { Value readPtr = gep(elemPtrTy, smemBase, readOffset); // FIXME(Qingyi): need predicate icmp_slt(threadId, // i32_val(sizeInerWarps)) Value acc = load(readPtr); Value accIndex; if (withIndex) { Value readIndexPtr = gep(indexPtrTy, indexSmemBase, readOffset); accIndex = load(readIndexPtr); } for (unsigned N = sizeInterWarps / 2; N > 0; N >>= 1) { Value shfl = shflSync(loc, rewriter, acc, N); if (!withIndex) { accumulate(rewriter, loc, op.redOp(), acc, shfl, false); } else { Value shflIndex = shflSync(loc, rewriter, accIndex, N); accumulateWithIndex(rewriter, loc, op.redOp(), acc, accIndex, shfl, shflIndex, false); } } // only the first thread in each sizeInterWarps is writing Value writeOffset = readOffset; Value writePtr = gep(elemPtrTy, smemBase, writeOffset); Value threadIsNeeded = icmp_slt(threadId, i32_val(elems)); Value laneIdModSizeInterWarps = urem(laneId, i32_val(sizeInterWarps)); Value laneIdModSizeInterWarpsIsZero = icmp_eq(laneIdModSizeInterWarps, zero); Value pred = and_(threadIsNeeded, laneIdModSizeInterWarpsIsZero); storeShared(rewriter, loc, writePtr, acc, pred); if (withIndex) { Value writeIndexPtr = gep(indexPtrTy, indexSmemBase, writeOffset); storeShared(rewriter, loc, writeIndexPtr, accIndex, pred); } if (round != elemsPerThread - 1) { readOffset = add(readOffset, i32_val(numThreads)); } } // We could avoid this barrier in some of the layouts, however this is not // the general case. // TODO: optimize the barrier incase the layouts are accepted. barrier(); // set output values if (auto resultTy = op.getType().dyn_cast()) { // nd-tensor where n >= 1 auto resultLayout = resultTy.getEncoding().cast(); auto resultShape = resultTy.getShape(); unsigned resultElems = getElemsPerThread(resultTy); auto resultIndices = emitIndices(loc, rewriter, resultLayout, resultShape); assert(resultIndices.size() == resultElems); SmallVector resultVals(resultElems); for (size_t i = 0; i < resultElems; ++i) { SmallVector readIdx = resultIndices[i]; readIdx.insert(readIdx.begin() + axis, i32_val(0)); Value readOffset = linearize(rewriter, loc, readIdx, smemShapes[0], order); Value readPtr = gep(elemPtrTy, smemBase, readOffset); Value indexReadPtr = gep(indexPtrTy, indexSmemBase, readOffset); resultVals[i] = withIndex ? load(indexReadPtr) : load(readPtr); } SmallVector resultTypes(resultElems, withIndex ? llvmIndexTy : llvmElemTy); Type structTy = LLVM::LLVMStructType::getLiteral(this->getContext(), resultTypes); Value ret = getStructFromElements(loc, resultVals, rewriter, structTy); rewriter.replaceOp(op, ret); } else { // 0d-tensor -> scalar Value resultVal = withIndex ? 
load(indexSmemBase) : load(smemBase); rewriter.replaceOp(op, resultVal); } return success(); } }; void populateReduceOpToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit) { patterns.add(typeConverter, allocation, smem, indexCacheInfo, benefit); } triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.h000066400000000000000000000010071440023377100234300ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_REDUCE_OP_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_REDUCE_OP_H #include "TritonGPUToLLVMBase.h" using namespace mlir; using namespace mlir::triton; void populateReduceOpToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit); #endiftriton-2.0.0/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp000066400000000000000000000562011440023377100241160ustar00rootroot00000000000000#include "TritonGPUToLLVM.h" #include "DotOpHelpers.h" #include "Utility.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::getElementsFromStruct; using ::mlir::LLVM::getSharedMemoryObjectFromStruct; using ::mlir::LLVM::getStructFromElements; using ::mlir::triton::gpu::getElemsPerThread; using ::mlir::triton::gpu::SharedEncodingAttr; struct ReturnOpConversion : public ConvertOpToLLVMPattern<::mlir::ReturnOp> { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; LogicalResult matchAndRewrite(ReturnOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { unsigned numArguments = op.getNumOperands(); // Currently, Triton kernel function always return nothing. // TODO(Superjomn) add support for non-inline device function if (numArguments > 0) { return rewriter.notifyMatchFailure( op, "Only kernel function with nothing returned is supported."); } rewriter.replaceOpWithNewOp(op, TypeRange(), ValueRange(), op->getAttrs()); return success(); } }; struct BroadcastOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::BroadcastOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::BroadcastOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // Following the order of indices in the legacy code, a broadcast of: // [s(0), s(1) ... s(k-1), 1, s(k+1), s(k+2) ... s(n-1)] // => // [s(0), s(1) ... s(k-1), s(k), s(k+1), s(k+2) ... 
s(n-1)] // // logically maps to a broadcast within a thread's scope: // [cta(0)..cta(k-1), 1,cta(k+1)..cta(n-1),spt(0)..spt(k-1), // 1,spt(k+1)..spt(n-1)] // => // [cta(0)..cta(k-1),cta(k),cta(k+1)..cta(n-1),spt(0)..spt(k-1),spt(k),spt(k+1)..spt(n-1)] // // regardless of the order of the layout // Location loc = op->getLoc(); Value src = adaptor.src(); Value result = op.result(); auto srcTy = op.src().getType().cast(); auto resultTy = result.getType().cast(); auto srcLayout = srcTy.getEncoding(); auto resultLayout = resultTy.getEncoding(); auto srcShape = srcTy.getShape(); auto resultShape = resultTy.getShape(); unsigned rank = srcTy.getRank(); assert(rank == resultTy.getRank()); auto order = triton::gpu::getOrder(srcLayout); auto srcOffsets = emitOffsetForLayout(srcLayout, srcShape); auto resultOffsets = emitOffsetForLayout(resultLayout, resultShape); SmallVector srcVals = getElementsFromStruct(loc, src, rewriter); if (auto srcMma = srcLayout.dyn_cast()) { // NOTE: This is just an naive fix, but for MMA layout, and 2-d fix should // be all right. // TODO[Superjomn]: Replace this with a generic implementation. if (srcMma.isVolta()) { assert(srcTy.getElementType().isF16() && "Unexpected data type on Volta"); int numElemsPerThread = srcMma.getElemsPerThread(resultTy.getShape()); int srcUniqElems = srcVals.size() / 2; int dup = numElemsPerThread / srcUniqElems; SmallVector retVals; if (srcShape[0] == 1) { // add-cols for (int i = 0; i < srcUniqElems; ++i) for (int k = 0; k < dup; ++k) retVals.push_back(srcVals[i * 2]); } else { // add-rows for (int k = 0; k < dup; ++k) for (int i = 0; i < srcUniqElems; ++i) retVals.push_back(srcVals[i]); } auto llvmStructTy = getTypeConverter()->convertType(resultTy); Value ret = getStructFromElements(loc, retVals, rewriter, llvmStructTy); rewriter.replaceOp(op, {ret}); return success(); } } DenseMap, Value, SmallVectorKeyInfo> srcValues; for (size_t i = 0; i < srcOffsets.size(); i++) { srcValues[srcOffsets[i]] = srcVals[i]; } SmallVector resultVals; for (size_t i = 0; i < resultOffsets.size(); i++) { auto offset = resultOffsets[i]; for (size_t j = 0; j < srcShape.size(); j++) if (srcShape[j] == 1) offset[j] = 0; resultVals.push_back(srcValues.lookup(offset)); } auto llvmStructTy = getTypeConverter()->convertType(resultTy); Value resultStruct = getStructFromElements(loc, resultVals, rewriter, llvmStructTy); rewriter.replaceOp(op, {resultStruct}); return success(); } }; struct PrintfOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::PrintfOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::PrintfOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); SmallVector operands; for (auto operand : adaptor.getOperands()) { auto sub_operands = getElementsFromStruct(loc, operand, rewriter); for (auto elem : sub_operands) { operands.push_back(elem); } } std::string formatStr; llvm::raw_string_ostream os(formatStr); os << op.prefix(); if (!operands.empty()) { os << getFormatSubstr(operands[0]); } for (size_t i = 1; i < operands.size(); ++i) { os << ", " << getFormatSubstr(operands[i]); } llPrintf(formatStr, operands, rewriter); rewriter.eraseOp(op); return success(); } std::string getFormatSubstr(Value value) const { Type type = value.getType(); if (type.isa()) { return "%p"; } else if (type.isBF16() || type.isF16() || type.isF32() || type.isF64()) { return "%f"; } else if (type.isSignedInteger()) { return "%i"; } else if (type.isUnsignedInteger() 
|| type.isSignlessInteger()) { return "%u"; } assert(false && "not supported type"); return ""; } // declare vprintf(i8*, i8*) as external function static LLVM::LLVMFuncOp getVprintfDeclaration(ConversionPatternRewriter &rewriter) { auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); StringRef funcName("vprintf"); Operation *funcOp = moduleOp.lookupSymbol(funcName); if (funcOp) return cast(*funcOp); auto *context = rewriter.getContext(); SmallVector argsType{ptr_ty(IntegerType::get(context, 8)), ptr_ty(IntegerType::get(context, 8))}; auto funcType = LLVM::LLVMFunctionType::get(i32_ty, argsType); ConversionPatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(moduleOp.getBody()); return rewriter.create(UnknownLoc::get(context), funcName, funcType); } // extend integer to int32, extend float to float64 // this comes from vprintf alignment requirements. static std::pair promoteValue(ConversionPatternRewriter &rewriter, Value value) { auto *context = rewriter.getContext(); auto type = value.getType(); Value newOp = value; Type newType = type; bool bUnsigned = type.isUnsignedInteger(); if (type.isIntOrIndex() && type.getIntOrFloatBitWidth() < 32) { if (bUnsigned) { newType = ui32_ty; newOp = rewriter.create(UnknownLoc::get(context), newType, value); } else { newType = i32_ty; newOp = rewriter.create(UnknownLoc::get(context), newType, value); } } else if (type.isBF16() || type.isF16() || type.isF32()) { newType = f64_ty; newOp = rewriter.create(UnknownLoc::get(context), newType, value); } return {newType, newOp}; } static void llPrintf(StringRef msg, ValueRange args, ConversionPatternRewriter &rewriter) { static const char formatStringPrefix[] = "printfFormat_"; assert(!msg.empty() && "printf with empty string not support"); Type int8Ptr = ptr_ty(i8_ty); auto *context = rewriter.getContext(); auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); auto funcOp = getVprintfDeclaration(rewriter); Value one = rewriter.create( UnknownLoc::get(context), i32_ty, rewriter.getI32IntegerAttr(1)); Value zero = rewriter.create( UnknownLoc::get(context), i32_ty, rewriter.getI32IntegerAttr(0)); unsigned stringNumber = 0; SmallString<16> stringConstName; do { stringConstName.clear(); (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName); } while (moduleOp.lookupSymbol(stringConstName)); llvm::SmallString<64> formatString(msg); formatString.push_back('\n'); formatString.push_back('\0'); size_t formatStringSize = formatString.size_in_bytes(); auto globalType = LLVM::LLVMArrayType::get(i8_ty, formatStringSize); LLVM::GlobalOp global; { ConversionPatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(moduleOp.getBody()); global = rewriter.create( UnknownLoc::get(context), globalType, /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName, rewriter.getStringAttr(formatString)); } Value globalPtr = rewriter.create(UnknownLoc::get(context), global); Value stringStart = rewriter.create( UnknownLoc::get(context), int8Ptr, globalPtr, SmallVector({zero, zero})); Value bufferPtr = rewriter.create(UnknownLoc::get(context), int8Ptr); SmallVector newArgs; if (args.size() >= 1) { SmallVector argTypes; for (auto arg : args) { Type newType; Value newArg; std::tie(newType, newArg) = promoteValue(rewriter, arg); argTypes.push_back(newType); newArgs.push_back(newArg); } Type structTy = LLVM::LLVMStructType::getLiteral(context, argTypes); auto allocated = rewriter.create(UnknownLoc::get(context), ptr_ty(structTy), one, 
/*alignment=*/0); for (const auto &entry : llvm::enumerate(newArgs)) { auto index = rewriter.create( UnknownLoc::get(context), i32_ty, rewriter.getI32IntegerAttr(entry.index())); auto fieldPtr = rewriter.create( UnknownLoc::get(context), ptr_ty(argTypes[entry.index()]), allocated, ArrayRef{zero, index}); rewriter.create(UnknownLoc::get(context), entry.value(), fieldPtr); } bufferPtr = rewriter.create(UnknownLoc::get(context), int8Ptr, allocated); } SmallVector operands{stringStart, bufferPtr}; rewriter.create(UnknownLoc::get(context), funcOp, operands); } }; struct MakeRangeOpConversion : public ConvertTritonGPUOpToLLVMPattern { MakeRangeOpConversion( LLVMTypeConverter &converter, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit) : ConvertTritonGPUOpToLLVMPattern( converter, /*Allocation*/ nullptr, Value{}, indexCacheInfo, benefit) {} LogicalResult matchAndRewrite(triton::MakeRangeOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); auto rankedTy = op.result().getType().dyn_cast(); auto shape = rankedTy.getShape(); auto layout = rankedTy.getEncoding(); auto elemTy = rankedTy.getElementType(); assert(elemTy.isInteger(32)); Value start = createIndexAttrConstant(rewriter, loc, elemTy, op.start()); auto idxs = emitIndices(loc, rewriter, layout, shape); unsigned elems = idxs.size(); SmallVector retVals(elems); // TODO: slice layout has more elements than expected. // Unexpected behavior for make range, but generally OK when followed by // expand dims + broadcast. very weird behavior otherwise potentially. for (const auto &multiDim : llvm::enumerate(idxs)) { assert(multiDim.value().size() == 1); retVals[multiDim.index()] = add(multiDim.value()[0], start); } SmallVector types(elems, elemTy); Type structTy = LLVM::LLVMStructType::getLiteral(getContext(), types); Value result = getStructFromElements(loc, retVals, rewriter, structTy); rewriter.replaceOp(op, result); return success(); } }; struct GetProgramIdOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::GetProgramIdOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::GetProgramIdOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); assert(op.axis() < 3); Value blockId = rewriter.create<::mlir::gpu::BlockIdOp>( loc, rewriter.getIndexType(), dims[op.axis()]); auto llvmIndexTy = getTypeConverter()->getIndexType(); rewriter.replaceOpWithNewOp( op, TypeRange{llvmIndexTy}, ValueRange{blockId}); return success(); } static constexpr mlir::gpu::Dimension dims[] = {mlir::gpu::Dimension::x, mlir::gpu::Dimension::y, mlir::gpu::Dimension::z}; }; struct GetNumProgramsOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::GetNumProgramsOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::GetNumProgramsOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); assert(op.axis() < 3); Value blockId = rewriter.create<::mlir::gpu::GridDimOp>( loc, rewriter.getIndexType(), dims[op.axis()]); auto llvmIndexTy = getTypeConverter()->getIndexType(); rewriter.replaceOpWithNewOp( op, TypeRange{llvmIndexTy}, ValueRange{blockId}); return success(); } static constexpr mlir::gpu::Dimension dims[] = {mlir::gpu::Dimension::x, mlir::gpu::Dimension::y, mlir::gpu::Dimension::z}; }; struct AddPtrOpConversion : public 
ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::AddPtrOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::AddPtrOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); auto resultTy = op.getType(); auto resultTensorTy = resultTy.dyn_cast(); if (resultTensorTy) { unsigned elems = getElemsPerThread(resultTy); Type elemTy = getTypeConverter()->convertType(resultTensorTy.getElementType()); SmallVector types(elems, elemTy); Type structTy = LLVM::LLVMStructType::getLiteral(getContext(), types); auto ptrs = getElementsFromStruct(loc, adaptor.ptr(), rewriter); auto offsets = getElementsFromStruct(loc, adaptor.offset(), rewriter); SmallVector resultVals(elems); for (unsigned i = 0; i < elems; ++i) { resultVals[i] = gep(elemTy, ptrs[i], offsets[i]); } Value view = getStructFromElements(loc, resultVals, rewriter, structTy); rewriter.replaceOp(op, view); } else { assert(resultTy.isa()); Type llResultTy = getTypeConverter()->convertType(resultTy); Value result = gep(llResultTy, adaptor.ptr(), adaptor.offset()); rewriter.replaceOp(op, result); } return success(); } }; struct AllocTensorOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::gpu::AllocTensorOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::gpu::AllocTensorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); Value smemBase = getSharedMemoryBase(loc, rewriter, op.getResult()); auto resultTy = op.getType().dyn_cast(); auto llvmElemTy = getTypeConverter()->convertType(resultTy.getElementType()); auto elemPtrTy = ptr_ty(llvmElemTy, 3); smemBase = bitcast(smemBase, elemPtrTy); auto order = resultTy.getEncoding().cast().getOrder(); // Workaround for 3D tensors // TODO: we need to modify the pipeline pass to give a proper shared // encoding to 3D tensors SmallVector newOrder; if (resultTy.getShape().size() == 3) newOrder = {1 + order[0], 1 + order[1], 0}; else newOrder = SmallVector(order.begin(), order.end()); auto smemObj = SharedMemoryObject(smemBase, resultTy.getShape(), newOrder, loc, rewriter); auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter); rewriter.replaceOp(op, retVal); return success(); } }; struct ExtractSliceOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< tensor::ExtractSliceOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(tensor::ExtractSliceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // %dst = extract_slice %src[%offsets] Location loc = op->getLoc(); auto srcTy = op.source().getType().dyn_cast(); auto srcLayout = srcTy.getEncoding().dyn_cast(); assert(srcLayout && "Unexpected resultLayout in ExtractSliceOpConversion"); assert(op.hasUnitStride() && "Only unit stride supported by ExtractSliceOpConversion"); // newBase = base + offset // Triton supports either static and dynamic offsets auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.source(), rewriter); SmallVector opOffsetVals; SmallVector offsetVals; auto mixedOffsets = op.getMixedOffsets(); for (auto i = 0; i < mixedOffsets.size(); ++i) { if (op.isDynamicOffset(i)) opOffsetVals.emplace_back(adaptor.offsets()[i]); else opOffsetVals.emplace_back(i32_val(op.getStaticOffset(i))); offsetVals.emplace_back(add(smemObj.offsets[i], opOffsetVals[i])); } // Compute the offset based on the original strides of the shared 
memory // object auto offset = dot(rewriter, loc, opOffsetVals, smemObj.strides); // newShape = rank_reduce(shape) // Triton only supports static tensor sizes SmallVector strideVals; for (auto i = 0; i < op.static_sizes().size(); ++i) { if (op.getStaticSize(i) == 1) { offsetVals.erase(offsetVals.begin() + i); } else { strideVals.emplace_back(smemObj.strides[i]); } } auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType()); auto elemPtrTy = ptr_ty(llvmElemTy, 3); auto resTy = op.getType().dyn_cast(); smemObj = SharedMemoryObject(gep(elemPtrTy, smemObj.base, offset), strideVals, offsetVals); auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter); rewriter.replaceOp(op, retVal); return success(); } }; struct AsyncWaitOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::gpu::AsyncWaitOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::gpu::AsyncWaitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { PTXBuilder ptxBuilder; auto &asyncWaitOp = *ptxBuilder.create<>("cp.async.wait_group"); auto num = op->getAttrOfType("num").getInt(); asyncWaitOp(ptxBuilder.newConstantOperand(num)); auto ctx = op.getContext(); auto loc = op.getLoc(); auto voidTy = void_ty(ctx); ptxBuilder.launch(rewriter, loc, voidTy); // Safe to remove the op since it doesn't have any return value. rewriter.eraseOp(op); return success(); } }; struct AsyncCommitGroupOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::gpu::AsyncCommitGroupOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::gpu::AsyncCommitGroupOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { PTXBuilder ptxBuilder; ptxBuilder.create<>("cp.async.commit_group")->operator()(); ptxBuilder.launch(rewriter, op.getLoc(), void_ty(op.getContext())); // Safe to remove the op since it doesn't have any return value. rewriter.eraseOp(op); return success(); } }; namespace mlir { namespace LLVM { void vprintf(StringRef msg, ValueRange args, ConversionPatternRewriter &rewriter) { PrintfOpConversion::llPrintf(msg, args, rewriter); } void vprintf_array(Value thread, ArrayRef arr, std::string info, std::string elem_repr, ConversionPatternRewriter &builder) { std::string fmt = info + " t-%d "; std::vector new_arr({thread}); for (int i = 0; i < arr.size(); ++i) { fmt += elem_repr + ((i == arr.size() - 1) ? 
"" : ", "); new_arr.push_back(arr[i]); } vprintf(fmt, new_arr, builder); } } // namespace LLVM } // namespace mlir void populateTritonGPUToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit) { patterns.add(typeConverter, benefit); patterns.add(typeConverter, allocation, smem, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, allocation, smem, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, indexCacheInfo, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); } triton-2.0.0/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h000066400000000000000000000007651440023377100235670ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_H #include "TritonGPUToLLVMBase.h" using namespace mlir; using namespace mlir::triton; void populateTritonGPUToLLVMPatterns( mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo &indexCacheInfo, PatternBenefit benefit); #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMBase.h000066400000000000000000001073761440023377100243700ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_BASE_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_BASE_H // TODO: refactor so that it doesn't fail if Allocation.h // is included after utility.h (due to conflict in `store` macro // and #include "triton/Analysis/Allocation.h" // #include "Utility.h" #include "mlir/IR/TypeUtilities.h" #include "triton/Analysis/AxisInfo.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::SharedMemoryObject; using ::mlir::triton::gpu::BlockedEncodingAttr; using ::mlir::triton::gpu::MmaEncodingAttr; using ::mlir::triton::gpu::SliceEncodingAttr; namespace mlir { namespace LLVM { // Helper function for using printf in LLVM conversion. void vprintf(StringRef msg, ValueRange args, ConversionPatternRewriter &rewriter); void vprintf_array(Value thread, ArrayRef arr, std::string info, std::string elem_repr, ConversionPatternRewriter &builder); } // namespace LLVM } // namespace mlir // FuncOpConversion/FuncOpConversionBase is borrowed from // https://github.com/llvm/llvm-project/blob/fae656b2dd80246c3c6f01e9c77c49560368752c/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp#L276 // since it is not exposed on header files in mlir v14 // TODO(Superjomn): remove the code when MLIR v15.0 is included. // All the rights are reserved by the LLVM community. struct FuncOpConversionBase : public ConvertOpToLLVMPattern { private: /// Only retain those attributes that are not constructed by /// `LLVMFuncOp::build`. If `filterArgAttrs` is set, also filter out argument /// attributes. 
static void filterFuncAttributes(ArrayRef attrs, bool filterArgAttrs, SmallVectorImpl &result) { for (const auto &attr : attrs) { if (attr.getName() == SymbolTable::getSymbolAttrName() || attr.getName() == FunctionOpInterface::getTypeAttrName() || attr.getName() == "std.varargs" || (filterArgAttrs && attr.getName() == FunctionOpInterface::getArgDictAttrName())) continue; result.push_back(attr); } } /// Helper function for wrapping all attributes into a single DictionaryAttr static auto wrapAsStructAttrs(OpBuilder &b, ArrayAttr attrs) { return DictionaryAttr::get(b.getContext(), b.getNamedAttr("llvm.struct_attrs", attrs)); } protected: using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; // Convert input FuncOp to LLVMFuncOp by using the LLVMTypeConverter provided // to this legalization pattern. LLVM::LLVMFuncOp convertFuncOpToLLVMFuncOp(FuncOp funcOp, ConversionPatternRewriter &rewriter) const { // Convert the original function arguments. They are converted using the // LLVMTypeConverter provided to this legalization pattern. auto varargsAttr = funcOp->getAttrOfType("func.varargs"); TypeConverter::SignatureConversion result(funcOp.getNumArguments()); auto llvmType = getTypeConverter()->convertFunctionSignature( funcOp.getType(), varargsAttr && varargsAttr.getValue(), result); if (!llvmType) return nullptr; // Propagate argument/result attributes to all converted arguments/result // obtained after converting a given original argument/result. SmallVector attributes; filterFuncAttributes(funcOp->getAttrs(), /*filterArgAttrs=*/true, attributes); if (ArrayAttr resAttrDicts = funcOp.getAllResultAttrs()) { assert(!resAttrDicts.empty() && "expected array to be non-empty"); auto newResAttrDicts = (funcOp.getNumResults() == 1) ? resAttrDicts : rewriter.getArrayAttr( {wrapAsStructAttrs(rewriter, resAttrDicts)}); attributes.push_back(rewriter.getNamedAttr( FunctionOpInterface::getResultDictAttrName(), newResAttrDicts)); } if (ArrayAttr argAttrDicts = funcOp.getAllArgAttrs()) { SmallVector newArgAttrs( llvmType.cast().getNumParams()); for (unsigned i = 0, e = funcOp.getNumArguments(); i < e; ++i) { auto mapping = result.getInputMapping(i); assert(mapping && "unexpected deletion of function argument"); for (size_t j = 0; j < mapping->size; ++j) newArgAttrs[mapping->inputNo + j] = argAttrDicts[i]; } attributes.push_back( rewriter.getNamedAttr(FunctionOpInterface::getArgDictAttrName(), rewriter.getArrayAttr(newArgAttrs))); } for (const auto &pair : llvm::enumerate(attributes)) { if (pair.value().getName() == "llvm.linkage") { attributes.erase(attributes.begin() + pair.index()); break; } } // Create an LLVM function, use external linkage by default until MLIR // functions have linkage. 
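// For illustration (hypothetical attribute spelling): a function carrying
//   llvm.linkage = #llvm.linkage<internal>
// would be emitted below as an `internal` llvm.func, while a function
// without the attribute keeps the `External` default chosen here.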
LLVM::Linkage linkage = LLVM::Linkage::External; if (funcOp->hasAttr("llvm.linkage")) { auto attr = funcOp->getAttr("llvm.linkage").dyn_cast(); if (!attr) { funcOp->emitError() << "Contains llvm.linkage attribute not of type LLVM::LinkageAttr"; return nullptr; } linkage = attr.getLinkage(); } auto newFuncOp = rewriter.create( funcOp.getLoc(), funcOp.getName(), llvmType, linkage, /*dsoLocal*/ false, attributes); rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(), newFuncOp.end()); if (failed(rewriter.convertRegionTypes(&newFuncOp.getBody(), *typeConverter, &result))) return nullptr; return newFuncOp; } }; using IndexCacheKeyT = std::pair>; struct CacheKeyDenseMapInfo { static IndexCacheKeyT getEmptyKey() { auto *pointer = llvm::DenseMapInfo::getEmptyKey(); return std::make_pair( mlir::Attribute(static_cast(pointer)), SmallVector{}); } static IndexCacheKeyT getTombstoneKey() { auto *pointer = llvm::DenseMapInfo::getTombstoneKey(); return std::make_pair( mlir::Attribute(static_cast(pointer)), SmallVector{std::numeric_limits::max()}); } static unsigned getHashValue(IndexCacheKeyT key) { return llvm::hash_combine( mlir::hash_value(key.first), llvm::hash_combine_range(key.second.begin(), key.second.end())); } static bool isEqual(IndexCacheKeyT LHS, IndexCacheKeyT RHS) { return LHS == RHS; } }; class ConvertTritonGPUOpToLLVMPatternBase { public: // Two levels of value cache in emitting indices calculation: // Key: pair struct IndexCacheInfo { DenseMap, CacheKeyDenseMapInfo> *baseIndexCache; DenseMap>, CacheKeyDenseMapInfo> *indexCache; OpBuilder::InsertPoint *indexInsertPoint; }; explicit ConvertTritonGPUOpToLLVMPatternBase(LLVMTypeConverter &typeConverter) : converter(&typeConverter) {} explicit ConvertTritonGPUOpToLLVMPatternBase(LLVMTypeConverter &typeConverter, const Allocation *allocation, Value smem) : converter(&typeConverter), allocation(allocation), smem(smem) {} explicit ConvertTritonGPUOpToLLVMPatternBase(LLVMTypeConverter &typeConverter, const Allocation *allocation, Value smem, IndexCacheInfo indexCacheInfo) : converter(&typeConverter), indexCacheInfo(indexCacheInfo), allocation(allocation), smem(smem) {} LLVMTypeConverter *getTypeConverter() const { return converter; } static Value getStructFromSharedMemoryObject(Location loc, const SharedMemoryObject &smemObj, ConversionPatternRewriter &rewriter) { auto elems = smemObj.getElems(); auto types = smemObj.getTypes(); auto structTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(), types); return getStructFromElements(loc, elems, rewriter, structTy); } Value getThreadId(ConversionPatternRewriter &rewriter, Location loc) const { auto llvmIndexTy = this->getTypeConverter()->getIndexType(); auto cast = rewriter.create( loc, TypeRange{llvmIndexTy}, ValueRange{rewriter.create<::mlir::gpu::ThreadIdOp>( loc, rewriter.getIndexType(), ::mlir::gpu::Dimension::x)}); Value threadId = cast.getResult(0); return threadId; } // ----------------------------------------------------------------------- // Shared memory utilities // ----------------------------------------------------------------------- template Value getSharedMemoryBase(Location loc, ConversionPatternRewriter &rewriter, T value) const { auto ptrTy = LLVM::LLVMPointerType::get( this->getTypeConverter()->convertType(rewriter.getI8Type()), 3); auto bufferId = allocation->getBufferId(value); assert(bufferId != Allocation::InvalidBufferId && "BufferId not found"); size_t offset = allocation->getOffset(bufferId); Value offVal = idx_val(offset); Value base = gep(ptrTy, smem, 
offVal); return base; } DenseMap getSwizzledSharedPtrs(Location loc, unsigned inVec, RankedTensorType srcTy, triton::gpu::SharedEncodingAttr resSharedLayout, Type resElemTy, SharedMemoryObject smemObj, ConversionPatternRewriter &rewriter, SmallVectorImpl &offsetVals, SmallVectorImpl &srcStrides) const { // This utililty computes the pointers for accessing the provided swizzled // shared memory layout `resSharedLayout`. More specifically, it computes, // for all indices (row, col) of `srcEncoding` such that idx % inVec = 0, // the pointer: ptr[(row, col)] = base + (rowOff * strides[ord[1]] + colOff) // where : // compute phase = (row // perPhase) % maxPhase // rowOff = row // colOff = colOffSwizzled + colOffOrdered // colOffSwizzled = ((col // outVec) ^ phase) * outVec // colOffOrdered = (col % outVec) // minVec * minVec // // Note 1: // ------- // Because swizzling happens at a granularity of outVec, we need to // decompose the offset into a swizzled factor and a non-swizzled (ordered) // factor // // Note 2: // ------- // If we have x, y, z of the form: // x = 0b00000xxxx // y = 0byyyyy0000 // z = 0b00000zzzz // then (x + y) XOR z = 0byyyyxxxx XOR 0b00000zzzz = (x XOR z) + y // This means that we can use some immediate offsets for shared memory // operations. auto dstPtrTy = ptr_ty(resElemTy, 3); auto dstOffset = dot(rewriter, loc, offsetVals, smemObj.strides); Value dstPtrBase = gep(dstPtrTy, smemObj.base, dstOffset); auto srcEncoding = srcTy.getEncoding(); auto srcShape = srcTy.getShape(); unsigned numElems = triton::gpu::getElemsPerThread(srcTy); // swizzling params as described in TritonGPUAttrDefs.td unsigned outVec = resSharedLayout.getVec(); unsigned perPhase = resSharedLayout.getPerPhase(); unsigned maxPhase = resSharedLayout.getMaxPhase(); // order auto inOrder = triton::gpu::getOrder(srcEncoding); auto outOrder = triton::gpu::getOrder(resSharedLayout); // tensor indices held by the current thread, as LLVM values auto srcIndices = emitIndices(loc, rewriter, srcEncoding, srcShape); // return values DenseMap ret; // cache for non-immediate offsets DenseMap cacheCol, cacheRow; unsigned minVec = std::min(outVec, inVec); for (unsigned elemIdx = 0; elemIdx < numElems; elemIdx += minVec) { // extract multi dimensional index for current element auto idx = srcIndices[elemIdx]; Value idxCol = idx[inOrder[0]]; // contiguous dimension Value idxRow = idx[inOrder[1]]; // discontiguous dimension Value strideCol = srcStrides[inOrder[0]]; Value strideRow = srcStrides[inOrder[1]]; // extract dynamic/static offset for immediate offseting unsigned immedateOffCol = 0; if (auto add = dyn_cast_or_null(idxCol.getDefiningOp())) if (auto _cst = dyn_cast_or_null( add.getRhs().getDefiningOp())) { unsigned cst = _cst.getValue().cast().getValue().getSExtValue(); unsigned key = cst % (outVec * maxPhase); cacheCol.insert({key, idxCol}); idxCol = cacheCol[key]; immedateOffCol = cst / (outVec * maxPhase) * (outVec * maxPhase); } // extract dynamic/static offset for immediate offseting unsigned immedateOffRow = 0; if (auto add = dyn_cast_or_null(idxRow.getDefiningOp())) if (auto _cst = dyn_cast_or_null( add.getRhs().getDefiningOp())) { unsigned cst = _cst.getValue().cast().getValue().getSExtValue(); unsigned key = cst % (perPhase * maxPhase); cacheRow.insert({key, idxRow}); idxRow = cacheRow[key]; immedateOffRow = cst / (perPhase * maxPhase) * (perPhase * maxPhase); } // compute phase = (row // perPhase) % maxPhase Value phase = urem(udiv(idxRow, i32_val(perPhase)), i32_val(maxPhase)); // row offset is simply row 
index Value rowOff = mul(idxRow, strideRow); // because swizzling happens at a granularity of outVec, we need to // decompose the offset into a swizzled factor and a non-swizzled // (ordered) factor: colOffSwizzled = ((col // outVec) ^ phase) * outVec // colOffOrdered = (col % outVec) // minVec * minVec Value colOffSwizzled = xor_(udiv(idxCol, i32_val(outVec)), phase); colOffSwizzled = mul(colOffSwizzled, i32_val(outVec)); Value colOffOrdered = urem(idxCol, i32_val(outVec)); colOffOrdered = udiv(colOffOrdered, i32_val(minVec)); colOffOrdered = mul(colOffOrdered, i32_val(minVec)); Value colOff = add(colOffSwizzled, colOffOrdered); // compute non-immediate offset Value offset = add(rowOff, mul(colOff, strideCol)); Value currPtr = gep(dstPtrTy, dstPtrBase, offset); // compute immediate offset Value immedateOff = add(mul(i32_val(immedateOffRow), srcStrides[inOrder[1]]), i32_val(immedateOffCol)); ret[elemIdx] = gep(dstPtrTy, currPtr, immedateOff); } return ret; } void storeDistributedToShared(Value src, Value llSrc, ArrayRef dstStrides, ArrayRef> srcIndices, Value dst, Value smemBase, Type elemTy, Location loc, ConversionPatternRewriter &rewriter) const { auto srcTy = src.getType().cast(); auto srcShape = srcTy.getShape(); assert(srcShape.size() == 2 && "Unexpected rank of storeDistributedToShared"); auto dstTy = dst.getType().cast(); auto srcDistributedLayout = srcTy.getEncoding(); if (auto mmaLayout = srcDistributedLayout.dyn_cast()) { assert((!mmaLayout.isVolta()) && "ConvertLayout MMAv1->Shared is not suppported yet"); } auto dstSharedLayout = dstTy.getEncoding().cast(); auto dstElemTy = dstTy.getElementType(); auto inOrd = triton::gpu::getOrder(srcDistributedLayout); auto outOrd = dstSharedLayout.getOrder(); unsigned inVec = inOrd == outOrd ? triton::gpu::getContigPerThread(srcDistributedLayout)[inOrd[0]] : 1; unsigned outVec = dstSharedLayout.getVec(); unsigned minVec = std::min(outVec, inVec); unsigned perPhase = dstSharedLayout.getPerPhase(); unsigned maxPhase = dstSharedLayout.getMaxPhase(); unsigned numElems = triton::gpu::getElemsPerThread(srcTy); assert(numElems == srcIndices.size()); auto inVals = LLVM::getElementsFromStruct(loc, llSrc, rewriter); auto wordTy = vec_ty(elemTy, minVec); auto elemPtrTy = ptr_ty(elemTy); Value outVecVal = i32_val(outVec); Value minVecVal = i32_val(minVec); Value word; SmallVector srcStrides = {dstStrides[0], dstStrides[1]}; SmallVector offsetVals = {i32_val(0), i32_val(0)}; SharedMemoryObject smemObj(smemBase, srcStrides, offsetVals); DenseMap sharedPtrs = getSwizzledSharedPtrs(loc, inVec, srcTy, dstSharedLayout, dstElemTy, smemObj, rewriter, offsetVals, srcStrides); std::map cache0; std::map cache1; for (unsigned i = 0; i < numElems; ++i) { if (i % minVec == 0) word = undef(wordTy); word = insert_element(wordTy, word, inVals[i], i32_val(i % minVec)); if (i % minVec == minVec - 1) { Value smemAddr = sharedPtrs[i / minVec * minVec]; smemAddr = bitcast(smemAddr, ptr_ty(wordTy, 3)); store(word, smemAddr); } } } // ----------------------------------------------------------------------- // Utilities // ----------------------------------------------------------------------- // Convert an \param index to a multi-dim coordinate given \param shape and // \param order. 
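// Worked example (illustrative values): with linear = 13, shape = [2, 8] and
// order = [1, 0] (dimension 1 fastest varying), the reordered shape is
// [8, 2], 13 decomposes into [13 % 8, 13 / 8] = [5, 1], and scattering the
// result back through `order` yields multiDim = [1, 5]. `linearize` below
// inverts this: 5 + 1 * 8 == 13.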
SmallVector delinearize(ConversionPatternRewriter &rewriter, Location loc, Value linear, ArrayRef shape, ArrayRef order) const { unsigned rank = shape.size(); assert(rank == order.size()); auto reordered = reorder(shape, order); auto reorderedMultiDim = delinearize(rewriter, loc, linear, reordered); SmallVector multiDim(rank); for (unsigned i = 0; i < rank; ++i) { multiDim[order[i]] = reorderedMultiDim[i]; } return multiDim; } SmallVector delinearize(ConversionPatternRewriter &rewriter, Location loc, Value linear, ArrayRef shape) const { unsigned rank = shape.size(); assert(rank > 0); SmallVector multiDim(rank); if (rank == 1) { multiDim[0] = linear; } else { Value remained = linear; for (auto &&en : llvm::enumerate(shape.drop_back())) { Value dimSize = idx_val(en.value()); multiDim[en.index()] = urem(remained, dimSize); remained = udiv(remained, dimSize); } multiDim[rank - 1] = remained; } return multiDim; } Value linearize(ConversionPatternRewriter &rewriter, Location loc, ArrayRef multiDim, ArrayRef shape, ArrayRef order) const { return linearize(rewriter, loc, reorder(multiDim, order), reorder(shape, order)); } Value linearize(ConversionPatternRewriter &rewriter, Location loc, ArrayRef multiDim, ArrayRef shape) const { auto rank = multiDim.size(); Value linear = idx_val(0); if (rank > 0) { linear = multiDim.back(); for (auto [dim, dimShape] : llvm::reverse(llvm::zip(multiDim.drop_back(), shape.drop_back()))) { Value dimSize = idx_val(dimShape); linear = add(mul(linear, dimSize), dim); } } return linear; } Value dot(ConversionPatternRewriter &rewriter, Location loc, ArrayRef offsets, ArrayRef strides) const { assert(offsets.size() == strides.size()); Value ret = idx_val(0); for (auto [offset, stride] : llvm::zip(offsets, strides)) { ret = add(ret, mul(offset, stride)); } return ret; } struct SmallVectorKeyInfo { static unsigned getHashValue(const SmallVector &key) { return llvm::hash_combine_range(key.begin(), key.end()); } static bool isEqual(const SmallVector &lhs, const SmallVector &rhs) { return lhs == rhs; } static SmallVector getEmptyKey() { return SmallVector(); } static SmallVector getTombstoneKey() { return {std::numeric_limits::max()}; } }; // ----------------------------------------------------------------------- // Get offsets / indices for any layout // ----------------------------------------------------------------------- SmallVector emitBaseIndexForLayout(Location loc, ConversionPatternRewriter &rewriter, const Attribute &layout, ArrayRef shape) const { IndexCacheKeyT key = std::make_pair(layout, llvm::to_vector(shape)); auto cache = indexCacheInfo.baseIndexCache; assert(cache && "baseIndexCache is nullptr"); auto insertPt = indexCacheInfo.indexInsertPoint; if (cache->count(key) > 0) { return cache->lookup(key); } else { ConversionPatternRewriter::InsertionGuard guard(rewriter); restoreInsertionPointIfSet(insertPt, rewriter); SmallVector result; if (auto blockedLayout = layout.dyn_cast()) { result = emitBaseIndexForBlockedLayout(loc, rewriter, blockedLayout, shape); } else if (auto mmaLayout = layout.dyn_cast()) { if (mmaLayout.isVolta()) result = emitBaseIndexForMmaLayoutV1(loc, rewriter, mmaLayout, shape); if (mmaLayout.isAmpere()) result = emitBaseIndexForMmaLayoutV2(loc, rewriter, mmaLayout, shape); } else { llvm_unreachable("unsupported emitBaseIndexForLayout"); } cache->insert(std::make_pair(key, result)); *insertPt = rewriter.saveInsertionPoint(); return result; } } SmallVector> emitOffsetForLayout(const Attribute &layout, ArrayRef shape) const { if (auto 
blockedLayout = layout.dyn_cast()) return emitOffsetForBlockedLayout(blockedLayout, shape); if (auto mmaLayout = layout.dyn_cast()) { if (mmaLayout.isVolta()) return emitOffsetForMmaLayoutV1(mmaLayout, shape); if (mmaLayout.isAmpere()) return emitOffsetForMmaLayoutV2(mmaLayout, shape); } llvm_unreachable("unsupported emitOffsetForLayout"); } // ----------------------------------------------------------------------- // Emit indices // ----------------------------------------------------------------------- SmallVector> emitIndices(Location loc, ConversionPatternRewriter &b, const Attribute &layout, ArrayRef shape) const { IndexCacheKeyT key(layout, llvm::to_vector(shape)); auto cache = indexCacheInfo.indexCache; assert(cache && "indexCache is nullptr"); auto insertPt = indexCacheInfo.indexInsertPoint; if (cache->count(key) > 0) { return cache->lookup(key); } else { ConversionPatternRewriter::InsertionGuard guard(b); restoreInsertionPointIfSet(insertPt, b); SmallVector> result; if (auto blocked = layout.dyn_cast()) { result = emitIndicesForDistributedLayout(loc, b, blocked, shape); } else if (auto mma = layout.dyn_cast()) { result = emitIndicesForDistributedLayout(loc, b, mma, shape); } else if (auto slice = layout.dyn_cast()) { result = emitIndicesForSliceLayout(loc, b, slice, shape); } else { llvm_unreachable( "emitIndices for layouts other than blocked & slice not " "implemented yet"); } cache->insert(std::make_pair(key, result)); *insertPt = b.saveInsertionPoint(); return result; } } private: void restoreInsertionPointIfSet(OpBuilder::InsertPoint *insertPt, ConversionPatternRewriter &rewriter) const { if (insertPt->isSet()) { rewriter.restoreInsertionPoint(*insertPt); } else { auto func = rewriter.getInsertionPoint()->getParentOfType(); rewriter.setInsertionPointToStart(&func.getBody().front()); } } // ----------------------------------------------------------------------- // Blocked layout indices // ----------------------------------------------------------------------- // Get an index-base for each dimension for a \param blocked_layout. 
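// Worked example (hypothetical layout, illustrative only): with
// sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 2],
// order = [1, 0] and a 32x32 shape, thread 45 has warpId = 1 and
// laneId = 13, which delinearize (dimension 1 fastest) to
// multiDimWarpId = [0, 1] and multiDimThreadId = [3, 1]; the base computed
// below is then
//   dim 0: 1 * (3 + 0 * 8) = 3,    dim 1: 4 * (1 + 1 * 4) = 20,
// i.e. this thread's first element sits at index (3, 20).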
SmallVector emitBaseIndexForBlockedLayout(Location loc, ConversionPatternRewriter &rewriter, const BlockedEncodingAttr &blocked_layout, ArrayRef shape) const { Value threadId = getThreadId(rewriter, loc); Value warpSize = idx_val(32); Value laneId = urem(threadId, warpSize); Value warpId = udiv(threadId, warpSize); auto sizePerThread = blocked_layout.getSizePerThread(); auto threadsPerWarp = blocked_layout.getThreadsPerWarp(); auto warpsPerCTA = blocked_layout.getWarpsPerCTA(); auto order = blocked_layout.getOrder(); unsigned rank = shape.size(); // delinearize threadId to get the base index SmallVector multiDimWarpId = delinearize(rewriter, loc, warpId, warpsPerCTA, order); SmallVector multiDimThreadId = delinearize(rewriter, loc, laneId, threadsPerWarp, order); SmallVector multiDimBase(rank); for (unsigned k = 0; k < rank; ++k) { // Wrap around multiDimWarpId/multiDimThreadId incase // shape[k] > shapePerCTA[k] auto maxWarps = ceil(shape[k], sizePerThread[k] * threadsPerWarp[k]); auto maxThreads = ceil(shape[k], sizePerThread[k]); multiDimWarpId[k] = urem(multiDimWarpId[k], idx_val(maxWarps)); multiDimThreadId[k] = urem(multiDimThreadId[k], idx_val(maxThreads)); // multiDimBase[k] = (multiDimThreadId[k] + // multiDimWarpId[k] * threadsPerWarp[k]) * // sizePerThread[k]; Value threadsPerWarpK = idx_val(threadsPerWarp[k]); Value sizePerThreadK = idx_val(sizePerThread[k]); multiDimBase[k] = mul(sizePerThreadK, add(multiDimThreadId[k], mul(multiDimWarpId[k], threadsPerWarpK))); } return multiDimBase; } SmallVector> emitOffsetForBlockedLayout(const BlockedEncodingAttr &blockedLayout, ArrayRef shape) const { auto sizePerThread = blockedLayout.getSizePerThread(); auto threadsPerWarp = blockedLayout.getThreadsPerWarp(); auto warpsPerCTA = blockedLayout.getWarpsPerCTA(); auto order = blockedLayout.getOrder(); unsigned rank = shape.size(); SmallVector shapePerCTA = getShapePerCTA(blockedLayout); SmallVector tilesPerDim(rank); for (unsigned k = 0; k < rank; ++k) tilesPerDim[k] = ceil(shape[k], shapePerCTA[k]); SmallVector> offset(rank); for (unsigned k = 0; k < rank; ++k) { // 1 block in minimum if shape[k] is less than shapePerCTA[k] for (unsigned blockOffset = 0; blockOffset < tilesPerDim[k]; ++blockOffset) for (unsigned warpOffset = 0; warpOffset < warpsPerCTA[k]; ++warpOffset) for (unsigned threadOffset = 0; threadOffset < threadsPerWarp[k]; ++threadOffset) for (unsigned elemOffset = 0; elemOffset < sizePerThread[k]; ++elemOffset) offset[k].push_back(blockOffset * sizePerThread[k] * threadsPerWarp[k] * warpsPerCTA[k] + warpOffset * sizePerThread[k] * threadsPerWarp[k] + threadOffset * sizePerThread[k] + elemOffset); } unsigned elemsPerThread = blockedLayout.getElemsPerThread(shape); unsigned totalSizePerThread = product(sizePerThread); SmallVector> reorderedOffset(elemsPerThread); for (unsigned n = 0; n < elemsPerThread; ++n) { unsigned linearNanoTileId = n / totalSizePerThread; unsigned linearNanoTileElemId = n % totalSizePerThread; SmallVector multiDimNanoTileId = getMultiDimIndex(linearNanoTileId, tilesPerDim, order); SmallVector multiDimNanoTileElemId = getMultiDimIndex( linearNanoTileElemId, sizePerThread, order); for (unsigned k = 0; k < rank; ++k) { unsigned reorderedMultiDimId = multiDimNanoTileId[k] * (sizePerThread[k] * threadsPerWarp[k] * warpsPerCTA[k]) + multiDimNanoTileElemId[k]; reorderedOffset[n].push_back(offset[k][reorderedMultiDimId]); } } return reorderedOffset; } // ----------------------------------------------------------------------- // Mma layout indices // 
----------------------------------------------------------------------- SmallVector emitBaseIndexForMmaLayoutV1(Location loc, ConversionPatternRewriter &rewriter, const MmaEncodingAttr &mmaLayout, ArrayRef shape) const { llvm_unreachable("emitIndicesForMmaLayoutV1 not implemented"); } SmallVector> emitOffsetForMmaLayoutV1(const MmaEncodingAttr &mmaLayout, ArrayRef shape) const { SmallVector> ret; for (unsigned i = 0; i < shape[0]; i += getShapePerCTA(mmaLayout, shape)[0]) { for (unsigned j = 0; j < shape[1]; j += getShapePerCTA(mmaLayout, shape)[1]) { ret.push_back({i, j}); ret.push_back({i, j + 1}); ret.push_back({i + 2, j}); ret.push_back({i + 2, j + 1}); ret.push_back({i, j + 8}); ret.push_back({i, j + 9}); ret.push_back({i + 2, j + 8}); ret.push_back({i + 2, j + 9}); } } return ret; } SmallVector emitBaseIndexForMmaLayoutV2(Location loc, ConversionPatternRewriter &rewriter, const MmaEncodingAttr &mmaLayout, ArrayRef shape) const { auto _warpsPerCTA = mmaLayout.getWarpsPerCTA(); assert(_warpsPerCTA.size() == 2); SmallVector warpsPerCTA = {idx_val(_warpsPerCTA[0]), idx_val(_warpsPerCTA[1])}; Value threadId = getThreadId(rewriter, loc); Value warpSize = idx_val(32); Value laneId = urem(threadId, warpSize); Value warpId = udiv(threadId, warpSize); Value warpId0 = urem(warpId, warpsPerCTA[0]); Value warpId1 = urem(udiv(warpId, warpsPerCTA[0]), warpsPerCTA[1]); Value offWarp0 = mul(warpId0, idx_val(16)); Value offWarp1 = mul(warpId1, idx_val(8)); SmallVector multiDimBase(2); multiDimBase[0] = add(udiv(laneId, idx_val(4)), offWarp0); multiDimBase[1] = add(mul(idx_val(2), urem(laneId, idx_val(4))), offWarp1); return multiDimBase; } SmallVector> emitOffsetForMmaLayoutV2(const MmaEncodingAttr &mmaLayout, ArrayRef shape) const { SmallVector> ret; for (unsigned i = 0; i < shape[0]; i += getShapePerCTA(mmaLayout)[0]) { for (unsigned j = 0; j < shape[1]; j += getShapePerCTA(mmaLayout)[1]) { ret.push_back({i, j}); ret.push_back({i, j + 1}); ret.push_back({i + 8, j}); ret.push_back({i + 8, j + 1}); } } return ret; } // Emit indices calculation within each ConversionPattern, and returns a // [elemsPerThread X rank] index matrix. // TODO: [phil] redundant indices computation do not appear to hurt // performance much, but they could still significantly slow down // computations. 
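// For illustration (hypothetical blocked layout): if a thread's base index
// is (3, 20) and the layout-constant offsets are {0, 16} along dimension 0
// and {0, 1, 2, 3} along dimension 1, the routine below materializes the
// eight indices (3, 20)..(3, 23) and (19, 20)..(19, 23), i.e.
//   multiDimIdx[n][k] = multiDimBase[k] + offset[n][k].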
SmallVector> emitIndicesForDistributedLayout( Location loc, ConversionPatternRewriter &rewriter, const Attribute &layout, ArrayRef shape) const { if (auto mmaLayout = layout.template dyn_cast()) { assert(!mmaLayout.isVolta()); } // step 1, delinearize threadId to get the base index auto multiDimBase = emitBaseIndexForLayout(loc, rewriter, layout, shape); // step 2, get offset of each element auto offset = emitOffsetForLayout(layout, shape); // step 3, add offset to base, and reorder the sequence of indices to // guarantee that elems in the same sizePerThread are adjacent in order unsigned rank = shape.size(); unsigned elemsPerThread = offset.size(); SmallVector> multiDimIdx(elemsPerThread, SmallVector(rank)); for (unsigned n = 0; n < elemsPerThread; ++n) for (unsigned k = 0; k < rank; ++k) multiDimIdx[n][k] = add(multiDimBase[k], idx_val(offset[n][k])); return multiDimIdx; } SmallVector> emitIndicesForSliceLayout(Location loc, ConversionPatternRewriter &rewriter, const SliceEncodingAttr &sliceLayout, ArrayRef shape) const { auto parent = sliceLayout.getParent(); unsigned dim = sliceLayout.getDim(); size_t rank = shape.size(); auto parentIndices = emitIndices(loc, rewriter, parent, sliceLayout.paddedShape(shape)); unsigned numIndices = parentIndices.size(); SmallVector> resultIndices; for (unsigned i = 0; i < numIndices; ++i) { SmallVector indices = parentIndices[i]; indices.erase(indices.begin() + dim); resultIndices.push_back(indices); } return resultIndices; } protected: LLVMTypeConverter *converter; const Allocation *allocation; Value smem; IndexCacheInfo indexCacheInfo; }; template class ConvertTritonGPUOpToLLVMPattern : public ConvertOpToLLVMPattern, public ConvertTritonGPUOpToLLVMPatternBase { public: using OpAdaptor = typename SourceOp::Adaptor; explicit ConvertTritonGPUOpToLLVMPattern(LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) : ConvertOpToLLVMPattern(typeConverter, benefit), ConvertTritonGPUOpToLLVMPatternBase(typeConverter) {} explicit ConvertTritonGPUOpToLLVMPattern(LLVMTypeConverter &typeConverter, const Allocation *allocation, Value smem, PatternBenefit benefit = 1) : ConvertOpToLLVMPattern(typeConverter, benefit), ConvertTritonGPUOpToLLVMPatternBase(typeConverter, allocation, smem) {} explicit ConvertTritonGPUOpToLLVMPattern(LLVMTypeConverter &typeConverter, const Allocation *allocation, Value smem, IndexCacheInfo indexCacheInfo, PatternBenefit benefit = 1) : ConvertOpToLLVMPattern(typeConverter, benefit), ConvertTritonGPUOpToLLVMPatternBase(typeConverter, allocation, smem, indexCacheInfo) {} protected: LLVMTypeConverter *getTypeConverter() const { return ((ConvertTritonGPUOpToLLVMPatternBase *)this)->getTypeConverter(); } }; #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp000066400000000000000000000425151440023377100247500ustar00rootroot00000000000000#include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h" #include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Pass/Pass.h" #include "triton/Analysis/Allocation.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Membar.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include 
"ConvertLayoutOpToLLVM.h" #include "DotOpToLLVM.h" #include "ElementwiseOpToLLVM.h" #include "LoadStoreOpToLLVM.h" #include "ReduceOpToLLVM.h" #include "TritonGPUToLLVM.h" #include "TypeConverter.h" #include "ViewOpToLLVM.h" using namespace mlir; using namespace mlir::triton; #define GEN_PASS_CLASSES #include "triton/Conversion/Passes.h.inc" namespace mlir { class TritonLLVMConversionTarget : public ConversionTarget { public: explicit TritonLLVMConversionTarget(MLIRContext &ctx) : ConversionTarget(ctx) { addLegalDialect(); addLegalDialect(); addIllegalDialect(); addIllegalDialect(); addIllegalDialect(); addIllegalDialect(); addLegalOp(); } }; class TritonLLVMFunctionConversionTarget : public ConversionTarget { public: explicit TritonLLVMFunctionConversionTarget(MLIRContext &ctx) : ConversionTarget(ctx) { addLegalDialect(); addLegalDialect(); addIllegalOp(); addLegalOp(); } }; } // namespace mlir namespace { /// FuncOp legalization pattern that converts MemRef arguments to pointers to /// MemRef descriptors (LLVM struct data types) containing all the MemRef type /// information. struct FuncOpConversion : public FuncOpConversionBase { FuncOpConversion(LLVMTypeConverter &converter, int numWarps, PatternBenefit benefit) : FuncOpConversionBase(converter, benefit), numWarps(numWarps) {} LogicalResult matchAndRewrite(FuncOp funcOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto newFuncOp = convertFuncOpToLLVMFuncOp(funcOp, rewriter); if (!newFuncOp) return failure(); auto ctx = funcOp->getContext(); // Set an attribute to indicate this function is a kernel entry. newFuncOp->setAttr("nvvm.kernel", rewriter.getIntegerAttr(type::u1Ty(ctx), 1)); // Set an attribute for maxntidx, it could be used in latter LLVM codegen // for `nvvm.annotation` metadata. newFuncOp->setAttr("nvvm.maxntid", rewriter.getIntegerAttr(i32_ty, 32 * numWarps)); rewriter.eraseOp(funcOp); return success(); } private: int numWarps{0}; }; class ConvertTritonGPUToLLVM : public ConvertTritonGPUToLLVMBase { public: explicit ConvertTritonGPUToLLVM(int computeCapability) : computeCapability(computeCapability) {} void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp mod = getOperation(); mlir::LowerToLLVMOptions option(context); option.overrideIndexBitwidth(32); TritonGPUToLLVMTypeConverter typeConverter(context, option); TritonLLVMFunctionConversionTarget funcTarget(*context); TritonLLVMConversionTarget target(*context); int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); // Step 1: Decompose unoptimized layout conversions to use shared memory // Step 2: Decompose insert_slice_async to use load + insert_slice for // pre-Ampere architectures or unsupported vectorized load sizes // Step 3: Allocate shared memories and insert barriers // Step 4: Convert SCF to CFG // Step 5: Convert FuncOp to LLVMFuncOp via partial conversion // Step 6: Get axis and shared memory info // Step 7: Convert the rest of ops via partial conversion // // The reason for putting step 3 before step 4 is that the membar // analysis currently only supports SCF but not CFG. The reason for a // separation between 5/7 is that, step 6 is out of the scope of Dialect // Conversion, thus we need to make sure the smem is not revised during the // conversion of step 7. 
// Step 1 decomposeMmaToDotOperand(mod, numWarps); decomposeBlockedToDotOperand(mod); // Step 2 decomposeInsertSliceAsyncOp(mod); // Step 3 Allocation allocation(mod); MembarAnalysis membarPass(&allocation); membarPass.run(); // Step 4 RewritePatternSet scf_patterns(context); mlir::populateLoopToStdConversionPatterns(scf_patterns); mlir::ConversionTarget scf_target(*context); scf_target.addIllegalOp(); scf_target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); if (failed( applyPartialConversion(mod, scf_target, std::move(scf_patterns)))) return signalPassFailure(); // Step 5 RewritePatternSet func_patterns(context); func_patterns.add(typeConverter, numWarps, /*benefit=*/1); if (failed( applyPartialConversion(mod, funcTarget, std::move(func_patterns)))) return signalPassFailure(); // Step 6 - get axis and shared memory info AxisInfoAnalysis axisInfoAnalysis(mod.getContext()); axisInfoAnalysis.run(mod); initSharedMemory(allocation.getSharedMemorySize(), typeConverter); mod->setAttr("triton_gpu.shared", mlir::IntegerAttr::get(mlir::IntegerType::get(context, 32), allocation.getSharedMemorySize())); // Step 7 - rewrite rest of ops // We set a higher benefit here to ensure triton's patterns runs before // arith patterns for some encoding not supported by the community // patterns. OpBuilder::InsertPoint indexInsertPoint; ConvertTritonGPUOpToLLVMPatternBase::IndexCacheInfo indexCacheInfo{ &baseIndexCache, &indexCache, &indexInsertPoint}; RewritePatternSet patterns(context); // Normal conversions populateTritonGPUToLLVMPatterns(typeConverter, patterns, numWarps, axisInfoAnalysis, &allocation, smem, indexCacheInfo, /*benefit=*/10); // ConvertLayoutOp populateConvertLayoutOpToLLVMPatterns(typeConverter, patterns, numWarps, axisInfoAnalysis, &allocation, smem, indexCacheInfo, /*benefit=*/10); // DotOp populateDotOpToLLVMPatterns(typeConverter, patterns, numWarps, axisInfoAnalysis, &allocation, smem, /*benefit=*/10); // ElementwiseOp populateElementwiseOpToLLVMPatterns(typeConverter, patterns, numWarps, axisInfoAnalysis, &allocation, smem, /*benefit=*/10); // LoadStoreOp populateLoadStoreOpToLLVMPatterns(typeConverter, patterns, numWarps, axisInfoAnalysis, &allocation, smem, indexCacheInfo, /*benefit=*/10); // ReduceOp populateReduceOpToLLVMPatterns(typeConverter, patterns, numWarps, axisInfoAnalysis, &allocation, smem, indexCacheInfo, /*benefit=*/10); // ViewOp populateViewOpToLLVMPatterns(typeConverter, patterns, numWarps, axisInfoAnalysis, &allocation, smem, /*benefit=*/10); // Add arith/math's patterns to help convert scalar expression to LLVM. mlir::arith::populateArithmeticToLLVMConversionPatterns(typeConverter, patterns); mlir::populateMathToLLVMConversionPatterns(typeConverter, patterns); mlir::populateStdToLLVMConversionPatterns(typeConverter, patterns); mlir::populateGpuToNVVMConversionPatterns(typeConverter, patterns); if (failed(applyPartialConversion(mod, target, std::move(patterns)))) return signalPassFailure(); } private: Value smem; using IndexCacheKeyT = std::pair>; DenseMap, CacheKeyDenseMapInfo> baseIndexCache; DenseMap>, CacheKeyDenseMapInfo> indexCache; int computeCapability{}; void initSharedMemory(size_t size, TritonGPUToLLVMTypeConverter &typeConverter) { ModuleOp mod = getOperation(); OpBuilder b(mod.getBodyRegion()); auto loc = mod.getLoc(); auto elemTy = typeConverter.convertType(b.getIntegerType(8)); // Set array size 0 and external linkage indicates that we use dynamic // shared allocation to allow a larger shared memory size for each kernel. 
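    // Roughly, this materializes module-level IR of the form (the exact
    // printed syntax depends on the MLIR version in use):
    //   llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
    // and each kernel then takes the address of @global_smem and bitcasts it
    // to an i8 pointer in address space 3.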
auto arrayTy = LLVM::LLVMArrayType::get(elemTy, 0); auto global = b.create( loc, arrayTy, /*isConstant=*/false, LLVM::Linkage::External, "global_smem", /*value=*/Attribute(), /*alignment=*/0, mlir::gpu::GPUDialect::getWorkgroupAddressSpace()); SmallVector funcs; mod.walk([&](LLVM::LLVMFuncOp func) { funcs.push_back(func); }); assert(funcs.size() == 1 && "Inliner pass is expected before TritonGPUToLLVM"); b.setInsertionPointToStart(&funcs[0].getBody().front()); smem = b.create(loc, global); auto ptrTy = LLVM::LLVMPointerType::get(typeConverter.convertType(b.getI8Type()), 3); smem = b.create(loc, ptrTy, smem); } void decomposeMmaToDotOperand(ModuleOp mod, int numWarps) const { // Replace `mma -> dot_op` with `mma -> blocked -> dot_op` // unless certain conditions are met mod.walk([&](triton::gpu::ConvertLayoutOp cvtOp) -> void { OpBuilder builder(cvtOp); auto srcType = cvtOp.getOperand().getType().cast(); auto dstType = cvtOp.getType().cast(); auto srcMma = srcType.getEncoding().dyn_cast(); auto dstDotOp = dstType.getEncoding().dyn_cast(); if (srcMma && dstDotOp && !isMmaToDotShortcut(srcMma, dstDotOp)) { auto tmpType = RankedTensorType::get( dstType.getShape(), dstType.getElementType(), triton::gpu::BlockedEncodingAttr::get( mod.getContext(), srcType.getShape(), getSizePerThread(srcMma), getOrder(srcMma), numWarps)); auto tmp = builder.create( cvtOp.getLoc(), tmpType, cvtOp.getOperand()); auto newConvert = builder.create( cvtOp.getLoc(), dstType, tmp); cvtOp.replaceAllUsesWith(newConvert.getResult()); cvtOp.erase(); } }); } void decomposeBlockedToDotOperand(ModuleOp mod) const { // Replace `blocked -> dot_op` with `blocked -> shared -> dot_op` // because the codegen doesn't handle `blocked -> dot_op` directly mod.walk([&](triton::gpu::ConvertLayoutOp cvtOp) -> void { OpBuilder builder(cvtOp); auto srcType = cvtOp.getOperand().getType().cast(); auto dstType = cvtOp.getType().cast(); auto srcBlocked = srcType.getEncoding().dyn_cast(); auto dstDotOp = dstType.getEncoding().dyn_cast(); if (srcBlocked && dstDotOp) { auto tmpType = RankedTensorType::get( dstType.getShape(), dstType.getElementType(), triton::gpu::SharedEncodingAttr::get( mod.getContext(), dstDotOp, srcType.getShape(), getOrder(srcBlocked), srcType.getElementType())); auto tmp = builder.create( cvtOp.getLoc(), tmpType, cvtOp.getOperand()); auto newConvert = builder.create( cvtOp.getLoc(), dstType, tmp); cvtOp.replaceAllUsesWith(newConvert.getResult()); cvtOp.erase(); } }); } void decomposeInsertSliceAsyncOp(ModuleOp mod) const { AxisInfoAnalysis axisInfoAnalysis(mod.getContext()); axisInfoAnalysis.run(mod); // TODO(Keren): This is a hacky knob that may cause performance regression // when decomposition has been performed. We should remove this knob once we // have thorough analysis on async wait. Currently, we decompose // `insert_slice_async` into `load` and `insert_slice` without knowing which // `async_wait` is responsible for the `insert_slice_async`. To guarantee // correctness, we blindly set the `async_wait` to wait for all async ops. // // There are two options to improve this: // 1. We can perform a dataflow analysis to find the `async_wait` that is // responsible for the `insert_slice_async` in the backend. // 2. We can modify the pipeline to perform the decomposition before the // `async_wait` is inserted. However, it is also risky because we don't know // the correct vectorized shape yet in the pipeline pass. 
Making the // pipeline pass aware of the vectorization could introduce additional // dependencies on the AxisInfoAnalysis and the Coalesce analysis. bool decomposed = false; // insert_slice_async %src, %dst, %idx, %mask, %other // => // %tmp = load %src, %mask, %other // %res = insert_slice %tmp into %dst[%idx] mod.walk([&](triton::gpu::InsertSliceAsyncOp insertSliceAsyncOp) -> void { OpBuilder builder(insertSliceAsyncOp); // Get the vectorized load size auto src = insertSliceAsyncOp.src(); auto dst = insertSliceAsyncOp.dst(); auto srcTy = src.getType().cast(); auto dstTy = dst.getType().cast(); auto srcBlocked = srcTy.getEncoding().dyn_cast(); auto resSharedLayout = dstTy.getEncoding().dyn_cast(); auto resElemTy = dstTy.getElementType(); unsigned inVec = axisInfoAnalysis.getPtrContiguity(src); unsigned outVec = resSharedLayout.getVec(); unsigned minVec = std::min(outVec, inVec); auto maxBitWidth = std::max(128, resElemTy.getIntOrFloatBitWidth()); auto vecBitWidth = resElemTy.getIntOrFloatBitWidth() * minVec; auto bitWidth = std::min(maxBitWidth, vecBitWidth); auto byteWidth = bitWidth / 8; // If the load byte width is not eligible or the current compute // capability does not support async copy, then we do decompose if (triton::gpu::InsertSliceAsyncOp::getEligibleLoadByteWidth( computeCapability) .contains(byteWidth)) return; // load auto tmpTy = RankedTensorType::get(srcTy.getShape(), resElemTy, srcBlocked); auto loadOp = builder.create( insertSliceAsyncOp.getLoc(), tmpTy, insertSliceAsyncOp.src(), insertSliceAsyncOp.mask(), insertSliceAsyncOp.other(), insertSliceAsyncOp.cache(), insertSliceAsyncOp.evict(), insertSliceAsyncOp.isVolatile()); // insert_slice auto axis = insertSliceAsyncOp.axis(); auto intAttr = [&](int64_t v) { return builder.getI64IntegerAttr(v); }; auto offsets = SmallVector(dstTy.getRank(), intAttr(0)); auto sizes = SmallVector(dstTy.getRank(), intAttr(1)); auto strides = SmallVector(dstTy.getRank(), intAttr(1)); offsets[axis] = insertSliceAsyncOp.index(); for (size_t i = 0; i < dstTy.getRank(); i++) { if (i != axis) sizes[i] = intAttr(dstTy.getShape()[i]); } auto insertSliceOp = builder.create( insertSliceAsyncOp.getLoc(), loadOp, insertSliceAsyncOp.dst(), offsets, sizes, strides); // Replace insertSliceAsyncOp.replaceAllUsesWith(insertSliceOp.getResult()); insertSliceAsyncOp.erase(); decomposed = true; }); mod.walk([&](triton::gpu::AsyncCommitGroupOp asyncCommitGroupOp) -> void { if (!triton::gpu::AsyncCommitGroupOp::isSupported(computeCapability)) asyncCommitGroupOp.erase(); }); mod.walk([&](triton::gpu::AsyncWaitOp asyncWaitOp) -> void { if (!triton::gpu::AsyncWaitOp::isSupported(computeCapability)) { // async wait is supported in Ampere and later asyncWaitOp.erase(); } else if (decomposed) { // Wait for all previous async ops OpBuilder builder(asyncWaitOp); auto newAsyncWaitOp = builder.create(asyncWaitOp.getLoc(), 0); asyncWaitOp.erase(); } }); } }; } // anonymous namespace namespace mlir { namespace triton { std::unique_ptr> createConvertTritonGPUToLLVMPass(int computeCapability) { return std::make_unique<::ConvertTritonGPUToLLVM>(computeCapability); } } // namespace triton } // namespace mlir triton-2.0.0/lib/Conversion/TritonGPUToLLVM/TypeConverter.h000066400000000000000000000141471440023377100235460ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_TYPECONVERTER_H #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "triton/Conversion/MLIRTypes.h" #include "DotOpHelpers.h" 
#include "Utility.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::DotOpFMAConversionHelper; using ::mlir::LLVM::DotOpMmaV1ConversionHelper; using ::mlir::LLVM::MMA16816ConversionHelper; using ::mlir::triton::gpu::BlockedEncodingAttr; using ::mlir::triton::gpu::DotOperandEncodingAttr; using ::mlir::triton::gpu::getElemsPerThread; using ::mlir::triton::gpu::MmaEncodingAttr; using ::mlir::triton::gpu::SharedEncodingAttr; using ::mlir::triton::gpu::SliceEncodingAttr; class TritonGPUToLLVMTypeConverter : public LLVMTypeConverter { public: using TypeConverter::convertType; TritonGPUToLLVMTypeConverter(MLIRContext *ctx, LowerToLLVMOptions &option, const DataLayoutAnalysis *analysis = nullptr) : LLVMTypeConverter(ctx, option, analysis) { addConversion([&](triton::PointerType type) -> llvm::Optional { return convertTritonPointerType(type); }); addConversion([&](RankedTensorType type) -> llvm::Optional { return convertTritonTensorType(type); }); // Internally store float8 as int8 addConversion([&](triton::Float8Type type) -> llvm::Optional { return IntegerType::get(type.getContext(), 8); }); // Internally store bfloat16 as int16 addConversion([&](BFloat16Type type) -> llvm::Optional { return IntegerType::get(type.getContext(), 16); }); } Type convertTritonPointerType(triton::PointerType type) { // Recursively translate pointee type return LLVM::LLVMPointerType::get(convertType(type.getPointeeType()), type.getAddressSpace()); } llvm::Optional convertTritonTensorType(RankedTensorType type) { auto ctx = type.getContext(); Attribute layout = type.getEncoding(); SmallVector shape(type.getShape().begin(), type.getShape().end()); if (layout && (layout.isa() || layout.isa() || layout.isa())) { unsigned numElementsPerThread = getElemsPerThread(type); SmallVector types(numElementsPerThread, convertType(type.getElementType())); return LLVM::LLVMStructType::getLiteral(ctx, types); } else if (auto shared_layout = layout.dyn_cast_or_null()) { SmallVector types; // base ptr auto ptrType = LLVM::LLVMPointerType::get(convertType(type.getElementType()), 3); types.push_back(ptrType); // shape dims auto rank = type.getRank(); // offsets + strides for (auto i = 0; i < rank * 2; i++) { types.push_back(IntegerType::get(ctx, 32)); } return LLVM::LLVMStructType::getLiteral(ctx, types); } else if (auto dotOpLayout = layout.dyn_cast_or_null()) { if (dotOpLayout.getParent() .isa()) { // for parent is blocked layout int numElemsPerThread = DotOpFMAConversionHelper::getNumElemsPerThread(shape, dotOpLayout); return LLVM::LLVMStructType::getLiteral( ctx, SmallVector(numElemsPerThread, type::f32Ty(ctx))); } else { // for parent is MMA layout auto mmaLayout = dotOpLayout.getParent().cast(); auto wpt = mmaLayout.getWarpsPerCTA(); Type elemTy = convertType(type.getElementType()); if (mmaLayout.isAmpere()) { const llvm::DenseMap targetTyMap = { {32, vec_ty(elemTy, 1)}, {16, vec_ty(elemTy, 2)}, {8, vec_ty(elemTy, 4)}, }; Type targetTy; if (targetTyMap.count(elemTy.getIntOrFloatBitWidth())) { targetTy = targetTyMap.lookup(elemTy.getIntOrFloatBitWidth()); // <2xi16>/<4xi8> => i32 // We are doing this because NVPTX inserts extra integer instrs to // pack & unpack vectors of sub-word integers // Note: this needs to be synced with // DotOpMmaV2ConversionHelper::loadX4 if (elemTy.isa() && (elemTy.getIntOrFloatBitWidth() == 8 || elemTy.getIntOrFloatBitWidth() == 16)) targetTy = IntegerType::get(ctx, 32); } else { assert(false && "Unsupported element type"); } if (dotOpLayout.getOpIdx() == 0) { // $a auto elems = 
MMA16816ConversionHelper::getANumElemsPerThread(type, wpt[0]); return struct_ty(SmallVector(elems, targetTy)); } if (dotOpLayout.getOpIdx() == 1) { // $b auto elems = MMA16816ConversionHelper::getBNumElemsPerThread(type, wpt[1]); return struct_ty(SmallVector(elems, targetTy)); } } if (mmaLayout.isVolta()) { auto [isARow, isBRow, isAVec4, isBVec4, mmaId] = mmaLayout.decodeVoltaLayoutStates(); DotOpMmaV1ConversionHelper helper(mmaLayout); if (dotOpLayout.getOpIdx() == 0) { // $a DotOpMmaV1ConversionHelper::AParam param(isARow, isAVec4); int elems = helper.numElemsPerThreadA(shape, isARow, isAVec4, param.vec); Type x2Ty = vec_ty(elemTy, 2); return struct_ty(SmallVector(elems, x2Ty)); } if (dotOpLayout.getOpIdx() == 1) { // $b DotOpMmaV1ConversionHelper::BParam param(isBRow, isBVec4); int elems = helper.numElemsPerThreadB(shape, isBRow, isBVec4, param.vec); Type x2Ty = vec_ty(elemTy, 2); return struct_ty(SmallVector(elems, x2Ty)); } } } llvm::errs() << "Unexpected dot operand layout detected in " "TritonToLLVMTypeConverter"; return llvm::None; } return llvm::None; } }; #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/Utility.cpp000066400000000000000000000122431440023377100227260ustar00rootroot00000000000000#include "Utility.h" namespace mlir { namespace LLVM { using namespace mlir::triton; Value getStructFromElements(Location loc, ValueRange resultVals, ConversionPatternRewriter &rewriter, Type structType) { if (!structType.isa()) { return *resultVals.begin(); } Value llvmStruct = rewriter.create(loc, structType); for (const auto &v : llvm::enumerate(resultVals)) { assert(v.value() && "can not insert null values"); llvmStruct = insert_val(structType, llvmStruct, v.value(), rewriter.getI64ArrayAttr(v.index())); } return llvmStruct; } SmallVector getElementsFromStruct(Location loc, Value llvmStruct, ConversionPatternRewriter &rewriter) { if (llvmStruct.getType().isIntOrIndexOrFloat() || llvmStruct.getType().isa() || llvmStruct.getType().isa()) return {llvmStruct}; ArrayRef types = llvmStruct.getType().cast().getBody(); SmallVector results(types.size()); for (unsigned i = 0; i < types.size(); ++i) { Type type = types[i]; results[i] = extract_val(type, llvmStruct, i64_arr_attr(i)); } return results; } Value createConstantI32(Location loc, PatternRewriter &rewriter, int32_t v) { auto i32ty = rewriter.getIntegerType(32); return rewriter.create(loc, i32ty, IntegerAttr::get(i32ty, v)); } Value createConstantF32(Location loc, PatternRewriter &rewriter, float v) { auto type = type::f32Ty(rewriter.getContext()); return rewriter.create(loc, type, rewriter.getF32FloatAttr(v)); } Value createConstantF64(Location loc, PatternRewriter &rewriter, float v) { auto type = type::f64Ty(rewriter.getContext()); return rewriter.create(loc, type, rewriter.getF64FloatAttr(v)); } // Create an index type constant. Value createIndexConstant(OpBuilder &builder, Location loc, TypeConverter *converter, int64_t value) { Type ty = converter->convertType(builder.getIndexType()); return builder.create(loc, ty, builder.getIntegerAttr(ty, value)); } // Create an integer constant of \param width bits. 
Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width, int64_t value) { Type ty = builder.getIntegerType(width); return builder.create(loc, ty, builder.getIntegerAttr(ty, value)); } SharedMemoryObject getSharedMemoryObjectFromStruct(Location loc, Value llvmStruct, ConversionPatternRewriter &rewriter) { auto elems = getElementsFromStruct(loc, llvmStruct, rewriter); auto rank = (elems.size() - 1) / 2; return {/*base=*/elems[0], /*strides=*/{elems.begin() + 1, elems.begin() + 1 + rank}, /*offsets=*/{elems.begin() + 1 + rank, elems.end()}}; } SmallVector getStridesFromShapeAndOrder(ArrayRef shape, ArrayRef order, Location loc, ConversionPatternRewriter &rewriter) { auto rank = shape.size(); SmallVector strides(rank); int64_t stride = 1; for (auto idx : order) { strides[idx] = i32_val(stride); stride *= shape[idx]; } return strides; } Value storeShared(ConversionPatternRewriter &rewriter, Location loc, Value ptr, Value val, Value pred) { MLIRContext *ctx = rewriter.getContext(); unsigned bits = val.getType().getIntOrFloatBitWidth(); const char *c = bits == 64 ? "l" : (bits == 16 ? "h" : "r"); PTXBuilder builder; auto *ptrOpr = builder.newAddrOperand(ptr, "r"); auto *valOpr = builder.newOperand(val, c); auto &st = builder.create<>("st")->shared().b(bits); st(ptrOpr, valOpr).predicate(pred, "b"); return builder.launch(rewriter, loc, void_ty(ctx)); } Value shflSync(Location loc, ConversionPatternRewriter &rewriter, Value val, int i) { unsigned bits = val.getType().getIntOrFloatBitWidth(); if (bits == 64) { Type vecTy = vec_ty(f32_ty, 2); Value vec = bitcast(val, vecTy); Value val0 = extract_element(f32_ty, vec, i32_val(0)); Value val1 = extract_element(f32_ty, vec, i32_val(1)); val0 = shflSync(loc, rewriter, val0, i); val1 = shflSync(loc, rewriter, val1, i); vec = undef(vecTy); vec = insert_element(vecTy, vec, val0, i32_val(0)); vec = insert_element(vecTy, vec, val1, i32_val(1)); return bitcast(vec, val.getType()); } PTXBuilder builder; auto &shfl = builder.create("shfl.sync")->o("bfly").o("b32"); auto *dOpr = builder.newOperand("=r"); auto *aOpr = builder.newOperand(val, "r"); auto *bOpr = builder.newConstantOperand(i); auto *cOpr = builder.newConstantOperand("0x1f"); auto *maskOpr = builder.newConstantOperand("0xffffffff"); shfl(dOpr, aOpr, bOpr, cOpr, maskOpr); return builder.launch(rewriter, loc, val.getType(), false); } } // namespace LLVM } // namespace mlir triton-2.0.0/lib/Conversion/TritonGPUToLLVM/Utility.h000066400000000000000000000301571440023377100223770ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_UTILITY_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_UTILITY_H #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "triton/Analysis/Utility.h" #include "triton/Conversion/MLIRTypes.h" #include "triton/Conversion/TritonGPUToLLVM/PTXAsmFormat.h" // Shortcuts for some commonly used LLVM ops to keep code simple and intuitive // Operators #define inttoptr(...) rewriter.create(loc, __VA_ARGS__) #define ptrtoint(...) rewriter.create(loc, __VA_ARGS__) #define zext(...) rewriter.create(loc, __VA_ARGS__) #define udiv(...) rewriter.create(loc, __VA_ARGS__) #define urem(...) rewriter.create(loc, __VA_ARGS__) #define add(...) rewriter.create(loc, __VA_ARGS__) #define sub(...) rewriter.create(loc, __VA_ARGS__) #define fadd(...) rewriter.create(loc, __VA_ARGS__) #define mul(...) rewriter.create(loc, __VA_ARGS__) #define fmul(...) rewriter.create(loc, __VA_ARGS__) #define smax(...) 
rewriter.create(loc, __VA_ARGS__) #define umax(...) rewriter.create(loc, __VA_ARGS__) #define fmax(...) rewriter.create(loc, __VA_ARGS__) #define smin(...) rewriter.create(loc, __VA_ARGS__) #define umin(...) rewriter.create(loc, __VA_ARGS__) #define fmin(...) rewriter.create(loc, __VA_ARGS__) #define and_(...) rewriter.create(loc, __VA_ARGS__) #define xor_(...) rewriter.create(loc, __VA_ARGS__) #define bitcast(val__, type__) \ rewriter.create(loc, type__, val__) #define gep(...) rewriter.create(loc, __VA_ARGS__) #define ptr_ty(...) LLVM::LLVMPointerType::get(__VA_ARGS__) #define insert_val(...) rewriter.create(loc, __VA_ARGS__) #define extract_val(...) rewriter.create(loc, __VA_ARGS__) #define insert_element(...) \ rewriter.create(loc, __VA_ARGS__) #define extract_element(...) \ rewriter.create(loc, __VA_ARGS__) #define load(...) rewriter.create(loc, __VA_ARGS__) #define store(val, ptr) rewriter.create(loc, val, ptr) #define fcmp_ogt(lhs, rhs) \ rewriter.create(loc, rewriter.getI1Type(), \ LLVM::FCmpPredicate::ogt, lhs, rhs) #define fcmp_olt(lhs, rhs) \ rewriter.create(loc, rewriter.getI1Type(), \ LLVM::FCmpPredicate::olt, lhs, rhs) #define icmp_eq(...) \ rewriter.create(loc, LLVM::ICmpPredicate::eq, __VA_ARGS__) #define icmp_ne(...) \ rewriter.create(loc, LLVM::ICmpPredicate::ne, __VA_ARGS__) #define icmp_slt(...) \ rewriter.create(loc, LLVM::ICmpPredicate::slt, __VA_ARGS__) #define icmp_sle(...) \ rewriter.create(loc, LLVM::ICmpPredicate::sle, __VA_ARGS__) #define icmp_sgt(...) \ rewriter.create(loc, LLVM::ICmpPredicate::sgt, __VA_ARGS__) #define icmp_sge(...) \ rewriter.create(loc, LLVM::ICmpPredicate::sge, __VA_ARGS__) #define icmp_ult(...) \ rewriter.create(loc, LLVM::ICmpPredicate::ult, __VA_ARGS__) #define icmp_ule(...) \ rewriter.create(loc, LLVM::ICmpPredicate::ule, __VA_ARGS__) #define icmp_ugt(...) \ rewriter.create(loc, LLVM::ICmpPredicate::ugt, __VA_ARGS__) #define icmp_uge(...) \ rewriter.create(loc, LLVM::ICmpPredicate::uge, __VA_ARGS__) #define select(...) rewriter.create(loc, __VA_ARGS__) #define address_of(...) rewriter.create(loc, __VA_ARGS__) #define barrier() rewriter.create(loc) #define undef(...) rewriter.create(loc, __VA_ARGS__) // Types #define i32_ty rewriter.getIntegerType(32) #define i16_ty rewriter.getIntegerType(16) #define ui32_ty rewriter.getIntegerType(32, false) #define f16_ty rewriter.getF16Type() #define bf16_ty rewriter.getBF16Type() #define i8_ty rewriter.getIntegerType(8) #define f32_ty rewriter.getF32Type() #define f64_ty rewriter.getF64Type() #define vec_ty(type, num) VectorType::get(num, type) #define f32_val(...) LLVM::createConstantF32(loc, rewriter, __VA_ARGS__) #define f64_val(...) LLVM::createConstantF64(loc, rewriter, __VA_ARGS__) #define void_ty(ctx) LLVM::LLVMVoidType::get(ctx) #define struct_ty(...) LLVM::LLVMStructType::getLiteral(ctx, __VA_ARGS__) #define array_ty(elemTy, count) LLVM::LLVMArrayType::get(elemTy, count) // Constants #define i32_val(...) LLVM::createConstantI32(loc, rewriter, __VA_ARGS__) #define int_val(width, val) \ LLVM::createLLVMIntegerConstant(rewriter, loc, width, val) #define idx_val(...) \ LLVM::createIndexConstant(rewriter, loc, this->getTypeConverter(), \ __VA_ARGS__) #define tid_val() getThreadId(rewriter, loc) // Attributes #define i32_arr_attr(...) rewriter.getI32ArrayAttr({__VA_ARGS__}) #define i64_arr_attr(...) rewriter.getI64ArrayAttr({__VA_ARGS__}) namespace mlir { namespace triton { // Delinearize supposing order is [0, 1, .. 
, n] template llvm::SmallVector getMultiDimIndexImpl(T linearIndex, llvm::ArrayRef shape) { // shape: {a, b, c, d} -> accMul: {1, a, a*b, a*b*c} size_t rank = shape.size(); T accMul = product(shape.drop_back()); T linearRemain = linearIndex; llvm::SmallVector multiDimIndex(rank); for (int i = rank - 1; i >= 0; --i) { multiDimIndex[i] = linearRemain / accMul; linearRemain = linearRemain % accMul; if (i != 0) { accMul = accMul / shape[i - 1]; } } return multiDimIndex; } template llvm::SmallVector getMultiDimIndex(T linearIndex, llvm::ArrayRef shape, llvm::ArrayRef order) { size_t rank = shape.size(); assert(rank == order.size()); auto reordered = reorder(shape, order); auto reorderedMultiDim = getMultiDimIndexImpl(linearIndex, reordered); llvm::SmallVector multiDim(rank); for (unsigned i = 0; i < rank; ++i) { multiDim[order[i]] = reorderedMultiDim[i]; } return multiDim; } // Linearize supposing order is [0, 1, .. , n] template T getLinearIndexImpl(llvm::ArrayRef multiDimIndex, llvm::ArrayRef shape) { assert(multiDimIndex.size() == shape.size()); // shape: {a, b, c, d} -> accMul: {1, a, a*b, a*b*c} size_t rank = shape.size(); T accMul = product(shape.drop_back()); T linearIndex = 0; for (int i = rank - 1; i >= 0; --i) { linearIndex += multiDimIndex[i] * accMul; if (i != 0) { accMul = accMul / shape[i - 1]; } } return linearIndex; } template T getLinearIndex(llvm::ArrayRef multiDimIndex, llvm::ArrayRef shape, llvm::ArrayRef order) { assert(shape.size() == order.size()); return getLinearIndexImpl(reorder(multiDimIndex, order), reorder(shape, order)); } } // namespace triton namespace LLVM { using namespace mlir::triton; Value getStructFromElements(Location loc, ValueRange resultVals, ConversionPatternRewriter &rewriter, Type structType); SmallVector getElementsFromStruct(Location loc, Value llvmStruct, ConversionPatternRewriter &rewriter); /// Create a 32-bit integer constant. Value createConstantI32(Location loc, PatternRewriter &rewriter, int32_t v); /// Create a 32-bit float constant. Value createConstantF32(Location loc, PatternRewriter &rewriter, float v); /// Create a 64-bit float constant. Value createConstantF64(Location loc, PatternRewriter &rewriter, float v); /// Create an index type constant. Value createIndexConstant(OpBuilder &builder, Location loc, TypeConverter *converter, int64_t value); /// Create an integer constant of \param width bits. Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width, int64_t value); /// Helper function to get strides from a given shape and its order SmallVector getStridesFromShapeAndOrder(ArrayRef shape, ArrayRef order, Location loc, ConversionPatternRewriter &rewriter); struct SharedMemoryObject { Value base; // i32 ptr. The start address of the shared memory object. // We need to store strides as Values but not integers because the // extract_slice instruction can take a slice at arbitrary offsets. // Take $a[16:32, 16:32] as an example, though we know the stride of $a[0] is // 32, we need to let the instruction that uses $a to be aware of that. // Otherwise, when we use $a, we only know that the shape of $a is 16x16. If // we store strides into an attribute array of integers, the information // cannot pass through block argument assignment because attributes are // associated with operations but not Values. // TODO(Keren): We may need to figure out a way to store strides as integers // if we want to support more optimizations. SmallVector strides; // i32 int. The strides of the shared memory object. 
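  // For example, a shared tensor of shape [16, 32] with order [1, 0] would
  // get strides materialized as the i32 Values {32, 1} (see
  // getStridesFromShapeAndOrder above).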
SmallVector offsets; // i32 int. The offsets of the shared memory // objects from the originally allocated object. SharedMemoryObject(Value base, ArrayRef strides, ArrayRef offsets) : base(base), strides(strides.begin(), strides.end()), offsets(offsets.begin(), offsets.end()) {} SharedMemoryObject(Value base, ArrayRef shape, ArrayRef order, Location loc, ConversionPatternRewriter &rewriter) : base(base) { strides = getStridesFromShapeAndOrder(shape, order, loc, rewriter); for (auto idx : order) { offsets.emplace_back(i32_val(0)); } } SmallVector getElems() const { SmallVector elems; elems.push_back(base); elems.append(strides.begin(), strides.end()); elems.append(offsets.begin(), offsets.end()); return elems; } SmallVector getTypes() const { SmallVector types; types.push_back(base.getType()); types.append(strides.size(), IntegerType::get(base.getContext(), 32)); types.append(offsets.size(), IntegerType::get(base.getContext(), 32)); return types; } Value getCSwizzleOffset(int order) const { assert(order >= 0 && order < strides.size()); return offsets[order]; } Value getBaseBeforeSwizzle(int order, Location loc, ConversionPatternRewriter &rewriter) const { Value cSwizzleOffset = getCSwizzleOffset(order); Value offset = sub(i32_val(0), cSwizzleOffset); Type type = base.getType(); return gep(type, base, offset); } }; SharedMemoryObject getSharedMemoryObjectFromStruct(Location loc, Value llvmStruct, ConversionPatternRewriter &rewriter); Value storeShared(ConversionPatternRewriter &rewriter, Location loc, Value ptr, Value val, Value pred); Value shflSync(Location loc, ConversionPatternRewriter &rewriter, Value val, int i); } // namespace LLVM } // namespace mlir #endif triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp000066400000000000000000000306411440023377100234740ustar00rootroot00000000000000#include "ViewOpToLLVM.h" #include "DotOpHelpers.h" using namespace mlir; using namespace mlir::triton; using ::mlir::LLVM::DotOpFMAConversionHelper; using ::mlir::LLVM::DotOpMmaV1ConversionHelper; using ::mlir::LLVM::DotOpMmaV2ConversionHelper; using ::mlir::LLVM::getElementsFromStruct; using ::mlir::LLVM::getSharedMemoryObjectFromStruct; using ::mlir::LLVM::getStructFromElements; using ::mlir::LLVM::MMA16816ConversionHelper; using ::mlir::triton::gpu::getElemsPerThread; struct SplatOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::SplatOp>::ConvertTritonGPUOpToLLVMPattern; // Convert SplatOp or arith::ConstantOp with SplatElementsAttr to a // LLVM::StructType value. // // @elemType: the element type in operand. // @resType: the return type of the Splat-like op. // @constVal: a LLVM::ConstantOp or other scalar value. 
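  // Schematic example: splatting an f32 scalar into a tensor whose blocked
  // layout assigns four elements to each thread yields an
  // !llvm.struct<(f32, f32, f32, f32)> whose fields all hold the same
  // (bitcast) scalar value.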
static Value convertSplatLikeOp(Type elemType, Type resType, Value constVal, TypeConverter *typeConverter, ConversionPatternRewriter &rewriter, Location loc) { auto tensorTy = resType.cast(); if (tensorTy.getEncoding().isa() || tensorTy.getEncoding().isa()) { auto srcType = typeConverter->convertType(elemType); auto llSrc = bitcast(constVal, srcType); size_t elemsPerThread = getElemsPerThread(tensorTy); llvm::SmallVector elems(elemsPerThread, llSrc); llvm::SmallVector elemTypes(elems.size(), srcType); auto structTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(), elemTypes); return getStructFromElements(loc, elems, rewriter, structTy); } else if (auto dotLayout = tensorTy.getEncoding() .dyn_cast()) { return convertSplatLikeOpWithDotOperandLayout( dotLayout, resType, elemType, constVal, typeConverter, rewriter, loc); } else if (auto mmaLayout = tensorTy.getEncoding().dyn_cast()) { return convertSplatLikeOpWithMmaLayout( mmaLayout, resType, elemType, constVal, typeConverter, rewriter, loc); } else assert(false && "Unsupported layout found in ConvertSplatLikeOp"); return {}; } static Value convertSplatLikeOpWithDotOperandLayout( const triton::gpu::DotOperandEncodingAttr &layout, Type resType, Type elemType, Value constVal, TypeConverter *typeConverter, ConversionPatternRewriter &rewriter, Location loc) { auto tensorTy = resType.cast(); auto shape = tensorTy.getShape(); auto dotOperand = tensorTy.getEncoding().cast(); auto parent = layout.getParent(); Value retVal = constVal; Type retTy = elemType; int numElems{}; if (auto mmaLayout = parent.dyn_cast()) { Type matTy; if (mmaLayout.isAmpere()) { numElems = layout.getOpIdx() == 0 ? MMA16816ConversionHelper::getANumElemsPerThread( tensorTy, mmaLayout.getWarpsPerCTA()[0]) : MMA16816ConversionHelper::getBNumElemsPerThread( tensorTy, mmaLayout.getWarpsPerCTA()[1]); DotOpMmaV2ConversionHelper helper(mmaLayout); helper.deduceMmaType(tensorTy); matTy = helper.getMatType(); } else if (mmaLayout.isVolta()) { DotOpMmaV1ConversionHelper helper(mmaLayout); bool isRow = layout.getIsMMAv1Row().cast().getValue(); auto [isARow, isBRow, isAVec4, isBVec4, _0] = mmaLayout.decodeVoltaLayoutStates(); if (layout.getOpIdx() == 0) { DotOpMmaV1ConversionHelper::AParam aParam(isARow, isAVec4); numElems = helper.numElemsPerThreadA(shape, isARow, isAVec4, aParam.vec); } else { DotOpMmaV1ConversionHelper::BParam bParam(isBRow, isBVec4); numElems = helper.numElemsPerThreadB(shape, isBRow, isBVec4, bParam.vec); } matTy = helper.getMatType(tensorTy); } auto numPackedElems = matTy.cast() .getBody()[0] .cast() .getNumElements(); retTy = vec_ty(elemType, numPackedElems); retVal = undef(retTy); for (auto i = 0; i < numPackedElems; ++i) { retVal = insert_element(retTy, retVal, constVal, i32_val(i)); } } else if (auto blockedLayout = parent.dyn_cast()) { numElems = DotOpFMAConversionHelper::getNumElemsPerThread(shape, layout); } else { assert(false && "Unsupported layout found"); } auto structTy = LLVM::LLVMStructType::getLiteral( rewriter.getContext(), SmallVector(numElems, retTy)); return getStructFromElements(loc, SmallVector(numElems, retVal), rewriter, structTy); } static Value convertSplatLikeOpWithMmaLayout( const MmaEncodingAttr &layout, Type resType, Type elemType, Value constVal, TypeConverter *typeConverter, ConversionPatternRewriter &rewriter, Location loc) { auto tensorTy = resType.cast(); auto shape = tensorTy.getShape(); if (layout.isAmpere()) { auto [repM, repN] = DotOpMmaV2ConversionHelper::getRepMN(tensorTy); size_t fcSize = 4 * repM * repN; auto structTy 
= LLVM::LLVMStructType::getLiteral( rewriter.getContext(), SmallVector(fcSize, elemType)); return getStructFromElements(loc, SmallVector(fcSize, constVal), rewriter, structTy); } if (layout.isVolta()) { DotOpMmaV1ConversionHelper helper(layout); int repM = helper.getRepM(shape[0]); int repN = helper.getRepN(shape[1]); // According to mma layout of v1, each thread process 8 elements. int elems = 8 * repM * repN; auto structTy = LLVM::LLVMStructType::getLiteral( rewriter.getContext(), SmallVector(elems, elemType)); return getStructFromElements(loc, SmallVector(elems, constVal), rewriter, structTy); } assert(false && "Unsupported mma layout found"); return {}; } LogicalResult matchAndRewrite(triton::SplatOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto loc = op->getLoc(); auto src = adaptor.src(); auto llStruct = convertSplatLikeOp(src.getType(), op.getType(), src, getTypeConverter(), rewriter, loc); rewriter.replaceOp(op, {llStruct}); return success(); } }; // This pattern helps to convert arith::ConstantOp(with SplatElementsAttr), // the logic is the same as triton::SplatOp, so the underlying implementation // is reused. struct ArithConstantSplatOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< arith::ConstantOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto value = op.getValue(); if (!value.dyn_cast()) return failure(); auto loc = op->getLoc(); LLVM::ConstantOp arithConstantOp; auto values = op.getValue().dyn_cast(); auto elemType = values.getElementType(); Attribute val; if (elemType.isBF16() || type::isFloat(elemType)) { val = values.getValues()[0]; } else if (type::isInt(elemType)) { val = values.getValues()[0]; } else { llvm::errs() << "ArithConstantSplatOpConversion get unsupported type: " << value.getType() << "\n"; return failure(); } auto constOp = rewriter.create(loc, elemType, val); auto llStruct = SplatOpConversion::convertSplatLikeOp( elemType, op.getType(), constOp, getTypeConverter(), rewriter, loc); rewriter.replaceOp(op, llStruct); return success(); } }; struct CatOpConversion : public ConvertTritonGPUOpToLLVMPattern { using OpAdaptor = typename CatOp::Adaptor; explicit CatOpConversion(LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) : ConvertTritonGPUOpToLLVMPattern(typeConverter, benefit) {} LogicalResult matchAndRewrite(CatOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); auto resultTy = op.getType().template cast(); unsigned elems = getElemsPerThread(resultTy); Type elemTy = this->getTypeConverter()->convertType(resultTy.getElementType()); SmallVector types(elems, elemTy); // unpack input values auto lhsVals = getElementsFromStruct(loc, adaptor.lhs(), rewriter); auto rhsVals = getElementsFromStruct(loc, adaptor.rhs(), rewriter); // concatenate (and potentially reorder) values SmallVector retVals; for (Value v : lhsVals) retVals.push_back(v); for (Value v : rhsVals) retVals.push_back(v); // pack and replace Type structTy = LLVM::LLVMStructType::getLiteral(this->getContext(), types); Value ret = getStructFromElements(loc, retVals, rewriter, structTy); rewriter.replaceOp(op, ret); return success(); } }; template struct ViewLikeOpConversion : public ConvertTritonGPUOpToLLVMPattern { using OpAdaptor = typename SourceOp::Adaptor; explicit ViewLikeOpConversion(LLVMTypeConverter &typeConverter, PatternBenefit benefit = 
1) : ConvertTritonGPUOpToLLVMPattern(typeConverter, benefit) {} LogicalResult matchAndRewrite(SourceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // We cannot directly run `rewriter.replaceOp(op, adaptor.src())` // due to MLIR's restrictions Location loc = op->getLoc(); auto resultTy = op.getType().template cast(); unsigned elems = getElemsPerThread(resultTy); Type elemTy = this->getTypeConverter()->convertType(resultTy.getElementType()); SmallVector types(elems, elemTy); Type structTy = LLVM::LLVMStructType::getLiteral(this->getContext(), types); auto vals = getElementsFromStruct(loc, adaptor.src(), rewriter); Value view = getStructFromElements(loc, vals, rewriter, structTy); rewriter.replaceOp(op, view); return success(); } }; struct TransOpConversion : public ConvertTritonGPUOpToLLVMPattern { using ConvertTritonGPUOpToLLVMPattern< triton::TransOp>::ConvertTritonGPUOpToLLVMPattern; LogicalResult matchAndRewrite(triton::TransOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); auto srcSmemObj = getSharedMemoryObjectFromStruct(loc, adaptor.src(), rewriter); SmallVector dstStrides = {srcSmemObj.strides[1], srcSmemObj.strides[0]}; SmallVector dstOffsets = {srcSmemObj.offsets[1], srcSmemObj.offsets[0]}; auto dstSmemObj = SharedMemoryObject(srcSmemObj.base, dstStrides, dstOffsets); auto retVal = getStructFromSharedMemoryObject(loc, dstSmemObj, rewriter); rewriter.replaceOp(op, retVal); return success(); } }; void populateViewOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, PatternBenefit benefit) { patterns.add>(typeConverter, benefit); patterns.add>(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); patterns.add(typeConverter, benefit); } triton-2.0.0/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.h000066400000000000000000000010601440023377100231320ustar00rootroot00000000000000#ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_VIEW_OP_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_VIEW_OP_H #include "TritonGPUToLLVMBase.h" using namespace mlir; using namespace mlir::triton; void populateViewOpToLLVMPatterns(mlir::LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, int numWarps, AxisInfoAnalysis &axisInfoAnalysis, const Allocation *allocation, Value smem, PatternBenefit benefit); #endif triton-2.0.0/lib/Conversion/TritonToTritonGPU/000077500000000000000000000000001440023377100212625ustar00rootroot00000000000000triton-2.0.0/lib/Conversion/TritonToTritonGPU/CMakeLists.txt000066400000000000000000000005351440023377100240250ustar00rootroot00000000000000add_mlir_conversion_library(TritonToTritonGPU TritonToTritonGPUPass.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/triton/Conversion/TritonToTritonGPU DEPENDS TritonConversionPassIncGen LINK_COMPONENTS Core LINK_LIBS PUBLIC MLIRIR MLIRPass TritonIR TritonGPUIR TritonGPUTransforms ) triton-2.0.0/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp000066400000000000000000000720571440023377100261660ustar00rootroot00000000000000#include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include 
"mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h" #include "llvm/ADT/APSInt.h" #include using namespace mlir; using namespace mlir::triton; #define GEN_PASS_CLASSES #include "triton/Conversion/Passes.h.inc" namespace { template class GenericOpPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(Op op, typename Op::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type retType = this->getTypeConverter()->convertType(op.getType()); rewriter.replaceOpWithNewOp(op, retType, adaptor.getOperands()); return success(); } }; template class ArithCmpPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(SrcOp op, typename SrcOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type retType = this->getTypeConverter()->convertType(op.getType()); rewriter.replaceOpWithNewOp(op, retType, adaptor.getPredicate(), adaptor.getLhs(), adaptor.getRhs()); return success(); } }; class ArithConstantPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type retType = getTypeConverter()->convertType(op.getType()); auto value = adaptor.getValue().dyn_cast(); assert(value); rewriter.replaceOpWithNewOp( op, retType, value.reshape(retType) // This is a hack. We just want to add encoding ); return success(); } }; class ConvertArithmeticOp : public ConversionPattern { public: ConvertArithmeticOp(TritonGPUTypeConverter &typeConverter, MLIRContext *context) : ConversionPattern(typeConverter, MatchAnyOpTypeTag(), /*benefit=*/1, context) {} LogicalResult matchAndRewrite(Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const override { Dialect *dialect = op->getDialect(); if (dialect->getTypeID() != mlir::TypeID::get()) return failure(); return success(); } }; void populateArithmeticPatternsAndLegality( TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns, TritonGPUConversionTarget &target) { // -------------- // Add legality and rewrite pattern rules for operations // from the Arithmetic dialect. 
The basic premise is that // arithmetic operations require both inputs to have the same // non-null encoding // -------------- MLIRContext *context = patterns.getContext(); // TODO: there's probably a better way to avoid adding all ops one-by-one patterns.add< ArithConstantPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, // NegFOp // Floating point GenericOpPattern, GenericOpPattern, // MaxMin GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, // Floating point GenericOpPattern, GenericOpPattern, GenericOpPattern, // Cmp ArithCmpPattern, ArithCmpPattern, // Cast Ops GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern>(typeConverter, context); } // this shouldn't exist if mlir's SelectOp checked encodings properly class StdSelectPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(SelectOp op, typename SelectOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type retType = this->getTypeConverter()->convertType(op.getType()); rewriter.replaceOpWithNewOp( op, retType, adaptor.getCondition(), adaptor.getTrueValue(), adaptor.getFalseValue()); return success(); } }; void populateStdPatternsAndLegality(TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns, TritonGPUConversionTarget &target) { MLIRContext *context = patterns.getContext(); // Rewrite rule patterns.add(typeConverter, context); target.addLegalOp(); // this is ok because all functions are inlined // by the frontend } void populateMathPatternsAndLegality(TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns, TritonGPUConversionTarget &target) { MLIRContext *context = patterns.getContext(); // Rewrite rule patterns.add, GenericOpPattern, GenericOpPattern, GenericOpPattern, GenericOpPattern>(typeConverter, context); } // // Triton patterns // // TODO: Do we need to put them in anonymous namespace? 
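// Most of the Triton patterns below follow the same recipe: convert the
// result type through TritonGPUTypeConverter (which attaches a default
// blocked encoding) and rebuild the op from the adaptor's already-converted
// operands. Ops that impose layout constraints of their own (expand_dims,
// dot, trans, broadcast) get dedicated handling further down.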
struct TritonMakeRangePattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::MakeRangeOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type retType = getTypeConverter()->convertType(op.getType()); rewriter.replaceOpWithNewOp( op, retType, adaptor.start(), adaptor.end()); return success(); } }; struct TritonExpandDimsPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::ExpandDimsOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // Type retType = op.getType()); RankedTensorType argType = adaptor.src().getType().cast(); Attribute _argEncoding = argType.getEncoding(); if (!_argEncoding) return failure(); auto argEncoding = _argEncoding.cast(); // return shape auto retShape = argType.getShape().vec(); retShape.insert(retShape.begin() + op.axis(), 1); // return encoding auto retSizePerThread = argEncoding.getSizePerThread().vec(); retSizePerThread.insert(retSizePerThread.begin() + op.axis(), 1); auto retThreadsPerWarp = argEncoding.getThreadsPerWarp().vec(); retThreadsPerWarp.insert(retThreadsPerWarp.begin() + op.axis(), 1); auto retWarpsPerCTA = argEncoding.getWarpsPerCTA().vec(); retWarpsPerCTA.insert(retWarpsPerCTA.begin() + op.axis(), 1); SmallVector retOrder(retShape.size()); std::iota(retOrder.begin(), retOrder.end(), 0); triton::gpu::BlockedEncodingAttr retEncoding = triton::gpu::BlockedEncodingAttr::get(getContext(), retSizePerThread, retThreadsPerWarp, retWarpsPerCTA, retOrder); // convert operand to slice of return type Attribute newArgEncoding = triton::gpu::SliceEncodingAttr::get( getContext(), op.axis(), retEncoding); RankedTensorType newArgType = RankedTensorType::get( argType.getShape(), argType.getElementType(), newArgEncoding); // construct new op auto newSrc = rewriter.create( op.getLoc(), newArgType, adaptor.src()); rewriter.replaceOpWithNewOp(op, newSrc, adaptor.axis()); return success(); } }; struct TritonDotPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::DotOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { RankedTensorType origType = op.getType().cast(); auto origShape = origType.getShape(); auto typeConverter = getTypeConverter(); int numWarps = typeConverter->getNumWarps(); SmallVector retSizePerThread = {1, 1}; if (origShape[0] * origShape[1] / (numWarps * 32) >= 4) retSizePerThread = {2, 2}; if (origShape[0] * origShape[1] / (numWarps * 32) >= 16) retSizePerThread = {4, 4}; SmallVector retOrder = {1, 0}; Attribute dEncoding = triton::gpu::BlockedEncodingAttr::get( getContext(), origShape, retSizePerThread, retOrder, numWarps); RankedTensorType retType = RankedTensorType::get(origShape, origType.getElementType(), dEncoding); // a & b must be of smem layout auto aType = adaptor.a().getType().cast(); auto bType = adaptor.b().getType().cast(); Attribute aEncoding = aType.getEncoding(); Attribute bEncoding = bType.getEncoding(); if (!aEncoding || !bEncoding) return failure(); Value a = adaptor.a(); Value b = adaptor.b(); Value c = adaptor.c(); if (!aEncoding.isa()) { Attribute encoding = triton::gpu::DotOperandEncodingAttr::get(getContext(), 0, dEncoding); auto dstType = RankedTensorType::get(aType.getShape(), aType.getElementType(), encoding); a = rewriter.create(a.getLoc(), dstType, a); } if (!bEncoding.isa()) { Attribute encoding = 
triton::gpu::DotOperandEncodingAttr::get(getContext(), 1, dEncoding); auto dstType = RankedTensorType::get(bType.getShape(), bType.getElementType(), encoding); b = rewriter.create(b.getLoc(), dstType, b); } c = rewriter.create(c.getLoc(), retType, c); rewriter.replaceOpWithNewOp(op, retType, a, b, c, adaptor.allowTF32()); return success(); } }; struct TritonCatPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::CatOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // For now, this behaves like generic, but this will evolve when // we add support for `can_reorder=False` Type retType = this->getTypeConverter()->convertType(op.getType()); rewriter.replaceOpWithNewOp(op, retType, adaptor.getOperands()); return success(); } }; struct TritonTransPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::TransOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value src = adaptor.src(); auto srcType = src.getType().cast(); Attribute srcEncoding = srcType.getEncoding(); if (!srcEncoding) return failure(); if (!srcEncoding.isa()) { // TODO: end-to-end correctness is broken if // the input is blocked and the output is shared // with different order. Maybe a backend issue in BlockedToShared? SmallVector order = {1, 0}; if (auto srcBlockedEncoding = srcEncoding.dyn_cast()) llvm::copy(srcBlockedEncoding.getOrder(), order.begin()); srcEncoding = triton::gpu::SharedEncodingAttr::get(getContext(), 1, 1, 1, order); srcType = RankedTensorType::get(srcType.getShape(), srcType.getElementType(), srcEncoding); src = rewriter.create(src.getLoc(), srcType, src); } rewriter.replaceOpWithNewOp(op, src); return success(); } }; struct TritonLoadPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::LoadOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, typeConverter->convertType(op.getType()), adaptor.ptr(), adaptor.mask(), adaptor.other(), adaptor.cache(), adaptor.evict(), adaptor.isVolatile()); return success(); } }; struct TritonStorePattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::StoreOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, adaptor.ptr(), adaptor.value(), adaptor.mask()); return success(); } }; struct TritonAtomicCASPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::AtomicCASOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, typeConverter->convertType(op.getType()), adaptor.ptr(), adaptor.cmp(), adaptor.val()); return success(); } }; struct TritonAtomicRMWPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::AtomicRMWOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, typeConverter->convertType(op.getType()), adaptor.atomic_rmw_op(), adaptor.ptr(), adaptor.val(), adaptor.mask()); return success(); } }; struct TritonExtElemwisePattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::ExtElemwiseOp op, OpAdaptor 
adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, typeConverter->convertType(op.getType()), adaptor.args(), adaptor.libname(), adaptor.libpath(), adaptor.symbol()); return success(); } }; template struct TritonGenericPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(Op op, typename Op::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { Type retType = this->getTypeConverter()->convertType(op.getType()); rewriter.replaceOpWithNewOp(op, retType, adaptor.getOperands()); return success(); } }; struct TritonBroadcastPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; // This creates a tensor with the new shape but the argument's layout LogicalResult matchAndRewrite(BroadcastOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto srcType = adaptor.src().getType().cast(); auto srcEncoding = srcType.getEncoding(); if (!srcEncoding) return failure(); auto opType = op.getType().cast(); Type retType = RankedTensorType::get(opType.getShape(), opType.getElementType(), srcEncoding); // Type retType = this->getTypeConverter()->convertType(op.getType()); rewriter.replaceOpWithNewOp(op, retType, adaptor.getOperands()); return success(); } }; struct TritonReducePattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(triton::ReduceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, adaptor.redOp(), adaptor.operand(), adaptor.axis()); return success(); } }; struct TritonPrintfPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(PrintfOp op, typename PrintfOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp(op, op.prefixAttr(), adaptor.getOperands()); return success(); } }; void populateTritonPatterns(TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns) { MLIRContext *context = patterns.getContext(); patterns.add< // TODO: view should have custom pattern that views the layout TritonGenericPattern, TritonGenericPattern, TritonGenericPattern, TritonGenericPattern, TritonGenericPattern, TritonGenericPattern, TritonBroadcastPattern, TritonGenericPattern, TritonCatPattern, TritonReducePattern, TritonTransPattern, TritonExpandDimsPattern, TritonMakeRangePattern, TritonDotPattern, TritonLoadPattern, TritonStorePattern, TritonExtElemwisePattern, TritonPrintfPattern, TritonAtomicRMWPattern>(typeConverter, context); } // // SCF patterns // // This is borrowed from ConvertForOpTypes in // SCF/Transforms/StructuralTypeConversions.cpp struct SCFForPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; // Ref: ConvertForOpTypes LogicalResult matchAndRewrite(scf::ForOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto newOp = cast(rewriter.cloneWithoutRegions(*op.getOperation())); rewriter.inlineRegionBefore(op.getLoopBody(), newOp.getLoopBody(), newOp.getLoopBody().end()); // Now, update all the types. // Convert the types of block arguments within the given region. This // replaces each block with a new block containing the updated signature. // The entry block may have a special conversion if `entryConversion` is // provided. On success, the new entry block to the region is returned for // convenience. 
Otherwise, failure is returned. if (failed(rewriter.convertRegionTypes(&newOp.getLoopBody(), *getTypeConverter()))) { return rewriter.notifyMatchFailure(op, "could not convert body types"); } // Change the clone to use the updated operands. We could have cloned with // a BlockAndValueMapping, but this seems a bit more direct. newOp->setOperands(adaptor.getOperands()); // Update the result types to the new converted types. SmallVector newResultTypes; for (Type type : op.getResultTypes()) { Type newType = typeConverter->convertType(type); if (!newType) return rewriter.notifyMatchFailure(op, "not a 1:1 type conversion"); newResultTypes.push_back(newType); } for (auto t : llvm::zip(newOp.getResults(), newResultTypes)) std::get<0>(t).setType(std::get<1>(t)); rewriter.replaceOp(op, newOp.getResults()); return success(); } }; struct SCFYieldPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(scf::YieldOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // rewriter.setInsertionPointToEnd(rewriter.getInsertionBlock()); // rewriter.create(op.getLoc(), adaptor.getOperands()); // op.erase(); rewriter.replaceOpWithNewOp(op, adaptor.getOperands()); return success(); } }; // This is borrowed from ConvertFIfOpTypes in // SCF/Transforms/StructuralTypeConversions.cpp class SCFIfPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(scf::IfOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // TODO: Generalize this to any type conversion, not just 1:1. // // We need to implement something more sophisticated here that tracks which // types convert to which other types and does the appropriate // materialization logic. // For example, it's possible that one result type converts to 0 types and // another to 2 types, so newResultTypes would at least be the right size to // not crash in the llvm::zip call below, but then we would set the the // wrong type on the SSA values! These edge cases are also why we cannot // safely use the TypeConverter::convertTypes helper here. SmallVector newResultTypes; for (auto type : op.getResultTypes()) { Type newType = typeConverter->convertType(type); if (!newType) return rewriter.notifyMatchFailure(op, "not a 1:1 type conversion"); newResultTypes.push_back(newType); } // See comments in the ForOp pattern for why we clone without regions and // then inline. scf::IfOp newOp = cast(rewriter.cloneWithoutRegions(*op.getOperation())); rewriter.inlineRegionBefore(op.getThenRegion(), newOp.getThenRegion(), newOp.getThenRegion().end()); rewriter.inlineRegionBefore(op.getElseRegion(), newOp.getElseRegion(), newOp.getElseRegion().end()); // Update the operands and types. 
newOp->setOperands(adaptor.getOperands()); for (auto t : llvm::zip(newOp.getResults(), newResultTypes)) std::get<0>(t).setType(std::get<1>(t)); rewriter.replaceOp(op, newOp.getResults()); return success(); } }; // This is borrowed from ConvertFIfOpTypes in // SCF/Transforms/StructuralTypeConversions.cpp class SCFWhilePattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(scf::WhileOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto *converter = getTypeConverter(); assert(converter); SmallVector newResultTypes; if (failed(converter->convertTypes(op.getResultTypes(), newResultTypes))) return failure(); auto newOp = rewriter.create(op.getLoc(), newResultTypes, adaptor.getOperands()); for (auto i : {0u, 1u}) { auto &dstRegion = newOp.getRegion(i); rewriter.inlineRegionBefore(op.getRegion(i), dstRegion, dstRegion.end()); if (failed(rewriter.convertRegionTypes(&dstRegion, *converter))) return rewriter.notifyMatchFailure(op, "could not convert body types"); } rewriter.replaceOp(op, newOp.getResults()); return success(); } }; class SCFConditionPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(scf::ConditionOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.updateRootInPlace( op, [&]() { op->setOperands(adaptor.getOperands()); }); return success(); } }; void populateSCFPatterns(TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns) { MLIRContext *context = patterns.getContext(); patterns.add(typeConverter, context); } // CF class CFBranchPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(BranchOp op, BranchOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto converter = getTypeConverter(); auto newOp = rewriter.replaceOpWithNewOp(op, op.getSuccessor(), adaptor.getOperands()); return success(); } }; class CFCondBranchPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(CondBranchOp op, CondBranchOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto converter = getTypeConverter(); auto newOp = rewriter.replaceOpWithNewOp( op, adaptor.getCondition(), op.getTrueDest(), adaptor.getTrueDestOperands(), op.getFalseDest(), adaptor.getFalseDestOperands()); if (failed(rewriter.convertRegionTypes(newOp.getTrueDest()->getParent(), *converter))) return failure(); if (failed(rewriter.convertRegionTypes(newOp.getFalseDest()->getParent(), *converter))) return failure(); return success(); } }; void populateCFPatterns(TritonGPUTypeConverter &typeConverter, RewritePatternSet &patterns) { MLIRContext *context = patterns.getContext(); patterns.add(typeConverter, context); } // class ConvertTritonToTritonGPU : public ConvertTritonToTritonGPUBase { public: ConvertTritonToTritonGPU() = default; // constructor with some parameters set explicitly. 
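// numWarps flows from here into TritonGPUTypeConverter and is also recorded
// on the module as an i32 attribute in runOnOperation() below. A typical way
// to schedule the pass (sketch only; the enclosing pipeline is assumed, not
// shown in this file):
//   mlir::PassManager pm(context);
//   pm.addPass(mlir::triton::createConvertTritonToTritonGPUPass(numWarps));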
ConvertTritonToTritonGPU(int numWarps) { this->numWarps = numWarps; } void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp mod = getOperation(); // type converter TritonGPUTypeConverter typeConverter(context, numWarps); TritonGPUConversionTarget target(*context, typeConverter); // rewrite patterns RewritePatternSet patterns(context); // add rules populateStdPatternsAndLegality(typeConverter, patterns, target); populateArithmeticPatternsAndLegality(typeConverter, patterns, target); populateMathPatternsAndLegality(typeConverter, patterns, target); populateTritonPatterns(typeConverter, patterns); // TODO: can we use // mlir::scf::populateSCFStructurealTypeConversionsAndLegality(...) here? populateSCFPatterns(typeConverter, patterns); populateCFPatterns(typeConverter, patterns); if (failed(applyPartialConversion(mod, target, std::move(patterns)))) return signalPassFailure(); auto inti = llvm::APSInt(32, false); auto i32_ty = IntegerType::get(mod->getContext(), 32); mod->setAttr( AttrNumWarpsName, IntegerAttr::get(i32_ty, llvm::APInt(32, numWarps.getValue()))); // update layouts // broadcast src => multicast, dst => broadcasted // if (failed(target.refineLayouts(mod, numWarps))) // return signalPassFailure(); } }; } // namespace std::unique_ptr> mlir::triton::createConvertTritonToTritonGPUPass(int numWarps) { return std::make_unique<::ConvertTritonToTritonGPU>(numWarps); } std::unique_ptr> mlir::triton::createConvertTritonToTritonGPUPass() { return std::make_unique<::ConvertTritonToTritonGPU>(); } triton-2.0.0/lib/Dialect/000077500000000000000000000000001440023377100151445ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/CMakeLists.txt000066400000000000000000000000651440023377100177050ustar00rootroot00000000000000add_subdirectory(Triton) add_subdirectory(TritonGPU) triton-2.0.0/lib/Dialect/Triton/000077500000000000000000000000001440023377100164235ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/Triton/CMakeLists.txt000066400000000000000000000000621440023377100211610ustar00rootroot00000000000000add_subdirectory(IR) add_subdirectory(Transforms) triton-2.0.0/lib/Dialect/Triton/IR/000077500000000000000000000000001440023377100167355ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/Triton/IR/CMakeLists.txt000066400000000000000000000003621440023377100214760ustar00rootroot00000000000000add_mlir_dialect_library(TritonIR Interfaces.cpp Dialect.cpp Ops.cpp Types.cpp Traits.cpp DEPENDS TritonTableGen LINK_LIBS PUBLIC MLIRIR MLIRArithmetic MLIRSCF # Since LLVM 15 # MLIRFunc # else MLIRStandard ) triton-2.0.0/lib/Dialect/Triton/IR/Dialect.cpp000066400000000000000000000031151440023377100210060ustar00rootroot00000000000000#include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/Triton/IR/Types.h" #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/raw_ostream.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/Transforms/InliningUtils.h" #include "triton/Dialect/Triton/IR/Dialect.cpp.inc" using namespace mlir; using namespace mlir::triton; //===----------------------------------------------------------------------===// // TritonDialect Dialect Interfaces //===----------------------------------------------------------------------===// namespace { struct TritonInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; bool isLegalToInline(Region *dest, Region *src, bool wouldBeCloned, BlockAndValueMapping 
&valueMapping) const final { return true; } bool isLegalToInline(Operation *, Region *, bool wouldBeCloned, BlockAndValueMapping &) const final { return true; } }; } // namespace void TritonDialect::initialize() { registerTypes(); addOperations< #define GET_OP_LIST #include "triton/Dialect/Triton/IR/Ops.cpp.inc" >(); // We can also add interface here. addInterfaces(); } Operation *TritonDialect::materializeConstant(OpBuilder &builder, Attribute value, Type type, Location loc) { return builder.create(loc, type, value); }triton-2.0.0/lib/Dialect/Triton/IR/Interfaces.cpp000066400000000000000000000000001440023377100215120ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/Triton/IR/Ops.cpp000066400000000000000000000331611440023377100202060ustar00rootroot00000000000000#include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/Triton/IR/Types.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OperationSupport.h" namespace mlir { namespace triton { // Type inference static Type getI1SameShape(Type type) { auto i1Type = IntegerType::get(type.getContext(), 1); if (auto tensorType = type.dyn_cast()) return RankedTensorType::get(tensorType.getShape(), i1Type, tensorType.getEncoding()); return i1Type; } static Type getI32SameShape(Type type) { auto i32Type = IntegerType::get(type.getContext(), 32); if (auto tensorType = type.dyn_cast()) return RankedTensorType::get(tensorType.getShape(), i32Type, tensorType.getEncoding()); return i32Type; } static Type getPointerTypeSameShape(Type type) { if (auto tensorType = type.dyn_cast()) { Type elementType = tensorType.getElementType(); auto shape = tensorType.getShape(); PointerType ptrType = PointerType::get(elementType, 1); return RankedTensorType::get(shape, ptrType, tensorType.getEncoding()); } else { return PointerType::get(type, 1); } } // Parser & printer for assembly forms ParseResult parseLoadOp(OpAsmParser &parser, OperationState &result) { SmallVector allOperands; Type resultTypes[1]; SMLoc allOperandLoc = parser.getCurrentLocation(); if (parser.parseOperandList(allOperands) || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.parseCustomTypeWithFallback(resultTypes[0])) return failure(); result.addTypes(resultTypes); SmallVector operandTypes; operandTypes.push_back(getPointerTypeSameShape(resultTypes[0])); // ptr int hasMask = 0, hasOther = 0; if (allOperands.size() >= 2) { operandTypes.push_back(getI1SameShape(resultTypes[0])); // mask hasMask = 1; } if (allOperands.size() >= 3) { operandTypes.push_back(resultTypes[0]); // other hasOther = 1; } if (parser.resolveOperands(allOperands, operandTypes, allOperandLoc, result.operands)) return failure(); // Deduce operand_segment_sizes from the number of the operands. auto operand_segment_sizesAttrName = LoadOp::operand_segment_sizesAttrName(result.name); result.addAttribute( operand_segment_sizesAttrName, parser.getBuilder().getI32VectorAttr({1, hasMask, hasOther})); return success(); } void printLoadOp(OpAsmPrinter &printer, LoadOp loadOp) { printer << " "; printer << loadOp.getOperation()->getOperands(); // "operand_segment_sizes" can be deduced, so we don't print it. 
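// Roughly, a masked load round-trips through text as something like
// (illustrative operand names, not taken from a real test case):
//   %res = tt.load %ptr, %mask, %other
//          {cache = ..., evict = ..., isVolatile = false} : tensor<128xf32>
// and the parser above re-derives the pointer/mask/other operand types from
// the printed result type.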
printer.printOptionalAttrDict(loadOp->getAttrs(), {loadOp.operand_segment_sizesAttrName()}); printer << " : "; printer.printStrippedAttrOrType(loadOp.result().getType()); } ParseResult parseStoreOp(OpAsmParser &parser, OperationState &result) { SmallVector allOperands; Type valueType; SMLoc allOperandLoc = parser.getCurrentLocation(); if (parser.parseOperandList(allOperands) || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.parseCustomTypeWithFallback(valueType)) return failure(); SmallVector operandTypes; operandTypes.push_back(getPointerTypeSameShape(valueType)); // ptr operandTypes.push_back(valueType); // value if (allOperands.size() >= 3) operandTypes.push_back(getI1SameShape(valueType)); // mask if (parser.resolveOperands(allOperands, operandTypes, allOperandLoc, result.operands)) return failure(); return success(); } void printStoreOp(OpAsmPrinter &printer, StoreOp storeOp) { printer << " "; printer << storeOp.getOperation()->getOperands(); printer.printOptionalAttrDict(storeOp->getAttrs(), /*elidedAttrs=*/{}); printer << " : "; printer.printStrippedAttrOrType(storeOp.value().getType()); } } // namespace triton } // namespace mlir #define GET_OP_CLASSES #include "triton/Dialect/Triton/IR/Ops.cpp.inc" // enum attribute definitions #include "triton/Dialect/Triton/IR/OpsEnums.cpp.inc" namespace mlir { namespace triton { //-- FpToFpOp -- bool FpToFpOp::areCastCompatible(::mlir::TypeRange inputs, ::mlir::TypeRange outputs) { if (inputs.size() != 1 || outputs.size() != 1) return false; auto srcEltType = inputs.front(); auto dstEltType = outputs.front(); auto srcTensorType = srcEltType.dyn_cast(); auto dstTensorType = dstEltType.dyn_cast(); if (srcTensorType && dstTensorType) { srcEltType = srcTensorType.getElementType(); dstEltType = dstTensorType.getElementType(); } // Check whether fp8 <=> fp16, bf16, f32, f64 // Make `srcEltType` always the fp8 side if (dstEltType.dyn_cast()) std::swap(srcEltType, dstEltType); if (!srcEltType.dyn_cast()) return false; return dstEltType.isF16() || dstEltType.isBF16() || dstEltType.isF32() || dstEltType.isF64(); } //-- StoreOp -- void StoreOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state, ::mlir::Value ptr, ::mlir::Value value) { StoreOp::build(builder, state, ptr, value, mlir::Value()); } //-- LoadOp -- static Type getLoadOpResultType(::mlir::OpBuilder &builder, Type ptrType) { auto ptrTensorType = ptrType.dyn_cast(); if (!ptrTensorType) return ptrType.cast().getPointeeType(); auto shape = ptrTensorType.getShape(); Type elementType = ptrTensorType.getElementType().cast().getPointeeType(); return RankedTensorType::get(shape, elementType); } void LoadOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state, ::mlir::Value ptr, ::mlir::triton::CacheModifier cache, ::mlir::triton::EvictionPolicy evict, bool isVolatile) { LoadOp::build(builder, state, ptr, mlir::Value(), mlir::Value(), cache, evict, isVolatile); } void LoadOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state, ::mlir::Value ptr, ::mlir::Value mask, ::mlir::triton::CacheModifier cache, ::mlir::triton::EvictionPolicy evict, bool isVolatile) { LoadOp::build(builder, state, ptr, mask, mlir::Value(), cache, evict, isVolatile); } void LoadOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state, ::mlir::Value ptr, ::mlir::Value mask, ::mlir::Value other, ::mlir::triton::CacheModifier cache, ::mlir::triton::EvictionPolicy evict, bool isVolatile) { Type resultType = getLoadOpResultType(builder, ptr.getType()); 
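// Operands are appended in (ptr[, mask[, other]]) order; the
// operand_segment_sizes attribute built below records which of the optional
// operands are actually present so the generated accessors stay correct.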
state.addOperands(ptr); if (mask) { state.addOperands(mask); if (other) { state.addOperands(other); } } state.addAttribute( operand_segment_sizesAttrName(state.name), builder.getI32VectorAttr({1, (mask ? 1 : 0), (other ? 1 : 0)})); state.addAttribute( cacheAttrName(state.name), ::mlir::triton::CacheModifierAttr::get(builder.getContext(), cache)); state.addAttribute( evictAttrName(state.name), ::mlir::triton::EvictionPolicyAttr::get(builder.getContext(), evict)); state.addAttribute(isVolatileAttrName(state.name), builder.getBoolAttr(isVolatile)); state.addTypes({resultType}); } //-- TransOp -- mlir::LogicalResult mlir::triton::TransOp::inferReturnTypes( MLIRContext *context, Optional location, ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { // type is the same as the input auto argTy = operands[0].getType().cast(); SmallVector retShape(argTy.getShape().begin(), argTy.getShape().end()); std::reverse(retShape.begin(), retShape.end()); auto retEltTy = argTy.getElementType(); Attribute argEncoding = argTy.getEncoding(); Attribute retEncoding; if (argEncoding) { Dialect &dialect = argEncoding.getDialect(); auto inferLayoutInterface = dyn_cast(&dialect); if (inferLayoutInterface->inferTransOpEncoding(argEncoding, retEncoding) .failed()) { llvm::report_fatal_error("failed to infer layout for ReduceOp"); return mlir::failure(); } } inferredReturnTypes.push_back( RankedTensorType::get(retShape, retEltTy, retEncoding)); return mlir::success(); } //-- DotOp -- mlir::LogicalResult mlir::triton::DotOp::inferReturnTypes( MLIRContext *context, Optional location, ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { // type is the same as the accumulator auto accTy = operands[2].getType().cast(); inferredReturnTypes.push_back(accTy); // verify encodings auto aEnc = operands[0].getType().cast().getEncoding(); auto bEnc = operands[1].getType().cast().getEncoding(); auto retEnc = accTy.getEncoding(); if (aEnc) { assert(bEnc); Dialect &dialect = aEnc.getDialect(); auto interface = dyn_cast(&dialect); if (interface->inferDotOpEncoding(aEnc, 0, retEnc, location).failed()) return mlir::failure(); if (interface->inferDotOpEncoding(bEnc, 1, retEnc, location).failed()) return mlir::failure(); } return mlir::success(); } //-- ReduceOp -- mlir::LogicalResult mlir::triton::ReduceOp::inferReturnTypes( MLIRContext *context, Optional location, ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { // infer shape Value arg = operands[0]; auto argTy = arg.getType().cast(); auto argEltTy = argTy.getElementType(); auto i32Ty = IntegerType::get(argEltTy.getContext(), 32); auto redOp = attributes.get("redOp").cast().getValue(); bool withIndex = mlir::triton::ReduceOp::withIndex(redOp); auto retEltTy = withIndex ? 
i32Ty : argEltTy; auto retShape = argTy.getShape().vec(); int axis = attributes.get("axis").cast().getInt(); retShape.erase(retShape.begin() + axis); if (retShape.empty()) { // 0d-tensor -> scalar inferredReturnTypes.push_back(retEltTy); } else { // nd-tensor where n >= 1 // infer encoding Attribute argEncoding = argTy.getEncoding(); Attribute retEncoding; if (argEncoding) { Dialect &dialect = argEncoding.getDialect(); auto inferLayoutInterface = dyn_cast(&dialect); if (inferLayoutInterface ->inferReduceOpEncoding(argEncoding, axis, retEncoding) .failed()) { llvm::report_fatal_error("failed to infer layout for ReduceOp"); return mlir::failure(); } } // create type inferredReturnTypes.push_back( RankedTensorType::get(retShape, retEltTy, retEncoding)); } return mlir::success(); } bool mlir::triton::ReduceOp::withIndex(mlir::triton::RedOp redOp) { return redOp == mlir::triton::RedOp::ARGMIN || redOp == mlir::triton::RedOp::ARGMAX || redOp == mlir::triton::RedOp::ARGUMIN || redOp == mlir::triton::RedOp::ARGUMAX || redOp == mlir::triton::RedOp::ARGFMIN || redOp == mlir::triton::RedOp::ARGFMAX; } //-- SplatOp -- OpFoldResult SplatOp::fold(ArrayRef operands) { auto constOperand = src().getDefiningOp(); if (!constOperand) return {}; auto shapedType = getType().cast(); auto ret = SplatElementsAttr::get(shapedType, {constOperand.getValue()}); return ret; } //-- ExpandDimsOp -- mlir::LogicalResult mlir::triton::ExpandDimsOp::inferReturnTypes( MLIRContext *context, Optional loc, ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { // infer shape auto arg = operands[0]; auto argTy = arg.getType().cast(); auto retShape = argTy.getShape().vec(); int axis = attributes.get("axis").cast().getInt(); retShape.insert(retShape.begin() + axis, 1); // infer encoding Attribute argEncoding = argTy.getEncoding(); Attribute retEncoding; if (argEncoding) { Dialect &dialect = argEncoding.getDialect(); auto inferLayoutInterface = dyn_cast(&dialect); if (inferLayoutInterface ->inferExpandDimsOpEncoding(argEncoding, axis, retEncoding, loc) .failed()) return emitOptionalError(loc, "failed to infer layout for ExpandDimsOp"); } // create type auto argEltTy = argTy.getElementType(); inferredReturnTypes.push_back( RankedTensorType::get(retShape, argEltTy, retEncoding)); return mlir::success(); } //-- BroadcastOp -- OpFoldResult BroadcastOp::fold(ArrayRef operands) { auto constOperand = src().getDefiningOp(); if (!constOperand) return {}; auto shapedType = getType().cast(); auto value = constOperand.getValue(); if (auto denseElemsAttr = value.dyn_cast()) { if (!denseElemsAttr.isSplat()) return {}; return SplatElementsAttr::get(shapedType, denseElemsAttr.getSplatValue()); } else if (value.getType().isIntOrIndexOrFloat()) { return SplatElementsAttr::get(shapedType, value); } else { return {}; } } } // namespace triton } // namespace mlir triton-2.0.0/lib/Dialect/Triton/IR/Traits.cpp000066400000000000000000000052541440023377100207150ustar00rootroot00000000000000#include "triton/Dialect/Triton/IR/Traits.h" static mlir::LogicalResult verifySameEncoding(mlir::Type tyA, mlir::Type tyB) { using namespace mlir; auto encA = tyA.dyn_cast(); auto encB = tyA.dyn_cast(); if (!encA || !encB) return success(); return encA.getEncoding() == encB.getEncoding() ? 
success() : failure(); } mlir::LogicalResult mlir::OpTrait::impl::verifySameOperandsAndResultEncoding(Operation *op) { if (failed(verifyAtLeastNOperands(op, 1)) || failed(verifyAtLeastNResults(op, 1))) return failure(); auto type = op->getOperand(0).getType(); for (auto resultType : op->getResultTypes()) if (failed(verifySameEncoding(resultType, type))) return op->emitOpError() << "requires the same encoding for all operands and results"; return verifySameOperandsEncoding(op); } mlir::LogicalResult mlir::OpTrait::impl::verifySameOperandsEncoding(Operation *op) { if (failed(verifyAtLeastNOperands(op, 1))) return failure(); auto type = op->getOperand(0).getType(); for (auto opType : llvm::drop_begin(op->getOperandTypes(), 1)) if (failed(verifySameEncoding(opType, type))) return op->emitOpError() << "requires the same encoding for all operands"; return success(); } mlir::LogicalResult mlir::OpTrait::impl::verifyTensorSize(Operation *op) { for (auto opType : op->getOperandTypes()) { if (auto tensorType = opType.dyn_cast()) { int64_t numElements = 1; for (int64_t s : tensorType.getShape()) numElements *= s; if (numElements > maxTensorNumElements) return op->emitError("Maximum allowed number of elements is ") << maxTensorNumElements << ", but " << *op << " has more than that"; if ((numElements & (numElements - 1)) != 0) return op->emitError("Number of elements must be power-of-two, but ") << *op << " doesn't follow the rule (" << numElements << ")" << " elements"; } } for (auto opType : op->getResultTypes()) { if (auto tensorType = opType.dyn_cast()) { int64_t numElements = 1; for (int64_t s : tensorType.getShape()) numElements *= s; if (numElements > maxTensorNumElements) return op->emitError("Maximum allowed number of elements is ") << maxTensorNumElements << ", but " << *op << " has more than that"; if ((numElements & (numElements - 1)) != 0) return op->emitError("Number of elements must be power-of-two, but ") << *op << " doesn't follow the rule (" << numElements << ")" << " elements"; } } return success(); } triton-2.0.0/lib/Dialect/Triton/IR/Types.cpp000066400000000000000000000026021440023377100205450ustar00rootroot00000000000000#include "triton/Dialect/Triton/IR/Types.h" #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc` #include "triton/Dialect/Triton/IR/Dialect.h" #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc` using namespace mlir; using namespace mlir::triton; #define GET_TYPEDEF_CLASSES #include "triton/Dialect/Triton/IR/Types.cpp.inc" //===----------------------------------------------------------------------===// // Triton Dialect //===----------------------------------------------------------------------===// void TritonDialect::registerTypes() { addTypes< #define GET_TYPEDEF_LIST #include "triton/Dialect/Triton/IR/Types.cpp.inc" >(); } Type PointerType::parse(AsmParser &parser) { if (parser.parseLess()) return Type(); Type pointeeType; if (parser.parseType(pointeeType)) return Type(); if (parser.parseGreater()) return Type(); // TODO: also print address space? return PointerType::get(pointeeType, 1); } void PointerType::print(AsmPrinter &printer) const { printer << "<" << getPointeeType() << ">"; } namespace mlir { unsigned getPointeeBitWidth(RankedTensorType tensorTy) { auto ptrTy = tensorTy.getElementType().cast(); auto pointeeType = ptrTy.getPointeeType(); return pointeeType.isa() ? 
8 : pointeeType.getIntOrFloatBitWidth(); } } // namespace mlir triton-2.0.0/lib/Dialect/Triton/Transforms/000077500000000000000000000000001440023377100205615ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/Triton/Transforms/CMakeLists.txt000066400000000000000000000003751440023377100233260ustar00rootroot00000000000000set(LLVM_TARGET_DEFINITIONS Combine.td) mlir_tablegen(TritonCombine.inc -gen-rewriters) add_public_tablegen_target(TritonCombineIncGen) add_mlir_dialect_library(TritonTransforms Combine.cpp DEPENDS TritonTransformsIncGen TritonCombineIncGen ) triton-2.0.0/lib/Dialect/Triton/Transforms/Combine.cpp000066400000000000000000000154641440023377100226530ustar00rootroot00000000000000#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/Triton/Transforms/Passes.h" #include using namespace mlir; namespace { bool isZero(mlir::Value val) { if (mlir::matchPattern(val, mlir::m_Zero()) || mlir::matchPattern(val, mlir::m_AnyZeroFloat())) return true; // broadcast(constant_0) if (auto bc = val.getDefiningOp()) { if (mlir::matchPattern(bc.src(), mlir::m_Zero()) || mlir::matchPattern(bc.src(), mlir::m_AnyZeroFloat())) return true; } return false; } bool isBroadcastConstantCombinable(Attribute value) { if (auto denseValue = value.dyn_cast()) { return denseValue.isSplat(); } return value.isa(); } DenseElementsAttr getConstantValue(Builder &builder, Attribute value, Value bcast_res) { Type resType = bcast_res.getType(); DenseElementsAttr res; if (auto denseValue = value.dyn_cast()) { res = DenseElementsAttr::get(resType, denseValue.getSplatValue()); } else { res = DenseElementsAttr::get(resType, value); } return res; } #include "TritonCombine.inc" } // anonymous namespace // select(cond, load(ptrs, broadcast(cond), ???), other) // => load(ptrs, broadcast(cond), other) class CombineSelectMaskedLoadPattern : public mlir::RewritePattern { public: CombineSelectMaskedLoadPattern(mlir::MLIRContext *context) : mlir::RewritePattern(mlir::SelectOp::getOperationName(), 3, context, {triton::LoadOp::getOperationName()}) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto selectOp = llvm::dyn_cast(op); if (!selectOp) return mlir::failure(); mlir::Value trueValue = selectOp.getTrueValue(); mlir::Value falseValue = selectOp.getFalseValue(); mlir::Value condSelect = selectOp.getCondition(); auto *loadOpCandidate = trueValue.getDefiningOp(); auto loadOp = llvm::dyn_cast_or_null(loadOpCandidate); if (!loadOp) return mlir::failure(); mlir::Value mask = loadOp.mask(); if (!mask) return mlir::failure(); auto *broadcastOpCandidate = mask.getDefiningOp(); auto broadcastOp = llvm::dyn_cast_or_null(broadcastOpCandidate); if (!broadcastOp) return mlir::failure(); auto broadcastCond = broadcastOp.src(); if (broadcastCond != condSelect) return mlir::failure(); rewriter.replaceOpWithNewOp( op, loadOp.ptr(), loadOp.mask(), falseValue, loadOp.cache(), loadOp.evict(), loadOp.isVolatile()); return mlir::success(); } }; // load(ptr, splat(1), ...) -> load(ptr, ...) // load(ptr, splat(0), other, ...) 
-> other struct CanonicalizeMaskedLoadPattern : public mlir::OpRewritePattern { CanonicalizeMaskedLoadPattern(mlir::MLIRContext *context) : OpRewritePattern(context, 1) {} mlir::LogicalResult matchAndRewrite(triton::LoadOp loadOp, mlir::PatternRewriter &rewriter) const override { auto mask = loadOp.mask(); if (!mask) return mlir::failure(); auto constantMask = llvm::dyn_cast_or_null(mask.getDefiningOp()); if (!constantMask) return mlir::failure(); auto splatMask = constantMask.getValue().dyn_cast(); if (!splatMask) return mlir::failure(); if (splatMask.getSplatValue().getValue() == true) { // mask = splat(1) rewriter.replaceOpWithNewOp( loadOp, loadOp.getType(), loadOp.ptr(), Value(), Value(), loadOp.cache(), loadOp.evict(), loadOp.isVolatile()); } else { // mask = splat(0) // If there's no "other", the value is "undef". Perhaps we want to // optimize it in the future.x auto otherVal = loadOp.other(); if (!otherVal) return mlir::failure(); rewriter.replaceOp(loadOp, otherVal); } return mlir::success(); } }; void triton::LoadOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { results.add(context); } // store(ptr, value, splat(1), ...) -> store(ptr, value, ...) // store(ptr, value, splat(0), ...) -> [none] struct CanonicalizeMaskedStorePattern : public mlir::OpRewritePattern { CanonicalizeMaskedStorePattern(mlir::MLIRContext *context) : OpRewritePattern(context, 1) {} mlir::LogicalResult matchAndRewrite(triton::StoreOp storeOp, mlir::PatternRewriter &rewriter) const override { auto mask = storeOp.mask(); if (!mask) return mlir::failure(); auto constantMask = llvm::dyn_cast_or_null(mask.getDefiningOp()); if (!constantMask) return mlir::failure(); auto splatMask = constantMask.getValue().dyn_cast(); if (!splatMask) return mlir::failure(); if (splatMask.getSplatValue().getValue() == true) { // mask = splat(1) rewriter.replaceOpWithNewOp(storeOp, storeOp.ptr(), storeOp.value()); } else { // mask = splat(0) rewriter.eraseOp(storeOp); } return mlir::success(); } }; void triton::StoreOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { results.add(context); } #define GEN_PASS_CLASSES #include "triton/Dialect/Triton/Transforms/Passes.h.inc" class CombineOpsPass : public TritonCombineOpsBase { public: void runOnOperation() override { mlir::MLIRContext *context = &getContext(); mlir::RewritePatternSet patterns(context); mlir::ModuleOp m = getOperation(); // Dot Add %{ patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); // %} patterns.add(context); // patterns.add(context); patterns.add(context); if (applyPatternsAndFoldGreedily(m, std::move(patterns)).failed()) signalPassFailure(); } }; std::unique_ptr mlir::triton::createCombineOpsPass() { return std::make_unique(); } triton-2.0.0/lib/Dialect/Triton/Transforms/Combine.td000066400000000000000000000036621440023377100224750ustar00rootroot00000000000000#ifndef TRITON_PATTERNS #define TRITON_PATTERNS include "mlir/Dialect/StandardOps/IR/Ops.td" include "mlir/Dialect/Arithmetic/IR/ArithmeticOps.td" include "triton/Dialect/Triton/IR/TritonOps.td" // AddIOp(DotOp(a, b, c), d) and c==0 => DotOp(a, b, d) // AddFOp(DotOp(a, b, c), d) and c==0 => DotOp(a, b, d) // AddIOp(d, DotOp(a, b, c)) and c==0 => DotOp(a, b, d) // AddFOp(d, DotOp(a, b, c)) and c==0 => DotOp(a, b, d) def CombineDotAddIPattern : Pat< (Arith_AddIOp $d, (TT_DotOp:$res $a, $b, $c, $allowTF32)), (TT_DotOp $a, $b, $d, $allowTF32), [(Constraint> $c)]>; def CombineDotAddFPattern : Pat< 
(Arith_AddFOp $d, (TT_DotOp:$res $a, $b, $c, $allowTF32)), (TT_DotOp $a, $b, $d, $allowTF32), [(Constraint> $c)]>; def CombineDotAddIRevPattern : Pat< (Arith_AddIOp (TT_DotOp:$res $a, $b, $c, $allowTF32), $d), (TT_DotOp $a, $b, $d, $allowTF32), [(Constraint> $c)]>; def CombineDotAddFRevPattern : Pat< (Arith_AddFOp (TT_DotOp:$res $a, $b, $c, $allowTF32), $d), (TT_DotOp $a, $b, $d, $allowTF32), [(Constraint> $c)]>; // TODO: this fails for addptr(addptr(ptr, i32), i64) // Commented out until fixed // addptr(addptr(%ptr, %idx0), %idx1) => addptr(%ptr, AddI(%idx0, %idx1)) // Note: leave (sub %c0, %c0) canceling to ArithmeticDialect // (ref: ArithmeticCanonicalization.td) // def CombineAddPtrPattern : Pat< // (TT_AddPtrOp (TT_AddPtrOp $ptr, $idx0), $idx1), // (TT_AddPtrOp $ptr, (Arith_AddIOp $idx0, $idx1))>; // broadcast(cst) => cst def getConstantValue : NativeCodeCall<"getConstantValue($_builder, $0, $1)">; def CombineBroadcastConstantPattern : Pat< (TT_BroadcastOp:$bcast_res (Arith_ConstantOp $value)), (Arith_ConstantOp (getConstantValue $value, $bcast_res)), [(Constraint> $value)]>; #endif triton-2.0.0/lib/Dialect/TritonGPU/000077500000000000000000000000001440023377100167775ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/TritonGPU/CMakeLists.txt000066400000000000000000000000621440023377100215350ustar00rootroot00000000000000add_subdirectory(IR) add_subdirectory(Transforms) triton-2.0.0/lib/Dialect/TritonGPU/IR/000077500000000000000000000000001440023377100173115ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/TritonGPU/IR/CMakeLists.txt000066400000000000000000000002321440023377100220460ustar00rootroot00000000000000add_mlir_dialect_library(TritonGPUIR Dialect.cpp Traits.cpp DEPENDS TritonGPUTableGen TritonGPUAttrDefsIncGen LINK_LIBS PUBLIC TritonIR ) triton-2.0.0/lib/Dialect/TritonGPU/IR/Dialect.cpp000066400000000000000000000724751440023377100214010ustar00rootroot00000000000000#include #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/OpImplementation.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "llvm/ADT/TypeSwitch.h" #include "triton/Dialect/TritonGPU/IR/Dialect.cpp.inc" using namespace mlir; using namespace mlir::triton::gpu; // Utility namespace mlir { namespace triton { // Type inference static Type getI1SameShape(Type type) { auto i1Type = IntegerType::get(type.getContext(), 1); if (auto tensorType = type.dyn_cast()) return RankedTensorType::get(tensorType.getShape(), i1Type, tensorType.getEncoding()); return Type(); } static Type getPointeeType(Type type) { if (auto tensorType = type.dyn_cast()) { // Tensor of pointers auto shape = tensorType.getShape(); auto ptrType = tensorType.getElementType().dyn_cast(); Type pointeeType = ptrType.getPointeeType(); return RankedTensorType::get(shape, pointeeType, tensorType.getEncoding()); } else if (auto ptrType = type.dyn_cast()) { // scalar pointer Type pointeeType = ptrType.getPointeeType(); return pointeeType; } return Type(); } namespace gpu { // TODO: Inheritance of layout attributes // so that all distributed layouts implement // these utilities unsigned getElemsPerThread(Attribute layout, ArrayRef shape) { if (auto blockedLayout = layout.dyn_cast()) { return blockedLayout.getElemsPerThread(shape); } else if (auto sliceLayout = layout.dyn_cast()) { return sliceLayout.getElemsPerThread(shape); } else if (auto mmaLayout = layout.dyn_cast()) { return mmaLayout.getElemsPerThread(shape); } else if (auto sharedLayout = 
layout.dyn_cast()) { return sharedLayout.getElemsPerThread(shape); } else if (auto dotLayout = layout.dyn_cast()) { return dotLayout.getElemsPerThread(shape); } else { assert(0 && "getElemsPerThread not implemented"); return 0; } } unsigned getElemsPerThread(Type type) { if (type.isIntOrIndexOrFloat() || type.isa() || type.isa()) return 1; auto tensorType = type.cast(); return getElemsPerThread(tensorType.getEncoding(), tensorType.getShape()); } SmallVector getThreadsPerWarp(const Attribute &layout) { if (auto blockedLayout = layout.dyn_cast()) { return SmallVector(blockedLayout.getThreadsPerWarp().begin(), blockedLayout.getThreadsPerWarp().end()); } if (auto mmaLayout = layout.dyn_cast()) { if (mmaLayout.isVolta()) return {4, 8}; if (mmaLayout.isAmpere()) return {8, 4}; } assert(0 && "getThreadsPerWarp not implemented"); return {}; } SmallVector getWarpsPerCTA(const Attribute &layout) { if (auto blockedLayout = layout.dyn_cast()) { return SmallVector(blockedLayout.getWarpsPerCTA().begin(), blockedLayout.getWarpsPerCTA().end()); } if (auto mmaLayout = layout.dyn_cast()) { return SmallVector(mmaLayout.getWarpsPerCTA().begin(), mmaLayout.getWarpsPerCTA().end()); } assert(0 && "getWarpsPerCTA not implemented"); return {}; } SmallVector getSizePerThread(const Attribute &layout) { if (auto blockedLayout = layout.dyn_cast()) { return SmallVector(blockedLayout.getSizePerThread().begin(), blockedLayout.getSizePerThread().end()); } else if (auto sliceLayout = layout.dyn_cast()) { auto ret = getSizePerThread(sliceLayout.getParent()); return ret; // ret.erase(ret.begin() + sliceLayout.getDim()); return ret; } else if (auto mmaLayout = layout.dyn_cast()) { if (mmaLayout.isAmpere()) { return {2, 2}; } else if (mmaLayout.isVolta()) { return {1, 2}; } else { llvm_unreachable("Unexpected mma version"); } } else if (auto dotLayout = layout.dyn_cast()) { auto parentLayout = dotLayout.getParent(); assert(parentLayout && "DotOperandEncodingAttr must have a parent"); if (auto parentMmaLayout = parentLayout.dyn_cast()) { assert(parentMmaLayout.isAmpere() && "mmaLayout version = 1 is not implemented yet"); auto parentShapePerCTA = getShapePerCTA(parentLayout); auto opIdx = dotLayout.getOpIdx(); if (opIdx == 0) { return {2, 4}; } else if (opIdx == 1) { return {4, 1}; } else { assert(0 && "DotOperandEncodingAttr opIdx must be 0 or 1"); return {}; } } else { assert(0 && "DotOperandEncodingAttr non-MmaEncodingAttr parent not " "supported yet"); return {}; } } else { assert(0 && "getSizePerThread not implemented"); return {}; } } SmallVector getContigPerThread(const Attribute &layout) { if (auto mmaLayout = layout.dyn_cast()) { assert(mmaLayout.isVolta() || mmaLayout.isAmpere()); return {1, 2}; } else { return getSizePerThread(layout); } } SmallVector getThreadsPerCTA(const Attribute &layout) { SmallVector threads; if (auto blockedLayout = layout.dyn_cast()) { for (int d = 0, n = blockedLayout.getOrder().size(); d < n; ++d) threads.push_back(blockedLayout.getThreadsPerWarp()[d] * blockedLayout.getWarpsPerCTA()[d]); } else if (auto mmaLayout = layout.dyn_cast()) { if (mmaLayout.getVersionMajor() == 2) { threads = {8 * mmaLayout.getWarpsPerCTA()[0], 4 * mmaLayout.getWarpsPerCTA()[1]}; } else assert(0 && "Unimplemented usage of MmaEncodingAttr"); } else { assert(0 && "Unimplemented usage of getShapePerCTA"); } return threads; } SmallVector getShapePerCTA(const Attribute &layout, ArrayRef tensorShape) { SmallVector shape; if (auto blockedLayout = layout.dyn_cast()) { for (unsigned d = 0, n = 
blockedLayout.getOrder().size(); d < n; ++d) shape.push_back(blockedLayout.getSizePerThread()[d] * blockedLayout.getThreadsPerWarp()[d] * blockedLayout.getWarpsPerCTA()[d]); } else if (auto sliceLayout = layout.dyn_cast()) { unsigned dim = sliceLayout.getDim(); auto parent = sliceLayout.getParent(); for (unsigned d = 0, n = getOrder(parent).size(); d < n; ++d) { if (d == dim) continue; shape.push_back(getShapePerCTA(parent, tensorShape)[d]); } } else if (auto mmaLayout = layout.dyn_cast()) { if (mmaLayout.isAmpere()) return {16 * mmaLayout.getWarpsPerCTA()[0], 8 * mmaLayout.getWarpsPerCTA()[1]}; if (mmaLayout.isVolta()) { assert(!tensorShape.empty() && "Volta needs the tensorShape"); if (tensorShape.size() == 1) // must be SliceEncoding return {static_cast(tensorShape[0]), static_cast(tensorShape[0])}; return {static_cast(tensorShape[0]), static_cast(tensorShape[1])}; } assert(0 && "Unexpected MMA layout version found"); } else if (auto dotLayout = layout.dyn_cast()) { auto parentLayout = dotLayout.getParent(); assert(parentLayout && "DotOperandEncodingAttr must have a parent"); if (auto parentMmaLayout = parentLayout.dyn_cast()) { assert(parentMmaLayout.isAmpere() && "mmaLayout version = 1 is not implemented yet"); auto parentShapePerCTA = getShapePerCTA(parentLayout, tensorShape); auto opIdx = dotLayout.getOpIdx(); if (opIdx == 0) { return {parentShapePerCTA[0], 16}; } else if (opIdx == 1) { return {16, parentShapePerCTA[1]}; } else { assert(0 && "DotOperandEncodingAttr opIdx must be 0 or 1"); } } else { assert(0 && "DotOperandEncodingAttr non-MmaEncodingAttr parent not " "supported yet"); } } else if (auto mmaLayout = layout.dyn_cast()) { if (mmaLayout.isAmpere()) { return {16 * mmaLayout.getWarpsPerCTA()[0], 8 * mmaLayout.getWarpsPerCTA()[1]}; } else if (mmaLayout.isVolta()) { return {16 * mmaLayout.getWarpsPerCTA()[0], 16 * mmaLayout.getWarpsPerCTA()[1]}; } else { llvm_unreachable("Unexpected mma version"); } } else { assert(0 && "Unimplemented usage of getShapePerCTA"); } return shape; } SmallVector getOrder(const Attribute &layout) { if (auto blockedLayout = layout.dyn_cast()) { return SmallVector(blockedLayout.getOrder().begin(), blockedLayout.getOrder().end()); } else if (auto mmaLayout = layout.dyn_cast()) { return {1, 0}; } else if (auto dotLayout = layout.dyn_cast()) { return {1, 0}; } else if (auto sliceLayout = layout.dyn_cast()) { SmallVector parentOrder = getOrder(sliceLayout.getParent()); unsigned dim = sliceLayout.getDim(); SmallVector order; for (unsigned d : parentOrder) { if (d == dim) continue; else if (d > dim) order.push_back(d - 1); else order.push_back(d); } return order; } else if (auto sharedLayout = layout.dyn_cast()) { return SmallVector(sharedLayout.getOrder().begin(), sharedLayout.getOrder().end()); } else { assert(0 && "Unimplemented usage of getOrder"); return {}; } }; bool isaDistributedLayout(const Attribute &layout) { return layout.isa() || layout.isa() || layout.isa(); } } // namespace gpu } // namespace triton } // namespace mlir static LogicalResult parseIntAttrValue(AsmParser &parser, const Attribute &attr, unsigned &value, StringRef desc) { auto intAttr = attr.dyn_cast(); if (!intAttr) { parser.emitError(parser.getNameLoc(), "expected an integer type in ") << desc; return failure(); } if (intAttr.getType().isSignedInteger()) { int64_t attrVal = intAttr.getSInt(); if (attrVal < 0) { parser.emitError(parser.getNameLoc(), "expected an unsigned integer value in ") << desc; return failure(); } value = attrVal; } else if 
(intAttr.getType().isSignlessInteger()) { int64_t attrVal = intAttr.getInt(); if (attrVal < 0) { parser.emitError(parser.getNameLoc(), "expected an unsigned integer value in ") << desc; return failure(); } value = attrVal; } else { value = intAttr.getUInt(); } return success(); } // parse an array of integers static LogicalResult parseIntArrayAttr(AsmParser &parser, const NamedAttribute &attr, SmallVector &res, StringRef desc) { auto arrayAttr = attr.getValue().dyn_cast(); if (!arrayAttr) { parser.emitError(parser.getNameLoc(), "expected an array for ") << desc; return failure(); } for (Attribute i : arrayAttr) { unsigned value; if (parseIntAttrValue(parser, i, value, desc).failed()) return failure(); res.push_back(value); } return success(); }; static LogicalResult parseUInt(AsmParser &parser, const NamedAttribute &attr, unsigned &value, StringRef desc) { return parseIntAttrValue(parser, attr.getValue(), value, desc); }; //===----------------------------------------------------------------------===// // Attribute methods //===----------------------------------------------------------------------===// #define GET_ATTRDEF_CLASSES #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" SliceEncodingAttr BlockedEncodingAttr::squeeze(int axis) { return SliceEncodingAttr::get(getContext(), axis, *this); } unsigned BlockedEncodingAttr::getElemsPerThread(ArrayRef shape) const { size_t rank = shape.size(); auto sizePerThread = getSizePerThread(); auto warpsPerCTA = getWarpsPerCTA(); auto threadsPerWarp = getThreadsPerWarp(); assert(rank == sizePerThread.size() && "unexpected rank in BlockedEncodingAttr::getElemsPerThread"); SmallVector elemsPerThread(rank); for (size_t i = 0; i < rank; ++i) { unsigned t = sizePerThread[i] * threadsPerWarp[i] * warpsPerCTA[i]; elemsPerThread[i] = ceil(shape[i], t) * sizePerThread[i]; } return product(elemsPerThread); } template SmallVector SliceEncodingAttr::paddedShape(ArrayRef shape) const { size_t rank = shape.size(); unsigned dim = getDim(); SmallVector retShape(rank + 1); for (unsigned d = 0; d < rank + 1; ++d) { if (d < dim) retShape[d] = shape[d]; else if (d == dim) retShape[d] = 1; else retShape[d] = shape[d - 1]; } return retShape; } template SmallVector SliceEncodingAttr::paddedShape(ArrayRef shape) const; template SmallVector SliceEncodingAttr::paddedShape(ArrayRef shape) const; unsigned SliceEncodingAttr::getElemsPerThread(ArrayRef shape) const { size_t rank = shape.size(); auto parent = getParent(); return ::getElemsPerThread(parent, paddedShape(shape)); } unsigned MmaEncodingAttr::getElemsPerThread(ArrayRef shape) const { size_t rank = shape.size(); assert(rank == 2 && "Unexpected rank of mma layout"); assert((isVolta() || isAmpere()) && "Only version 1 and 2 is supported"); int res = 0; if (isVolta()) { unsigned mmasRow = ceil(shape[0], 16 * getWarpsPerCTA()[0]); unsigned mmasCol = ceil(shape[1], 16 * getWarpsPerCTA()[1]); // Each warp-level mma884 will perform a m16xn16xk4 mma, thus get a m16xn16 // matrix as result. 
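// i.e. each m16xn16 tile holds 16 * 16 = 256 accumulator values, spread over
// the 32 threads of a warp -> 8 elements per thread per tile.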
res = mmasRow * mmasCol * (16 * 16 / 32); } else if (isAmpere()) { unsigned elemsCol = ceil(shape[0], 16 * getWarpsPerCTA()[0]) * 2; unsigned elemsRow = ceil(shape[1], 8 * getWarpsPerCTA()[1]) * 2; res = elemsCol * elemsRow; } else { llvm_unreachable("Unexpected mma version"); } return res; } unsigned SharedEncodingAttr::getElemsPerThread(ArrayRef shape) const { // TODO: assert(0 && "SharedEncodingAttr::getElemsPerThread not implemented"); return 0; } unsigned DotOperandEncodingAttr::getElemsPerThread(ArrayRef shape) const { if (auto blockedLayout = getParent().dyn_cast()) { return blockedLayout.getElemsPerThread(shape); } assert(0 && "DotOperandEncodingAttr::getElemsPerThread not implemented"); return 0; } //===----------------------------------------------------------------------===// // Blocked Encoding //===----------------------------------------------------------------------===// Attribute BlockedEncodingAttr::parse(AsmParser &parser, Type type) { if (parser.parseLess().failed()) return {}; // Parse the data as a dictionary DictionaryAttr dict; if (parser.parseAttribute(dict).failed()) return {}; if (parser.parseGreater().failed()) return {}; SmallVector sizePerThread; SmallVector threadsPerWarp; SmallVector warpsPerCTA; SmallVector order; for (const NamedAttribute &attr : dict) { if (attr.getName() == "sizePerThread") { if (parseIntArrayAttr(parser, attr, sizePerThread, "number of elements per thread") .failed()) return {}; } else if (attr.getName() == "threadsPerWarp") { if (parseIntArrayAttr(parser, attr, threadsPerWarp, "number of threads per warp") .failed()) return {}; } else if (attr.getName() == "warpsPerCTA") { if (parseIntArrayAttr(parser, attr, warpsPerCTA, "number of warps per CTA") .failed()) return {}; } else if (attr.getName() == "order") { if (parseIntArrayAttr(parser, attr, order, "order").failed()) return {}; } else { parser.emitError(parser.getNameLoc(), "unexpected key: ") << attr.getName().strref(); return {}; } } auto ret = parser.getChecked( parser.getContext(), sizePerThread, threadsPerWarp, warpsPerCTA, order); return ret; } void BlockedEncodingAttr::print(mlir::AsmPrinter &printer) const { printer << "<{" << "sizePerThread = [" << getSizePerThread() << "]" << ", threadsPerWarp = [" << getThreadsPerWarp() << "]" << ", warpsPerCTA = [" << getWarpsPerCTA() << "]" << ", order = [" << getOrder() << "]" << "}>"; } //===----------------------------------------------------------------------===// // MMA encoding //===----------------------------------------------------------------------===// Attribute MmaEncodingAttr::parse(AsmParser &parser, Type type) { if (parser.parseLess().failed()) return {}; DictionaryAttr dict; if (parser.parseAttribute(dict).failed()) return {}; if (parser.parseGreater().failed()) return {}; unsigned versionMajor = 0; unsigned versionMinor = 0; SmallVector warpsPerCTA; for (const NamedAttribute &attr : dict) { if (attr.getName() == "versionMajor") { if (parseUInt(parser, attr, versionMajor, "versionMajor").failed()) return {}; } if (attr.getName() == "versionMinor") { if (parseUInt(parser, attr, versionMinor, "versionMinor").failed()) return {}; } if (attr.getName() == "warpsPerCTA") { if (parseIntArrayAttr(parser, attr, warpsPerCTA, "warpsPerCTA").failed()) return {}; } } return parser.getChecked(parser.getContext(), versionMajor, versionMinor, warpsPerCTA); } void MmaEncodingAttr::print(AsmPrinter &printer) const { printer << "<{" << "versionMajor = " << getVersionMajor() << ", " << "versionMinor = " << getVersionMinor() << ", " << 
"warpsPerCTA = [" << getWarpsPerCTA() << "]" << "}>"; } //===----------------------------------------------------------------------===// // Sliced Encoding //===----------------------------------------------------------------------===// Attribute SliceEncodingAttr::parse(AsmParser &parser, Type type) { if (parser.parseLess().failed()) return {}; NamedAttrList attrs; if (parser.parseOptionalAttrDict(attrs).failed()) return {}; if (parser.parseGreater().failed()) return {}; unsigned dim = attrs.get("dim").cast().getInt(); Attribute parent = attrs.get("parent"); return parser.getChecked(parser.getContext(), dim, parent); } void SliceEncodingAttr::print(mlir::AsmPrinter &printer) const { printer << "<{" << "dim = " << getDim() << ", " << "parent = " << getParent() << "}>"; } //===----------------------------------------------------------------------===// // Shared encoding //===----------------------------------------------------------------------===// Attribute SharedEncodingAttr::parse(AsmParser &parser, Type type) { if (parser.parseLess().failed()) return {}; // Parse the data as a dictionary DictionaryAttr dict; if (parser.parseAttribute(dict).failed()) return {}; if (parser.parseGreater().failed()) return {}; unsigned vec = 0; unsigned perPhase = 0; unsigned maxPhase = 0; SmallVector order; for (const NamedAttribute &attr : dict) { if (attr.getName() == "vec") { if (parseUInt(parser, attr, vec, "vec").failed()) return {}; } else if (attr.getName() == "perPhase") { if (parseUInt(parser, attr, perPhase, "perPhase").failed()) return {}; } else if (attr.getName() == "maxPhase") { if (parseUInt(parser, attr, maxPhase, "maxPhase").failed()) return {}; } else if (attr.getName() == "order") { if (parseIntArrayAttr(parser, attr, order, "order").failed()) return {}; } else { parser.emitError(parser.getNameLoc(), "unexpected key: ") << attr.getName().strref(); return {}; } } return parser.getChecked(parser.getContext(), vec, perPhase, maxPhase, order); } void SharedEncodingAttr::print(AsmPrinter &printer) const { printer << "<{" << "vec = " << getVec() << ", perPhase = " << getPerPhase() << ", maxPhase = " << getMaxPhase() << ", order = [" << getOrder() << "]" << "}>"; } //===----------------------------------------------------------------------===// // Mma encoding //===----------------------------------------------------------------------===// bool MmaEncodingAttr::isVolta() const { return getVersionMajor() == 1; } bool MmaEncodingAttr::isAmpere() const { return getVersionMajor() == 2; } // Get [isARow, isBRow, isAVec4, isBVec4, id] from versionMinor std::tuple MmaEncodingAttr::decodeVoltaLayoutStates() const { unsigned versionMinor = getVersionMinor(); bool isARow = versionMinor & (1 << 0); bool isBRow = versionMinor & (1 << 1); bool isAVec4 = versionMinor & (1 << 2); bool isBVec4 = versionMinor & (1 << 3); int id = 0; for (int i = numBitsToHoldMmaV1ID - 1; i >= 0; --i) id = (id << 1) + static_cast(versionMinor & (1 << (4 + i))); return std::make_tuple(isARow, isBRow, isAVec4, isBVec4, id); } //===----------------------------------------------------------------------===// // DotOperand Encoding //===----------------------------------------------------------------------===// Attribute DotOperandEncodingAttr::parse(AsmParser &parser, Type type) { if (parser.parseLess().failed()) return {}; NamedAttrList attrs; if (parser.parseOptionalAttrDict(attrs).failed()) return {}; if (parser.parseGreater().failed()) return {}; unsigned opIdx = attrs.get("opIdx").cast().getInt(); Attribute parent = 
attrs.get("parent"); Attribute isMMAv1Row; if (parent.isa() && parent.cast().isVolta()) { isMMAv1Row = attrs.get("isMMAv1Row"); if (!isMMAv1Row) llvm::report_fatal_error("isMMAv1Row attribute is missing"); } return parser.getChecked(parser.getContext(), opIdx, parent, isMMAv1Row); } void DotOperandEncodingAttr::print(mlir::AsmPrinter &printer) const { printer << "<{" << "opIdx = " << getOpIdx() << ", " << "parent = " << getParent(); if (getIsMMAv1Row()) printer << ", isMMAv1Row = " << getIsMMAv1Row(); printer << "}>"; } //===----------------------------------------------------------------------===// // InsertSliceAsyncOp //===----------------------------------------------------------------------===// ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser, OperationState &result) { SmallVector allOperands; Type srcType, dstType; SMLoc allOperandLoc = parser.getCurrentLocation(); if (parser.parseOperandList(allOperands) || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.parseCustomTypeWithFallback(srcType) || parser.parseArrow() || parser.parseCustomTypeWithFallback(dstType)) return failure(); result.addTypes(dstType); SmallVector operandTypes; operandTypes.push_back(srcType); // src operandTypes.push_back(dstType); // dst operandTypes.push_back( IntegerType::get(parser.getBuilder().getContext(), 32)); // index int hasMask = 0, hasOther = 0; if (allOperands.size() >= 4) { operandTypes.push_back(triton::getI1SameShape(srcType)); // mask hasMask = 1; } if (allOperands.size() >= 5) { operandTypes.push_back(triton::getPointeeType(srcType)); // other hasOther = 1; } if (parser.resolveOperands(allOperands, operandTypes, allOperandLoc, result.operands)) return failure(); // Deduce operand_segment_sizes from the number of the operands. auto operand_segment_sizesAttrName = InsertSliceAsyncOp::operand_segment_sizesAttrName(result.name); result.addAttribute( operand_segment_sizesAttrName, parser.getBuilder().getI32VectorAttr({1, 1, 1, hasMask, hasOther})); return success(); } void printInsertSliceAsyncOp(OpAsmPrinter &printer, InsertSliceAsyncOp insertSliceAsyncOp) { printer << " "; printer << insertSliceAsyncOp.getOperation()->getOperands(); // "operand_segment_sizes" can be deduced, so we don't print it. 
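// Sketch of the printed form (operand names and exact types are
// illustrative, not from a real test):
//   %res = triton_gpu.insert_slice_async %src, %dst, %index, %mask, %other
//          {...} : tensor<16x64x!tt.ptr<f16>, #blocked> -> tensor<2x16x64xf16, #shared>
// matching the `srcType -> dstType` pair consumed by the parser above.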
printer.printOptionalAttrDict( insertSliceAsyncOp->getAttrs(), {insertSliceAsyncOp.operand_segment_sizesAttrName()}); printer << " : "; printer.printStrippedAttrOrType(insertSliceAsyncOp.src().getType()); printer << " -> "; printer.printStrippedAttrOrType(insertSliceAsyncOp.result().getType()); } //===----------------------------------------------------------------------===// // ASM Interface (i.e.: alias) //===----------------------------------------------------------------------===// class TritonGPUOpAsmInterface : public OpAsmDialectInterface { public: using OpAsmDialectInterface::OpAsmDialectInterface; AliasResult getAlias(Attribute attr, raw_ostream &os) const override { if (auto mmaAttr = attr.dyn_cast()) { os << "mma"; return AliasResult::FinalAlias; } else if (auto sharedAttr = attr.dyn_cast()) { os << "shared"; return AliasResult::FinalAlias; } else if (auto blockedAttr = attr.dyn_cast()) { os << "blocked"; return AliasResult::FinalAlias; } /* else if (auto sliceAttr = attr.dyn_cast()) { os << "slice"; return AliasResult::FinalAlias; } */ return OpAsmDialectInterface::getAlias(attr, os); } }; struct TritonGPUInferLayoutInterface : public triton::DialectInferLayoutInterface { using DialectInferLayoutInterface::DialectInferLayoutInterface; LogicalResult inferReduceOpEncoding(Attribute operandEncoding, unsigned axis, Attribute &resultEncoding) const override { resultEncoding = SliceEncodingAttr::get(getDialect()->getContext(), axis, operandEncoding); return success(); } LogicalResult inferTransOpEncoding(Attribute operandEncoding, Attribute &resultEncoding) const override { SharedEncodingAttr sharedEncoding = operandEncoding.dyn_cast(); if (!sharedEncoding) return failure(); SmallVector retOrder(sharedEncoding.getOrder().begin(), sharedEncoding.getOrder().end()); std::reverse(retOrder.begin(), retOrder.end()); resultEncoding = SharedEncodingAttr::get( getDialect()->getContext(), sharedEncoding.getVec(), sharedEncoding.getPerPhase(), sharedEncoding.getMaxPhase(), retOrder); return mlir::success(); } LogicalResult inferExpandDimsOpEncoding(Attribute operandEncoding, unsigned axis, Attribute &resultEncoding, Optional location) const override { auto sliceEncoding = operandEncoding.dyn_cast(); if (!sliceEncoding) return emitOptionalError( location, "ExpandDimsOp operand encoding must be SliceEncodingAttr"); if (sliceEncoding.getDim() != axis) return emitOptionalError( location, "Incompatible slice dimension for ExpandDimsOp operand"); resultEncoding = sliceEncoding.getParent(); return success(); } LogicalResult inferDotOpEncoding(Attribute operandEncoding, unsigned opIdx, Attribute retEncoding, Optional location) const override { if (auto dotOpEnc = operandEncoding.dyn_cast()) { if (opIdx != dotOpEnc.getOpIdx()) return emitOptionalError(location, "Wrong opIdx"); if (retEncoding != dotOpEnc.getParent()) return emitOptionalError(location, "Incompatible parent encoding"); } else return emitOptionalError( location, "Dot's a/b's encoding should be of DotOperandEncodingAttr"); return success(); } }; void TritonGPUDialect::initialize() { addAttributes< #define GET_ATTRDEF_LIST #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" >(); addOperations< #define GET_OP_LIST #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc" >(); addInterfaces(); addInterfaces(); } #define GET_OP_CLASSES #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc" // verify TritonGPU ops LogicalResult TritonGPUDialect::verifyOperationAttribute(Operation *op, NamedAttribute attr) { // TODO: fill this. 
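// For now every dialect attribute (e.g. the module-level num-warps attribute
// installed by the TritonToTritonGPU conversion) is accepted unchecked.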
return success(); } triton-2.0.0/lib/Dialect/TritonGPU/IR/Traits.cpp000066400000000000000000000006441440023377100212670ustar00rootroot00000000000000#include "triton/Dialect/TritonGPU/IR/Traits.h" #include "triton/Analysis/Utility.h" mlir::LogicalResult mlir::OpTrait::impl::verifyResultsAreSharedEncoding(Operation *op) { if (failed(verifyAtLeastNResults(op, 1))) return failure(); for (auto result : op->getResults()) if (!isSharedEncoding(result)) return op->emitOpError() << "requires all results to be shared encoding"; return success(); }; triton-2.0.0/lib/Dialect/TritonGPU/Transforms/000077500000000000000000000000001440023377100211355ustar00rootroot00000000000000triton-2.0.0/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt000066400000000000000000000010101440023377100236650ustar00rootroot00000000000000set(LLVM_TARGET_DEFINITIONS Combine.td) mlir_tablegen(TritonGPUCombine.inc -gen-rewriters) add_public_tablegen_target(TritonGPUCombineIncGen) add_mlir_dialect_library(TritonGPUTransforms Coalesce.cpp CanonicalizeLoops.cpp Combine.cpp Pipeline.cpp Prefetch.cpp ReorderInstructions.cpp DecomposeConversions.cpp TritonGPUConversion.cpp UpdateMmaForVolta.cpp Utility.cpp DEPENDS TritonGPUTransformsIncGen TritonGPUCombineIncGen LINK_LIBS PUBLIC TritonIR TritonGPUIR MLIRTransformUtils ) triton-2.0.0/lib/Dialect/TritonGPU/Transforms/CanonicalizeLoops.cpp000066400000000000000000000040371440023377100252610ustar00rootroot00000000000000#include "mlir/Analysis/SliceAnalysis.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" using namespace mlir; using namespace mlir::triton; #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" namespace { struct CanonicalizePass : public TritonGPUCanonicalizeLoopsBase { CanonicalizePass() = default; void runOnOperation() override { // Canonicalize pass may have created dead code that // standard scf.for canonicalization cannot handle // as of LLVM 14. For example, the iteration arguments // for the pointer of the synchronous loads that are // discarded. // The following piece of code is a workaround to // very crudely remove dead code, by making an iteration // argument yield itself if it is not used to create // side effects anywhere. 
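// Illustrative shape of the rewrite (hypothetical IR, not a real test):
//   %r = scf.for ... iter_args(%a = %init) -> (...) {
//     ...
//     scf.yield %next
//   }
// If %a feeds no other yielded value and %r is never used after the loop,
// the corresponding yield operand is replaced with %a itself, turning the
// iter_arg into a pass-through that scf.for canonicalization can delete.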
getOperation()->walk([&](scf::ForOp forOp) -> void { for (size_t i = 0; i < forOp.getNumResults(); ++i) { // condition 1: no other iter arguments depend on it SetVector fwdSlice; mlir::getForwardSlice(forOp.getRegionIterArgs()[i], &fwdSlice); Operation *yieldOp = forOp.getBody()->getTerminator(); bool noOtherDependency = std::all_of( yieldOp->operand_begin(), yieldOp->operand_end(), [&](Value arg) { return arg == yieldOp->getOperand(i) || !fwdSlice.contains(arg.getDefiningOp()); }); // condition 2: final value is not used after the loop auto retVal = forOp.getResult(i); bool noUserAfterLoop = retVal.getUsers().empty(); // yielding the region iter arg will cause loop canonicalization // to clean up the dead code if (noOtherDependency && noUserAfterLoop) { yieldOp->setOperand(i, forOp.getRegionIterArgs()[i]); } } }); } }; } // anonymous namespace std::unique_ptr mlir::createTritonGPUCanonicalizeLoopsPass() { return std::make_unique(); }triton-2.0.0/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp000066400000000000000000000171411440023377100233630ustar00rootroot00000000000000#include "mlir/Analysis/SliceAnalysis.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include using namespace mlir; using namespace mlir::triton; #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" template SmallVector argSort(const T &arr) { SmallVector ret(arr.size()); std::iota(ret.begin(), ret.end(), 0); std::sort(ret.begin(), ret.end(), [&](unsigned x, unsigned y) { return arr[x] > arr[y]; }); return ret; } typedef DenseMap> LayoutMap; struct CoalescePass : public TritonGPUCoalesceBase { Attribute getCoalescedEncoding(AxisInfoAnalysis &axisInfo, Value ptr, int numWarps) { auto origType = ptr.getType().cast(); // Get the shape of the tensor. 
size_t rank = origType.getRank(); AxisInfo info = axisInfo.lookupLatticeElement(ptr)->getValue(); // Get the contiguity order of `ptr` auto order = argSort(info.getContiguity()); // The desired divisibility is the maximum divisibility // among all dependent pointers who have the same order as // `ptr` SetVector withSameOrder; withSameOrder.insert(ptr); if (ptr.getDefiningOp()) for (Operation *op : mlir::multiRootGetSlice(ptr.getDefiningOp())) { for (Value val : op->getResults()) { if (val.getType() != origType) continue; auto valInfo = axisInfo.lookupLatticeElement(val); auto currOrder = argSort(valInfo->getValue().getContiguity()); if (order == currOrder) withSameOrder.insert(val); } } int numElems = product(origType.getShape()); int numThreads = numWarps * 32; int numElemsPerThread = std::max(numElems / numThreads, 1); // Thread tile size depends on memory alignment SmallVector sizePerThread(rank, 1); unsigned elemNumBits = getPointeeBitWidth(origType); unsigned elemNumBytes = std::max(elemNumBits / 8, 1u); unsigned perThread = 1; for (Value val : withSameOrder) { AxisInfo info = axisInfo.lookupLatticeElement(val)->getValue(); unsigned maxMultipleBytes = info.getDivisibility(order[0]); unsigned maxMultiple = std::max(maxMultipleBytes / elemNumBytes, 1u); unsigned maxContig = info.getContiguity(order[0]); unsigned alignment = std::min(maxMultiple, maxContig); unsigned currPerThread = std::min(alignment, 128 / elemNumBits); perThread = std::max(perThread, currPerThread); } sizePerThread[order[0]] = std::min(perThread, numElemsPerThread); SmallVector dims(rank); std::iota(dims.begin(), dims.end(), 0); // create encoding Attribute encoding = triton::gpu::BlockedEncodingAttr::get( &getContext(), origType.getShape(), sizePerThread, order, numWarps); return encoding; } std::function getTypeConverter(AxisInfoAnalysis &axisInfo, Value ptr, int numWarps) { Attribute encoding = getCoalescedEncoding(axisInfo, ptr, numWarps); return [encoding](Type _type) { RankedTensorType type = _type.cast(); return RankedTensorType::get(type.getShape(), type.getElementType(), encoding); }; } template void coalesceOp(LayoutMap &layoutMap, Operation *op, Value ptr, OpBuilder builder) { RankedTensorType ty = ptr.getType().template dyn_cast(); if (!ty) return; auto convertType = layoutMap.lookup(ptr); // convert operands SmallVector newArgs; for (auto v : op->getOperands()) { auto vTy = v.getType().dyn_cast(); if (vTy && !vTy.getEncoding().isa()) newArgs.push_back(builder.create( op->getLoc(), convertType(v.getType()), v)); else newArgs.push_back(v); } // convert output types SmallVector newTypes; for (auto t : op->getResultTypes()) { bool is_async = std::is_same::value; newTypes.push_back(is_async ? 
t : convertType(t)); } // construct new op with the new encoding Operation *newOp = builder.create(op->getLoc(), newTypes, newArgs, op->getAttrs()); // cast the results back to the original layout for (size_t i = 0; i < op->getNumResults(); i++) { Value newResult = newOp->getResult(i); if (newTypes[i] != op->getResultTypes()[i]) { newResult = builder.create( op->getLoc(), op->getResult(i).getType(), newResult); } op->getResult(i).replaceAllUsesWith(newResult); } op->erase(); } void runOnOperation() override { Operation *op = getOperation(); // Run axis info analysis AxisInfoAnalysis axisInfo(&getContext()); axisInfo.run(op); // For each i/o operation, we determine what layout // the pointers should have for best memory coalescing LayoutMap layoutMap; op->walk([&](Operation *curr) { Value ptr; if (auto op = dyn_cast(curr)) ptr = op.ptr(); if (auto op = dyn_cast(curr)) ptr = op.ptr(); if (auto op = dyn_cast(curr)) ptr = op.ptr(); if (auto op = dyn_cast(curr)) ptr = op.src(); if (auto op = dyn_cast(curr)) ptr = op.ptr(); if (!ptr) return; RankedTensorType ty = ptr.getType().template dyn_cast(); if (!ty || !ty.getElementType().isa()) return; AxisInfo info = axisInfo.lookupLatticeElement(ptr)->getValue(); auto mod = curr->getParentOfType(); int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); auto convertType = getTypeConverter(axisInfo, ptr, numWarps); layoutMap[ptr] = convertType; }); // For each memory op that has a layout L1: // 1. Create a coalesced memory layout L2 of the pointer operands // 2. Convert all operands from layout L1 to layout L2 // 3. Create a new memory op that consumes these operands and // produces a tensor with layout L2 // 4. Convert the output of this new memory op back to L1 // 5. Replace all the uses of the original memory op by the new one op->walk([&](Operation *curr) { OpBuilder builder(curr); if (auto load = dyn_cast(curr)) { coalesceOp(layoutMap, curr, load.ptr(), builder); return; } if (auto op = dyn_cast(curr)) { coalesceOp(layoutMap, curr, op.ptr(), builder); return; } if (auto op = dyn_cast(curr)) { coalesceOp(layoutMap, curr, op.ptr(), builder); return; } if (auto load = dyn_cast(curr)) { coalesceOp(layoutMap, curr, load.src(), builder); return; } if (auto store = dyn_cast(curr)) { coalesceOp(layoutMap, curr, store.ptr(), builder); return; } }); } }; std::unique_ptr mlir::createTritonGPUCoalescePass() { return std::make_unique(); } triton-2.0.0/lib/Dialect/TritonGPU/Transforms/Combine.cpp000066400000000000000000001470061440023377100232250ustar00rootroot00000000000000#include "Utility.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Verifier.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" #include "mlir/Transforms/RegionUtils.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h" #include using namespace mlir; namespace { #include "TritonGPUCombine.inc" using triton::DotOp; using triton::gpu::ConvertLayoutOp; using triton::gpu::DotOperandEncodingAttr; using triton::gpu::MmaEncodingAttr; using 
triton::gpu::SliceEncodingAttr; // ----------------------------------------------------------------------------- // // ----------------------------------------------------------------------------- // convert(blocked, dot_operand) -> // convert(blocked, mma) + convert(mma, dot_operand) // if this value is itself the result of a dot operation // this is a heuristic to accommodate some pattern seen in fused attention // kernels. // TODO: replace this by something more generic, i.e. layout-aware CSE class DecomposeDotOperand : public mlir::RewritePattern { public: explicit DecomposeDotOperand(mlir::MLIRContext *context) : mlir::RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 1, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { if (!llvm::isa(op)) return mlir::failure(); auto convert = llvm::cast(op); auto srcType = convert.getOperand().getType().cast(); auto dstType = convert.getType().cast(); if (srcType.getEncoding().isa() && dstType.getEncoding().isa()) { auto dstDotOperand = dstType.getEncoding().cast(); auto dstParent = dstDotOperand.getParent(); if (dstDotOperand.getOpIdx() == 1 || !dstParent.isa()) return mlir::failure(); auto dstParentMma = dstParent.cast(); if (dstParentMma.isVolta() || dstParentMma.getWarpsPerCTA()[1] > 1) return mlir::failure(); SetVector bwdSlices; mlir::getBackwardSlice(convert.getResult(), &bwdSlices); if (llvm::find_if(bwdSlices, [](Operation *op) { return isa(op); }) == bwdSlices.end()) return mlir::failure(); auto tmpType = RankedTensorType::get( dstType.getShape(), dstType.getElementType(), dstParentMma); auto tmp = rewriter.create( convert.getLoc(), tmpType, convert.getOperand()); auto newConvert = rewriter.create( convert.getLoc(), dstType, tmp); rewriter.replaceOp(op, {newConvert}); return mlir::success(); } return mlir::failure(); } }; class SimplifyReduceCvt : public mlir::RewritePattern { public: explicit SimplifyReduceCvt(mlir::MLIRContext *context) : mlir::RewritePattern(triton::ReduceOp::getOperationName(), 2, context) { } mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto reduce = cast(*op); auto reduceArg = dyn_cast_or_null( reduce.getOperand().getDefiningOp()); if (!reduceArg) return mlir::failure(); // this may generate unsupported conversions in the LLVM codegen if (reduceArg.getOperand() .getType() .cast() .getEncoding() .isa()) return mlir::failure(); auto newReduce = rewriter.create( op->getLoc(), reduce.redOp(), reduceArg.getOperand(), reduce.axis()); if (isa( *reduceArg.getOperand().getDefiningOp())) return mlir::failure(); Value newRet = newReduce.getResult(); // it's still beneficial to move the conversion // to after the reduce if necessary since it will be // done on a rank-reduced tensor hence cheaper if (newRet.getType() != reduce.getResult().getType()) newRet = rewriter.create( op->getLoc(), reduce.getResult().getType(), newRet); rewriter.replaceOp(op, newRet); return success(); } }; // Layout conversions can't deduce their return type automatically. 
// IIUC they are therefore not handled by DRR right now class SimplifyConversion : public mlir::RewritePattern { public: explicit SimplifyConversion(mlir::MLIRContext *context) : mlir::RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 4, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { if (!llvm::isa(op)) return mlir::failure(); auto convert = llvm::cast(op); // we don't handle conversions to DotOperandEncodingAttr // this is a heuristics to accommodate fused attention auto srcType = convert.getOperand().getType().cast(); auto dstType = convert.getType().cast(); if (dstType.getEncoding().isa() && srcType.getEncoding().isa()) return mlir::failure(); // convert to the same layout -- we can delete if (op->getResultTypes() == op->getOperandTypes()) { rewriter.replaceOp(op, op->getOperands()); return mlir::success(); } Operation *arg = op->getOperand(0).getDefiningOp(); // block argument if (!arg) return mlir::failure(); // cvt(alloc_tensor(x), type2) -> alloc_tensor(x, type2) auto alloc_tensor = dyn_cast(arg); if (alloc_tensor) { if (!isSharedEncoding(op->getResult(0))) { return mlir::failure(); } rewriter.replaceOpWithNewOp( op, op->getResult(0).getType()); return mlir::success(); } // cvt(insert_slice(x), type2) -> insert_slice(cvt(x, type2)) auto insert_slice = dyn_cast(arg); if (insert_slice) { if (!isSharedEncoding(op->getResult(0))) { return mlir::failure(); } auto newType = op->getResult(0).getType().cast(); // Ensure that the new insert_slice op is placed in the same place as the // old insert_slice op. Otherwise, the new insert_slice op may be placed // after the async_wait op, which is not allowed. OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(insert_slice); auto newArg = rewriter.create( op->getLoc(), newType, insert_slice.dst()); rewriter.replaceOpWithNewOp( op, newType, insert_slice.src(), newArg.getResult(), insert_slice.index(), insert_slice.mask(), insert_slice.other(), insert_slice.cache(), insert_slice.evict(), insert_slice.isVolatile(), insert_slice.axis()); return mlir::success(); } // cvt(extract_slice(x), type2) -> extract_slice(cvt(x, type2)) auto extract_slice = dyn_cast(arg); if (extract_slice) { if (!isSharedEncoding(op->getResult(0))) { return mlir::failure(); } auto origType = extract_slice.source().getType().cast(); auto newType = RankedTensorType::get( origType.getShape(), origType.getElementType(), op->getResult(0).getType().cast().getEncoding()); auto origResType = op->getResult(0).getType().cast(); auto resType = RankedTensorType::get( origResType.getShape(), origResType.getElementType(), extract_slice.getType().cast().getEncoding()); // Ensure that the new extract_slice op is placed in the same place as the // old extract_slice op. Otherwise, the new extract_slice op may be placed // after the async_wait op, which is not allowed. 
OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(extract_slice); auto newArg = rewriter.create( op->getLoc(), newType, extract_slice.source()); rewriter.replaceOpWithNewOp( op, resType, newArg.getResult(), extract_slice.offsets(), extract_slice.sizes(), extract_slice.strides(), extract_slice.static_offsets(), extract_slice.static_sizes(), extract_slice.static_strides()); return mlir::success(); } // cvt(cvt(x, type1), type2) -> cvt(x, type2) if (llvm::isa(arg)) { if (arg->getOperand(0).getDefiningOp() && !isSharedEncoding(arg->getOperand(0)) && isSharedEncoding(convert.getOperand()) && !isSharedEncoding(convert.getResult())) { return mlir::failure(); } if (isSharedEncoding(convert.getOperand()) && isSharedEncoding(convert.getResult())) { return mlir::failure(); } auto srcType = convert.getOperand().getType().cast(); auto srcShared = srcType.getEncoding().dyn_cast(); if (srcShared && srcShared.getVec() > 1) return mlir::failure(); rewriter.replaceOpWithNewOp( op, op->getResultTypes().front(), arg->getOperand(0)); return mlir::success(); } // cvt(type1, splat(type2, x)) -> splat(type1, x) if (auto splat = llvm::dyn_cast(arg)) { rewriter.replaceOpWithNewOp(op, op->getResultTypes(), splat.src()); return mlir::success(); } // cvt(type1, make_range(type2, x)) -> make_range(type1, x) if (auto range = llvm::dyn_cast(arg)) { rewriter.replaceOpWithNewOp( op, op->getResultTypes(), range.start(), range.end()); return mlir::success(); } // cvt(type, constant) -> constant if (auto cst = llvm::dyn_cast(arg)) if (auto ret = cst.getValue().dyn_cast()) { auto newRet = SplatElementsAttr::get(op->getResultTypes().front(), ret.getSplatValue()); rewriter.replaceOpWithNewOp(op, newRet); return mlir::success(); } return mlir::failure(); } }; // ----------------------------------------------------------------------------- // // ----------------------------------------------------------------------------- // TODO: Interface LogicalResult invertEncoding(Attribute targetEncoding, Operation *op, Attribute &ret) { ret = targetEncoding; if (auto expand_dims = dyn_cast(op)) { ret = triton::gpu::SliceEncodingAttr::get( op->getContext(), expand_dims.axis(), targetEncoding); } if (auto reduce = dyn_cast(op)) { auto sliceEncoding = targetEncoding.dyn_cast(); if (!sliceEncoding) return failure(); ret = sliceEncoding.getParent(); } return success(); } inline bool expensiveLoadOrStore(Operation *op, Attribute &targetEncoding) { // Case 1: A size 1 tensor is not expensive since all threads will load the // same if (isSingleValue(op->getOperand(0))) return false; auto ptr = op->getOperand(0); if (auto tensorTy = ptr.getType().dyn_cast()) { auto encoding = tensorTy.getEncoding(); // Case 2: Different type conversion is expensive (e.g., mma <-> block) if (encoding.getTypeID() != targetEncoding.getTypeID()) return true; auto sizePerThread = triton::gpu::getSizePerThread(encoding); auto targetSizePerThread = triton::gpu::getSizePerThread(targetEncoding); auto order = triton::gpu::getOrder(encoding); auto targetOrder = triton::gpu::getOrder(targetEncoding); // Case 3: The targeEncoding may expose more vectorization opportunities return sizePerThread[order[0]] >= targetSizePerThread[targetOrder[0]]; } return false; } inline bool expensiveToRemat(Operation *op, Attribute &targetEncoding) { if (!op) return true; if (isa(op)) return expensiveLoadOrStore(op, targetEncoding); if (isa(op)) return true; if (isa( op)) return true; return false; } LogicalResult simulateBackwardRematerialization( Operation *initOp, SetVector 
&processed, SetVector &layout, llvm::MapVector &toConvert, const Attribute &targetEncoding) { // DFS std::vector> queue; queue.emplace_back(initOp, targetEncoding); // We want to see the effect of converting `initOp` to a new layout // so we initialize `numCvts = 1`. int numCvts = 1; while (!queue.empty()) { Operation *currOp; Attribute currLayout; std::tie(currOp, currLayout) = queue.back(); queue.pop_back(); // If the current operation is expensive to rematerialize, // we stop everything if (expensiveToRemat(currOp, currLayout)) break; // A conversion will be removed here (i.e. transferred to operands) numCvts -= 1; // Done processing processed.insert(currOp); layout.insert(currLayout); // Add all operands to the queue for (Value argI : currOp->getOperands()) { Attribute newEncoding; // Cannot invert the current encoding for this operand // we stop everything if (failed(invertEncoding(currLayout, currOp, newEncoding))) return mlir::failure(); if (toConvert.count(argI) && toConvert[argI] != newEncoding) return mlir::failure(); Operation *opArgI = argI.getDefiningOp(); toConvert.insert({argI, newEncoding}); // 1. Only convert RankedTensorType // 2. Skip if there's no defining op // 3. Skip if the defining op has already been processed // 4. Skip or the defining op is in a different block if (!argI.getType().isa() || !opArgI || processed.contains(opArgI) || opArgI->getBlock() != currOp->getBlock()) continue; // If the conversion can be folded into opArgI then // we don't count this conversion as expensive if (isa(*opArgI)) continue; // We add one expensive conversion for the current operand numCvts += 1; queue.emplace_back(opArgI, newEncoding); } } // if rematerialization would add more conversions than it removes // then we don't do it if (numCvts > 0) return mlir::failure(); return mlir::success(); } // Operation *cloneWithInferType(mlir::PatternRewriter &rewriter, Operation *op, BlockAndValueMapping &mapping) { Operation *newOp = rewriter.clone(*op, mapping); auto origType = op->getResult(0).getType().cast(); auto newType = RankedTensorType::get( origType.getShape(), origType.getElementType(), newOp->getOperand(0).getType().cast().getEncoding()); newOp->getResult(0).setType(newType); auto typeInfer = dyn_cast(newOp); if (typeInfer) { SmallVector newTypes; auto success = typeInfer.inferReturnTypes( newOp->getContext(), newOp->getLoc(), newOp->getOperands(), newOp->getAttrDictionary(), newOp->getRegions(), newTypes); if (succeeded(success)) newOp->getResult(0).setType(newTypes.front()); } return newOp; } // op(cvt(arg_0), arg_1, ..., arg_n) // -> cvt(op(arg_0, cvt(arg_1), ..., cvt(arg_n))) void pushConversionForward(triton::gpu::ConvertLayoutOp cvt, SetVector &cvtSlices, mlir::PatternRewriter &rewriter) { auto srcEncoding = cvt.getOperand().getType().cast().getEncoding(); auto dstEncoding = cvt.getResult().getType().cast().getEncoding(); BlockAndValueMapping mapping; auto op = cvtSlices.front(); for (Value arg : op->getOperands()) { if (arg.getDefiningOp() == cvt) mapping.map(arg, cvt.getOperand()); else { auto oldType = arg.getType().cast(); auto newType = RankedTensorType::get( oldType.getShape(), oldType.getElementType(), srcEncoding); auto cvtI = rewriter.create(arg.getLoc(), newType, arg); if (Operation *argOp = arg.getDefiningOp()) cvtI->moveAfter(argOp); mapping.map(arg, cvtI); } } rewriter.setInsertionPoint(op); auto *newOp = cloneWithInferType(rewriter, op, mapping); auto newType = newOp->getResult(0).getType().cast(); auto newCvtType = RankedTensorType::get( newType.getShape(), 
newType.getElementType(), dstEncoding); auto newCvt = rewriter.create( newOp->getLoc(), newCvtType, newOp->getResult(0)); rewriter.replaceOp(op, newCvt->getResults()); } // class MoveConvertOutOfIf : public mlir::RewritePattern { public: explicit MoveConvertOutOfIf(mlir::MLIRContext *context) : mlir::RewritePattern(scf::IfOp::getOperationName(), 2, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto ifOp = cast(*op); // If “scf.if” defines no values, “scf.yield” will be inserted implicitly. // However, "scf.else" is not required to be present, so we need to check // if it exists. auto thenYield = ifOp.thenYield(); int numOps = thenYield.getNumOperands(); SmallVector newThenYieldOps = thenYield.getOperands(); SetVector thenCvts; SmallVector newRetTypes; bool hasElse = !ifOp.getElseRegion().empty(); scf::YieldOp elseYield; SmallVector newElseYieldOps; SetVector elseCvts; if (hasElse) { elseYield = ifOp.elseYield(); newElseYieldOps = elseYield.getOperands(); } BlockAndValueMapping mapping; for (size_t i = 0; i < numOps; i++) { auto thenCvt = dyn_cast( thenYield.getOperand(i).getDefiningOp()); if (hasElse) { auto elseYield = ifOp.elseYield(); auto elseCvt = dyn_cast( elseYield.getOperand(i).getDefiningOp()); if (thenCvt && elseCvt && std::distance(elseCvt->user_begin(), elseCvt->user_end()) == 1 && std::distance(thenCvt->user_begin(), thenCvt->user_end()) == 1 && thenCvt.getOperand().getType() == elseCvt.getOperand().getType()) { // If thenCvt and elseCvt's type are the same, it means a single // conversion is enough to replace both of them. We can move the // conversion out of scf.if and replace both thenCvt and elseCvt with // the new conversion. mapping.map(thenCvt.getResult(), thenCvt.getOperand()); thenCvts.insert((Operation *)thenCvt); newRetTypes.push_back(thenCvt.getOperand().getType()); mapping.map(elseCvt.getResult(), elseCvt.getOperand()); elseCvts.insert((Operation *)elseCvt); } else // Cannot move out of scf.if because thenCvt != elseCvt // Moving it out of scf.if will introduce a new conversion newRetTypes.push_back(thenYield.getOperand(i).getType()); } else { if (thenCvt && std::distance(thenCvt->user_begin(), thenCvt->user_end()) == 1) { // If there's only a single use of the conversion then we can move it mapping.map(thenCvt.getResult(), thenCvt.getOperand()); thenCvts.insert((Operation *)thenCvt); newRetTypes.push_back(thenCvt.getOperand().getType()); } else // Cannot move out of scf.if because either there's another use of // the conversion or there's no conversion at all newRetTypes.push_back(thenYield.getOperand(i).getType()); } } if (mapping.getValueMap().empty()) return mlir::failure(); auto newIfOp = rewriter.create(ifOp.getLoc(), newRetTypes, ifOp.getCondition(), hasElse); auto rematerialize = [&](Block *block, SetVector &cvts) { for (Operation &op : block->getOperations()) { if (cvts.contains(&op)) { if (mapping.contains(op.getOperand(0))) mapping.map(op.getResult(0), mapping.lookup(op.getOperand(0))); continue; } rewriter.clone(op, mapping); } }; rewriter.setInsertionPointToEnd(newIfOp.thenBlock()); rematerialize(ifOp.thenBlock(), thenCvts); if (hasElse) { rewriter.setInsertionPointToEnd(newIfOp.elseBlock()); rematerialize(ifOp.elseBlock(), elseCvts); } rewriter.setInsertionPointAfter(newIfOp); SmallVector newRetValues = newIfOp.getResults(); for (size_t i = 0; i < numOps; i++) { if (newIfOp.getResult(i).getType() != ifOp.getResult(i).getType()) { newRetValues[i] = rewriter.create( 
newIfOp.getLoc(), ifOp.getResult(i).getType(), newIfOp.getResult(i)); } } rewriter.replaceOp(op, newRetValues); return mlir::success(); } }; // class FoldConvertAndReduce : public mlir::RewritePattern { public: explicit FoldConvertAndReduce(mlir::MLIRContext *context) : mlir::RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 1, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *cvtOp, mlir::PatternRewriter &rewriter) const override { auto cvt = dyn_cast(*cvtOp); auto srcEncoding = cvt.getOperand().getType().cast().getEncoding(); auto dstEncoding = cvt.getResult().getType().cast().getEncoding(); // XXX: why is this needed? if (srcEncoding.isa()) return failure(); SetVector cvtSlices; auto filter = [&](Operation *op) { return op->getBlock() == cvt->getBlock() && !(isa(op) && !op->getResult(0).getType().isa()) && !isa(op) && !isa(op); }; mlir::getForwardSlice(cvt.getResult(), &cvtSlices, filter); if (cvtSlices.empty()) return failure(); llvm::MapVector toConvert; for (Operation *op : cvtSlices) { // don't rematerialize anything expensive if (expensiveToRemat(op, srcEncoding)) return failure(); // don't rematerialize non-element-wise if (!op->hasTrait()) return failure(); // don't rematerialize if it adds an extra conversion that can't // be removed for (Value arg : op->getOperands()) { Operation *argOp = arg.getDefiningOp(); SetVector processed; SetVector layout; llvm::MapVector toConvert; if (argOp && (argOp != cvt) && cvtSlices.count(argOp) == 0 && failed(simulateBackwardRematerialization(argOp, processed, layout, toConvert, srcEncoding))) { return failure(); } } } pushConversionForward(cvt, cvtSlices, rewriter); return success(); } }; // Layout conversions are expensive. They require going through // shared memory, which is orders of magnitude slower than // other non-i/o operations in the dialect. // It therefore makes sense to remove them whenever possible, // even if it means rematerializing all values whose definitions // are reachable from it without passing through any memory operation. 
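// Hedged illustration of the intent (values and encodings below are made up):
//   %x = tt.make_range {start = 0 : i32, end = 128 : i32}
//          : tensor<128xi32, #blocked>
//   %y = arith.addi %x, %x : tensor<128xi32, #blocked>
//   %z = triton_gpu.convert_layout %y
//          : tensor<128xi32, #blocked> -> tensor<128xi32, #other>
// Because make_range and addi are cheap to recompute, the pattern below
// re-creates them directly with the #other encoding and erases the
// conversion, rather than paying for a round trip through shared memory.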
class RematerializeBackward : public mlir::RewritePattern { public: explicit RematerializeBackward(mlir::MLIRContext *context) : mlir::RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 2, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *cvt, mlir::PatternRewriter &rewriter) const override { if (!llvm::isa(cvt)) return mlir::failure(); // we don't touch block arguments Operation *op = cvt->getOperand(0).getDefiningOp(); if (!op) return mlir::failure(); // we don't want to rematerialize any conversion to/from shared if (isSharedEncoding(cvt->getResults()[0]) || isSharedEncoding(cvt->getOperand(0))) return mlir::failure(); // we don't handle conversions to DotOperandEncodingAttr // this is a heuristics to accommodate fused attention auto targetType = cvt->getResultTypes()[0].cast(); if (targetType.getEncoding().isa()) return mlir::failure(); // DFS SetVector processed; SetVector layout; llvm::MapVector toConvert; std::vector> queue; if (failed(simulateBackwardRematerialization( cvt, processed, layout, toConvert, targetType.getEncoding()))) return mlir::failure(); SmallVector sortedValues; SetVector tmp; for (auto &item : toConvert) { Value v = item.first; if (v.getDefiningOp()) tmp.insert(v.getDefiningOp()); else sortedValues.push_back(v); } tmp = mlir::multiRootTopologicalSort(tmp); for (Operation *op : tmp) sortedValues.push_back(op->getResult(0)); BlockAndValueMapping mapping; for (Value currOperand : sortedValues) { // unpack information Attribute targetLayout = toConvert.lookup(currOperand); // rematerialize the operand if necessary Operation *currOperation = currOperand.getDefiningOp(); if (processed.contains(currOperation)) { Operation *newOperation = cloneWithInferType(rewriter, currOperation, mapping); newOperation->moveAfter(currOperation); currOperation = newOperation; currOperand = currOperation->getResult(0); } // compute target type for the layout cast auto currType = currOperand.getType().cast(); auto newType = RankedTensorType::get( currType.getShape(), currType.getElementType(), targetLayout); auto newOperand = rewriter.create( currOperand.getLoc(), newType, currOperand); if (currOperation) newOperand->moveAfter(currOperation); else { Block *block = currOperand.cast().getOwner(); newOperand->moveAfter(block, block->begin()); } mapping.map(currOperand, newOperand); } rewriter.replaceOp(cvt, mapping.lookup(cvt->getOperand(0))); return mlir::success(); } }; // ----------------------------------------------------------------------------- // // ----------------------------------------------------------------------------- class MoveConvertOutOfLoop : public mlir::RewritePattern { public: explicit MoveConvertOutOfLoop(mlir::MLIRContext *context) : mlir::RewritePattern(scf::ForOp::getOperationName(), 1, context) {} SmallVector rematerializeForLoop(mlir::PatternRewriter &rewriter, scf::ForOp &forOp, size_t i, RankedTensorType newType, triton::gpu::ConvertLayoutOp origConversion) const { // Rewrite init argument Type origType = forOp.getInitArgs()[i].getType(); SmallVector newInitArgs = forOp.getInitArgs(); newInitArgs[i] = rewriter.create( newInitArgs[i].getLoc(), newType, newInitArgs[i]); // Clone for loop auto newForOp = rewriter.create( forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(), newInitArgs); newForOp->moveBefore(forOp); rewriter.setInsertionPointToStart(newForOp.getBody()); BlockAndValueMapping mapping; for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) mapping.map(arg.value(), 
newForOp.getRegionIterArgs()[arg.index()]); mapping.map(origConversion.getResult(), newForOp.getRegionIterArgs()[i]); // the iter arg of interest may have other uses than the conversion // we're hoisting out of the loop. If that's the case we will // need to add extra conversions for all uses... which is only useful // if these extra conversions can be removed by another pattern auto oldArg = forOp.getRegionIterArgs()[i]; auto newArg = newForOp.getRegionIterArgs()[i]; auto newArgFallback = rewriter.create( newForOp.getLoc(), origType, newArg); mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); for (Operation &op : forOp.getBody()->without_terminator()) { if (&op == (Operation *)(&origConversion)) continue; Operation *newOp = rewriter.clone(op, mapping); if (find(oldArg.getUsers(), &op) != oldArg.getUsers().end()) newOp->replaceUsesOfWith(newArg, newArgFallback); } // create yield, inserting conversions if necessary auto yieldOp = forOp.getBody()->getTerminator(); SmallVector newYieldArgs; for (Value arg : yieldOp->getOperands()) newYieldArgs.push_back(mapping.lookup(arg)); newYieldArgs[i] = rewriter.create( yieldOp->getLoc(), newType, newYieldArgs[i]); rewriter.create(forOp.getLoc(), newYieldArgs); // replace SmallVector newResults = newForOp->getResults(); newResults[i] = rewriter.create( rewriter.getUnknownLoc(), origType, newForOp->getResult(i)); newResults[i].getDefiningOp()->moveAfter(newForOp); return newResults; } mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto forOp = cast(op); auto iterArgs = forOp.getRegionIterArgs(); for (const auto &iterArg : llvm::enumerate(iterArgs)) { // if (iterArg.index() != 1) // continue; // skip non-tensor types if (!iterArg.value().getType().isa()) continue; // we only move `iterArg` out of the loop if // - there is only a single conversion use // - moving this conversion out of the loop will not generate // any extra non-removable conversion auto users = iterArg.value().getUsers(); // check first condition SetVector cvtTargetTypes; for (auto user : users) { if (isa(user)) { auto newType = user->getResults()[0].getType().cast(); auto oldType = user->getOperand(0).getType().cast(); if (oldType.getEncoding().isa() && newType.getEncoding() .isa()) { continue; } if (newType.getEncoding().isa()) { if (newType.getEncoding() .cast() .getVec() == 1) continue; } cvtTargetTypes.insert(newType); } } if (cvtTargetTypes.size() != 1) continue; // TODO: check second condition for (auto user : users) { if (isa(user)) continue; } // check for (auto op : iterArg.value().getUsers()) { auto cvt = dyn_cast(op); if (!cvt) continue; auto targetType = op->getResultTypes()[0].cast(); auto newFor = rematerializeForLoop(rewriter, forOp, iterArg.index(), targetType, cvt); rewriter.replaceOp(forOp, newFor); return success(); } } return failure(); } }; // ----------------------------------------------------------------------------- // // ----------------------------------------------------------------------------- class RematerializeForward : public mlir::RewritePattern { public: explicit RematerializeForward(mlir::MLIRContext *context) : mlir::RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 2, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *_cvtOp, mlir::PatternRewriter &rewriter) const override { auto cvt = cast(_cvtOp); auto forOp = dyn_cast(cvt->getParentOp()); if (!forOp) return mlir::failure(); auto isInLoop = [&](Operation *op) { return op->getParentOp() == forOp; 
}; SetVector cvtSlices; auto filter = [&](Operation *op) { return isInLoop(op) && !isa(op) && !isa(op) && !isa(op) && !isa(op); }; mlir::getForwardSlice(cvt.getResult(), &cvtSlices, filter); if (cvtSlices.empty()) return failure(); for (Operation *op : cvtSlices) { if (!op->hasTrait() && !op->hasTrait()) return failure(); for (Value arg : op->getOperands()) { Operation *argOp = arg.getDefiningOp(); if (argOp && (argOp != cvt) && !isa( argOp)) { return failure(); } } } // Otherwise, we push the conversion forward // since we'll be able to move it out of // the loop once it reaches the yield op pushConversionForward(cvt, cvtSlices, rewriter); return success(); } }; // ----------------------------------------------------------------------------- // // ----------------------------------------------------------------------------- namespace { int computeCapabilityToMMAVersion(int computeCapability) { if (computeCapability < 70) { return 0; } else if (computeCapability < 80) { return 1; } else if (computeCapability < 90) { return 2; } else { assert(false && "computeCapability > 90 not supported"); return 3; } } SmallVector mmaVersionToShapePerWarp(int version) { if (version == 1) return {16, 16}; else if (version == 2) return {16, 8}; else { assert(false && "version not supported"); return {0, 0}; } } SmallVector warpsPerTileV1(const ArrayRef shape, int numWarps) { // Set a default value and ensure product of wpt equals numWarps return {static_cast(numWarps), 1}; } SmallVector warpsPerTileV2(triton::DotOp dotOp, const ArrayRef shape, int numWarps) { SetVector slices; mlir::getForwardSlice(dotOp.getResult(), &slices); if (llvm::find_if(slices, [](Operation *op) { return isa(op); }) != slices.end()) return {(unsigned)numWarps, 1}; SmallVector ret = {1, 1}; SmallVector shapePerWarp = {16, 8}; bool changed = false; // TODO (@daadaada): double-check. // original logic in // https://github.com/openai/triton/blob/master/lib/codegen/analysis/layout.cc#L252 // seems buggy for shape = [32, 16] ? 
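  // Worked example (illustrative numbers): shape = [128, 128], numWarps = 4,
  // shapePerWarp = {16, 8}:
  //   ret = {1, 1}: 128/16/1 >= 128/(8*2)/1 -> grow ret[0]      -> {2, 1}
  //   ret = {2, 1}: 128/16/2 <  128/(8*2)/1 -> grow ret[1]      -> {2, 2}
  //   ret = {2, 2}: 2 * 2 >= numWarps       -> stop; warpsPerCTA = {2, 2}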
do { changed = false; if (ret[0] * ret[1] >= numWarps) break; if (shape[0] / shapePerWarp[0] / ret[0] >= shape[1] / (shapePerWarp[1] * 2) / ret[1]) { if (ret[0] < shape[0] / shapePerWarp[0]) { ret[0] *= 2; } else ret[1] *= 2; } else { ret[1] *= 2; } } while (true); return ret; } } // namespace class OptimizeBlockedToShared : public mlir::RewritePattern { public: explicit OptimizeBlockedToShared(mlir::MLIRContext *context) : RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 1, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto cvt = cast(op); auto srcType = cvt.getOperand().getType().cast(); auto dstType = cvt.getResult().getType().cast(); auto srcBlockedLayout = srcType.getEncoding().dyn_cast(); auto dstSharedLayout = dstType.getEncoding().dyn_cast(); if (!srcBlockedLayout || !dstSharedLayout) return failure(); if (srcBlockedLayout.getOrder() == dstSharedLayout.getOrder()) return failure(); // For now only works if single use is transpose // TODO: rematerialize #shared uses auto users = op->getUsers(); if (std::distance(users.begin(), users.end()) != 1 || !isa(*users.begin())) return failure(); auto tmpShared = triton::gpu::SharedEncodingAttr::get( op->getContext(), dstSharedLayout.getVec(), dstSharedLayout.getPerPhase(), dstSharedLayout.getMaxPhase(), srcBlockedLayout.getOrder()); auto tmpType = RankedTensorType::get(srcType.getShape(), srcType.getElementType(), tmpShared); auto tmpCvt = rewriter.create( op->getLoc(), tmpType, cvt.getOperand()); auto newDstType = RankedTensorType::get( users.begin()->getResultTypes()[0].cast().getShape(), srcType.getElementType(), dstSharedLayout); auto newTrans = rewriter.create(op->getLoc(), newDstType, tmpCvt.getResult()); rewriter.replaceOp(*users.begin(), newTrans.getResult()); return success(); } }; class OptimizeConvertToDotOperand : public mlir::RewritePattern { public: explicit OptimizeConvertToDotOperand(mlir::MLIRContext *context) : RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 1, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto cvt = cast(op); auto srcType = cvt.getOperand().getType().cast(); auto dstType = cvt.getResult().getType().cast(); // order ArrayRef order; if (auto srcBlockedLayout = srcType.getEncoding().dyn_cast()) order = srcBlockedLayout.getOrder(); else if (auto srcSharedLayout = srcType.getEncoding() .dyn_cast()) order = srcSharedLayout.getOrder(); else return failure(); // dot operand output auto dstDotOperandLayout = dstType.getEncoding().dyn_cast(); if (!dstDotOperandLayout) return failure(); if (!dstDotOperandLayout.getIsMMAv1Row()) return failure(); bool isMMAv1Row = dstDotOperandLayout.getIsMMAv1Row().cast().getValue(); if ((order[0] == 1 && isMMAv1Row) || (order[0] == 0 && !isMMAv1Row)) return failure(); auto newIsRow = BoolAttr::get(op->getContext(), !isMMAv1Row); auto newDstEncoding = triton::gpu::DotOperandEncodingAttr::get( op->getContext(), dstDotOperandLayout.getOpIdx(), dstDotOperandLayout.getParent(), newIsRow); auto newDstType = RankedTensorType::get( dstType.getShape(), dstType.getElementType(), newDstEncoding); auto newCvt = rewriter.create( op->getLoc(), newDstType, cvt.getOperand()); rewriter.replaceOp(op, newCvt.getResult()); return success(); } }; class BlockedToMMA : public mlir::RewritePattern { int computeCapability; mutable int mmaV1Counter{}; // used to generate ID for MMAv1 encoding public: BlockedToMMA(mlir::MLIRContext 
*context, int computeCapability) : mlir::RewritePattern(triton::DotOp::getOperationName(), 2, context), computeCapability(computeCapability) {} static SmallVector getWarpsPerTile(triton::DotOp dotOp, const ArrayRef shape, int version, int numWarps) { switch (version) { case 1: return warpsPerTileV1(shape, numWarps); case 2: return warpsPerTileV2(dotOp, shape, numWarps); default: assert(false && "not supported version"); return {0, 0}; } } mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto dotOp = cast(op); // TODO: Check data-types and SM compatibility auto oldRetType = dotOp.getResult().getType().cast(); if (!oldRetType.getEncoding() || oldRetType.getEncoding().isa()) return failure(); auto AType = dotOp.getOperand(0).getType().cast(); auto BType = dotOp.getOperand(1).getType().cast(); // for FMA, should retain the blocked layout. int versionMajor = computeCapabilityToMMAVersion(computeCapability); if (!supportMMA(dotOp, versionMajor)) return failure(); auto AOrder = AType.getEncoding() .cast() .getParent() .cast() .getOrder(); auto BOrder = BType.getEncoding() .cast() .getParent() .cast() .getOrder(); // get MMA encoding for the given number of warps auto retShape = oldRetType.getShape(); auto mod = op->getParentOfType(); int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); auto warpsPerTile = getWarpsPerTile(dotOp, retShape, versionMajor, numWarps); triton::gpu::MmaEncodingAttr mmaEnc; if (versionMajor == 1) { mmaEnc = triton::gpu::MmaEncodingAttr::get( oldRetType.getContext(), versionMajor, numWarps, mmaV1Counter++); } else if (versionMajor == 2) { mmaEnc = triton::gpu::MmaEncodingAttr::get( oldRetType.getContext(), versionMajor, 0 /*versionMinor*/, warpsPerTile); } else { assert(false && "Mma layout only support versionMajor of 1 or 2"); } auto newRetType = RankedTensorType::get(retShape, oldRetType.getElementType(), mmaEnc); // convert accumulator auto oldAcc = dotOp.getOperand(2); auto newAcc = rewriter.create( oldAcc.getLoc(), newRetType, oldAcc); Value a = dotOp.a(); Value b = dotOp.b(); auto oldAType = a.getType().cast(); auto oldBType = b.getType().cast(); auto oldAOrder = oldAType.getEncoding() .cast() .getParent() .cast() .getOrder(); auto oldBOrder = oldBType.getEncoding() .cast() .getParent() .cast() .getOrder(); Attribute isMMAv1RowA; Attribute isMMAv1RowB; if (versionMajor == 1) { isMMAv1RowA = BoolAttr::get(getContext(), oldAOrder[0] == 1); isMMAv1RowB = BoolAttr::get(getContext(), oldBOrder[0] == 1); } auto newAType = RankedTensorType::get( oldAType.getShape(), oldAType.getElementType(), triton::gpu::DotOperandEncodingAttr::get( oldAType.getContext(), 0, newRetType.getEncoding(), isMMAv1RowA)); auto newBType = RankedTensorType::get( oldBType.getShape(), oldBType.getElementType(), triton::gpu::DotOperandEncodingAttr::get( oldBType.getContext(), 1, newRetType.getEncoding(), isMMAv1RowB)); a = rewriter.create(a.getLoc(), newAType, a); b = rewriter.create(b.getLoc(), newBType, b); auto newDot = rewriter.create(dotOp.getLoc(), newRetType, a, b, newAcc, dotOp.allowTF32()); rewriter.replaceOpWithNewOp( op, oldRetType, newDot.getResult()); return success(); } }; // Convert + trans + convert // x = convert_layout distributed -> #shared_x // y = trans x -> #shared_y // z = convert_layout y -> #dot_operand class ConvertTransConvert : public mlir::RewritePattern { public: ConvertTransConvert(mlir::MLIRContext *context) : mlir::RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 1, context) {} 
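  // Hedged note on the rewrite below (my reading of the pattern, not an
  // authoritative statement): the #shared encoding chosen for X is derived
  // from Z's dot-operand encoding, so the transposed tensor is produced
  // directly in a layout the dot-operand conversion can consume, instead of
  // first materializing X in an unrelated #shared_x layout.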
LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto dstOp = cast(op); auto tmpOp = dyn_cast_or_null(dstOp.src().getDefiningOp()); if (!tmpOp) return mlir::failure(); auto srcOp = dyn_cast_or_null( tmpOp.src().getDefiningOp()); if (!srcOp) return mlir::failure(); auto arg = srcOp.src(); auto X = tmpOp.src(); auto Y = dstOp.src(); // types auto argType = arg.getType().cast(); auto XType = X.getType().cast(); auto YType = Y.getType().cast(); auto ZType = dstOp.getResult().getType().cast(); // encodings auto argEncoding = argType.getEncoding(); auto XEncoding = XType.getEncoding().cast(); auto YEncoding = YType.getEncoding().cast(); auto ZEncoding = ZType.getEncoding().dyn_cast(); if (!ZEncoding) return mlir::failure(); // new X encoding auto newXOrder = triton::gpu::getOrder(argEncoding); auto newXEncoding = triton::gpu::SharedEncodingAttr::get( getContext(), ZEncoding, XType.getShape(), newXOrder, XType.getElementType()); auto newXType = RankedTensorType::get(XType.getShape(), XType.getElementType(), newXEncoding); if (XEncoding == newXEncoding) return mlir::failure(); auto newX = rewriter.create(srcOp.getLoc(), newXType, arg); auto newY = rewriter.create(tmpOp.getLoc(), newX); rewriter.replaceOpWithNewOp(dstOp, ZType, newY); return mlir::success(); } }; // class ConvertDotConvert : public mlir::RewritePattern { public: ConvertDotConvert(mlir::MLIRContext *context) : mlir::RewritePattern(triton::gpu::ConvertLayoutOp::getOperationName(), 1, context) {} LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto dstOp = cast(op); auto dotOp = dyn_cast_or_null(dstOp.src().getDefiningOp()); if (!dotOp) return mlir::failure(); if (std::distance(dstOp->user_begin(), dstOp->user_end()) != 1 || std::distance(dotOp->user_begin(), dotOp->user_end()) != 1) return mlir::failure(); auto cvtOp = dyn_cast_or_null( dotOp.getOperand(2).getDefiningOp()); if (!cvtOp) return mlir::failure(); auto loadOp = dyn_cast_or_null(cvtOp.src().getDefiningOp()); if (!loadOp) return mlir::failure(); auto dstTy = dstOp.getResult().getType().cast(); auto srcTy = cvtOp.getOperand().getType().cast(); if (dstTy != srcTy) return mlir::failure(); // TODO: int tensor cores auto _0f = rewriter.create( op->getLoc(), APFloat(0.0f), dstTy.getElementType().cast()); auto _0 = rewriter.create( op->getLoc(), dotOp.getResult().getType(), _0f); auto newDot = rewriter.create( op->getLoc(), dotOp.getResult().getType(), dotOp.getOperand(0), dotOp.getOperand(1), _0, dotOp.allowTF32()); auto newCvt = rewriter.create( op->getLoc(), dstTy, newDot.getResult()); auto newAdd = rewriter.replaceOpWithNewOp( op, newCvt, cvtOp.getOperand()); return mlir::success(); } }; } // namespace #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" class TritonGPUCombineOpsPass : public TritonGPUCombineOpsBase { public: TritonGPUCombineOpsPass() = default; TritonGPUCombineOpsPass(int computeCapability) { this->computeCapability = computeCapability; } void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp m = getOperation(); mlir::RewritePatternSet patterns(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context); patterns.add(context, computeCapability); patterns.add(context); patterns.add(context); if 
(applyPatternsAndFoldGreedily(m, std::move(patterns)).failed()) { signalPassFailure(); } if (fixupLoops(m).failed()) { signalPassFailure(); } } }; std::unique_ptr mlir::createTritonGPUCombineOpsPass(int computeCapability) { return std::make_unique(computeCapability); } triton-2.0.0/lib/Dialect/TritonGPU/Transforms/Combine.td000066400000000000000000000002451440023377100230430ustar00rootroot00000000000000#ifndef TRITONGPU_PATTERNS #define TRITONGPU_PATTERNS include "triton/Dialect/TritonGPU/IR/TritonGPUOps.td" include "triton/Dialect/Triton/IR/TritonOps.td" #endif triton-2.0.0/lib/Dialect/TritonGPU/Transforms/DecomposeConversions.cpp000066400000000000000000000052161440023377100260140ustar00rootroot00000000000000#include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Verifier.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" #include "mlir/Transforms/RegionUtils.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h" #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" using namespace mlir; class TritonGPUDecomposeConversionsPass : public TritonGPUDecomposeConversionsBase< TritonGPUDecomposeConversionsPass> { public: TritonGPUDecomposeConversionsPass() = default; void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp mod = getOperation(); mod.walk([&](triton::gpu::ConvertLayoutOp cvtOp) -> void { OpBuilder builder(cvtOp); auto srcType = cvtOp.getOperand().getType().cast(); auto dstType = cvtOp.getType().cast(); auto srcEncoding = srcType.getEncoding(); if (srcEncoding.isa()) return; auto dstDotOp = dstType.getEncoding().dyn_cast(); if (!dstDotOp) return; if (auto srcMmaEncoding = srcEncoding.dyn_cast()) { if (srcMmaEncoding.getVersionMajor() == 1 || (srcMmaEncoding.getWarpsPerCTA()[1] == 1 && dstDotOp.getParent() == srcMmaEncoding)) return; } auto tmpType = RankedTensorType::get( dstType.getShape(), dstType.getElementType(), triton::gpu::SharedEncodingAttr::get( mod.getContext(), dstDotOp, srcType.getShape(), triton::gpu::getOrder(srcEncoding), srcType.getElementType())); auto tmp = builder.create( cvtOp.getLoc(), tmpType, cvtOp.getOperand()); auto newConvert = builder.create( cvtOp.getLoc(), dstType, tmp); cvtOp.replaceAllUsesWith(newConvert.getResult()); cvtOp.erase(); }); } }; std::unique_ptr mlir::createTritonGPUDecomposeConversionsPass() { return std::make_unique(); } triton-2.0.0/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp000066400000000000000000000651341440023377100234170ustar00rootroot00000000000000#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/TypeUtilities.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" //===----------------------------------------------------------------------===// // // This file implements loop software pipelining // The implementation here is inspired by the pipeline pass in Triton (-v2.0) // and SCF's LoopPipelining. 
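// Sketch of the scheme implemented below (assuming numStages = N): every
// pipelined tt.load gets an N-deep shared-memory ring buffer; the prologue
// issues insert_slice_async copies for iterations 0 .. N-2, and the rewritten
// loop body consumes the slice for iteration i while prefetching the slice
// for iteration i + N - 1, synchronizing through async_wait.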
// //===----------------------------------------------------------------------===// using namespace mlir; namespace ttg = triton::gpu; #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" static Type getI1SameShape(Value v) { Type vType = v.getType(); auto i1Type = IntegerType::get(vType.getContext(), 1); auto tensorType = vType.cast(); return RankedTensorType::get(tensorType.getShape(), i1Type, tensorType.getEncoding()); } #define int_attr(num) builder.getI64IntegerAttr(num) namespace { class LoopPipeliner { /// Cache forOp we are working on scf::ForOp forOp; /// Cache YieldOp for this forOp scf::YieldOp yieldOp; /// Loads to be pipelined SetVector loads; /// The value that each load will be mapped to (after layout conversion) DenseMap loadsMapping; /// load => buffer DenseMap loadsBuffer; /// load => buffer type (with shared layout after swizzling) DenseMap loadsBufferType; /// load => buffer at stage N DenseMap> loadStageBuffer; /// load => after extract DenseMap loadsExtract; /// Value pipelineIterIdx; /// Value loopIterIdx; /// Comments on numStages: /// [0, numStages-1) are in the prologue /// numStages-1 is appended after the loop body int numStages; /// value (in loop) => value at stage N DenseMap> valueMapping; /// Block arguments that loads depend on SetVector depArgs; /// Operations (inside the loop body) that loads depend on SetVector depOps; /// collect values that v depends on and are defined inside the loop void collectDeps(Value v, int stages, SetVector &deps); void setValueMapping(Value origin, Value newValue, int stage); Value lookupOrDefault(Value origin, int stage); /// Returns a empty buffer of size ttg::AllocTensorOp allocateEmptyBuffer(Operation *op, OpBuilder &builder); public: LoopPipeliner(scf::ForOp forOp, int numStages) : forOp(forOp), numStages(numStages) { // cache yieldOp yieldOp = cast(forOp.getBody()->getTerminator()); } /// Collect loads to pipeline. Return success if we can pipeline this loop LogicalResult initialize(); /// Emit pipelined loads (before loop body) void emitPrologue(); /// emit pipelined loads (after loop body) void emitEpilogue(); /// create the new ForOp (add new args & insert prefetched ops) scf::ForOp createNewForOp(); friend struct PipelinePass; }; // helpers void LoopPipeliner::setValueMapping(Value origin, Value newValue, int stage) { if (valueMapping.find(origin) == valueMapping.end()) valueMapping[origin] = SmallVector(numStages); valueMapping[origin][stage] = newValue; } Value LoopPipeliner::lookupOrDefault(Value origin, int stage) { if (valueMapping.find(origin) == valueMapping.end()) return origin; return valueMapping[origin][stage]; } void LoopPipeliner::collectDeps(Value v, int stages, SetVector &deps) { // Loop-invariant value, skip if (v.getParentRegion() != &forOp.getLoopBody()) return; // Since we only need to peel the loop numStages-1 times, don't worry about // depends that are too far away if (stages < 0) return; if (auto arg = v.dyn_cast()) { if (arg.getArgNumber() > 0) { // Skip the first arg (loop induction variable) // Otherwise the op idx is arg.getArgNumber()-1 deps.insert(v); collectDeps(yieldOp->getOperand(arg.getArgNumber() - 1), stages - 1, deps); } } else { // value // v might be in deps, but we still need to visit v. 
// This is because v might depend on value in previous iterations deps.insert(v); for (Value op : v.getDefiningOp()->getOperands()) collectDeps(op, stages, deps); } } ttg::AllocTensorOp LoopPipeliner::allocateEmptyBuffer(Operation *op, OpBuilder &builder) { // Allocate a buffer for each pipelined tensor // shape: e.g. (numStages==4), <32x64xbf16> -> <4x32x64xbf16> Value convertLayout = loadsMapping[op->getResult(0)]; if (auto tensorType = convertLayout.getType().dyn_cast()) { return builder.create( convertLayout.getLoc(), loadsBufferType[op->getResult(0)]); } llvm_unreachable("Async copy's return should be of RankedTensorType"); } /// A load instruction can be pipelined if: /// - the load doesn't depend on any other loads (after loop peeling) /// - (?) this load is not a loop-invariant value (we should run LICM before /// this pass?) LogicalResult LoopPipeliner::initialize() { Block *loop = forOp.getBody(); AxisInfoAnalysis axisInfoAnalysis(forOp.getContext()); axisInfoAnalysis.run(forOp->getParentOfType()); // can we use forOp.walk(...) here? SmallVector allLoads; for (Operation &op : *loop) if (auto loadOp = dyn_cast(&op)) { auto ptr = loadOp.ptr(); unsigned vec = axisInfoAnalysis.getPtrContiguity(ptr); auto ty = getElementTypeOrSelf(ptr.getType()) .cast() .getPointeeType(); unsigned width = vec * ty.getIntOrFloatBitWidth(); if (width >= 32) allLoads.push_back(loadOp); } // Early stop: no need to continue if there is no load in the loop. if (allLoads.empty()) return failure(); // load => values that it depends on DenseMap> loadDeps; for (triton::LoadOp loadOp : allLoads) { SetVector deps; for (Value op : loadOp->getOperands()) collectDeps(op, numStages - 1, deps); loadDeps[loadOp] = deps; } // Don't pipeline loads that depend on other loads // (Because if a load depends on another load, this load needs to wait on the // other load in the prologue, which is against the point of the pipeline // pass) for (triton::LoadOp loadOp : allLoads) { bool isCandidate = true; for (triton::LoadOp other : allLoads) { if (loadDeps[loadOp].contains(other)) { isCandidate = false; break; } } // We only pipeline loads that have one covert_layout (to dot_op) use // TODO: lift this constraint in the future if (isCandidate && loadOp.getResult().hasOneUse()) { isCandidate = false; Operation *use = *loadOp.getResult().getUsers().begin(); if (auto convertLayout = llvm::dyn_cast(use)) { if (auto tensorType = convertLayout.getResult() .getType() .dyn_cast()) { if (auto dotOpEnc = tensorType.getEncoding() .dyn_cast()) { isCandidate = true; loadsMapping[loadOp] = convertLayout; auto ty = loadOp.getType().cast(); SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); bufferShape.insert(bufferShape.begin(), numStages); auto sharedEnc = ttg::SharedEncodingAttr::get( ty.getContext(), dotOpEnc, ty.getShape(), triton::gpu::getOrder(ty.getEncoding()), ty.getElementType()); loadsBufferType[loadOp] = RankedTensorType::get( bufferShape, ty.getElementType(), sharedEnc); } } } } else isCandidate = false; if (isCandidate) loads.insert(loadOp); } // We have some loads to pipeline if (!loads.empty()) { // Update depArgs & depOps for (Value loadOp : loads) { for (Value dep : loadDeps[loadOp]) { // TODO: we should record the stage that the value is depended on if (auto arg = dep.dyn_cast()) depArgs.insert(arg); else depOps.insert(dep.getDefiningOp()); } } return success(); } return failure(); } void LoopPipeliner::emitPrologue() { // llvm::errs() << "loads to pipeline...:\n"; // for (Value load : loads) // llvm::errs() 
<< load << "\n"; OpBuilder builder(forOp); for (BlockArgument &arg : forOp.getRegionIterArgs()) { OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); setValueMapping(arg, operand.get(), 0); } // prologue from [0, numStage-1) Value iv = forOp.getLowerBound(); pipelineIterIdx = builder.create(iv.getLoc(), 0, 32); for (int stage = 0; stage < numStages - 1; ++stage) { // Special handling for induction variable as the increment is implicit if (stage != 0) iv = builder.create(iv.getLoc(), iv, forOp.getStep()); setValueMapping(forOp.getInductionVar(), iv, stage); // Special handling for loop condition as there is no condition in ForOp Value loopCond = builder.create( iv.getLoc(), arith::CmpIPredicate::slt, iv, forOp.getUpperBound()); // Rematerialize peeled values SmallVector orderedDeps; for (Operation &op : forOp.getLoopBody().front()) { if (depOps.contains(&op)) orderedDeps.push_back(&op); else if (loads.contains(op.getResult(0))) orderedDeps.push_back(&op); } assert(depOps.size() + loads.size() == orderedDeps.size() && "depOps contains invalid values"); for (Operation *op : orderedDeps) { Operation *newOp = nullptr; if (loads.contains(op->getResult(0))) { // Allocate empty buffer if (stage == 0) { loadsBuffer[op->getResult(0)] = allocateEmptyBuffer(op, builder); loadStageBuffer[op->getResult(0)] = {loadsBuffer[op->getResult(0)]}; } // load => copy async if (auto loadOp = llvm::dyn_cast(op)) { Value mask = lookupOrDefault(loadOp.mask(), stage); Value newMask; if (mask) { Value splatCond = builder.create( mask.getLoc(), mask.getType(), loopCond); newMask = builder.create(mask.getLoc(), mask, splatCond); } else { newMask = builder.create( loopCond.getLoc(), getI1SameShape(loadOp), loopCond); } // TODO: check if the hardware supports async copy newOp = builder.create( op->getLoc(), loadsBuffer[loadOp].getType(), lookupOrDefault(loadOp.ptr(), stage), loadStageBuffer[loadOp][stage], pipelineIterIdx, newMask, lookupOrDefault(loadOp.other(), stage), loadOp.cache(), loadOp.evict(), loadOp.isVolatile(), /*axis*/ 0); builder.create(op->getLoc()); loadStageBuffer[loadOp].push_back(newOp->getResult(0)); } else llvm_unreachable("This should be LoadOp"); } else { newOp = builder.clone(*op); // Update loop-carried uses for (unsigned opIdx = 0; opIdx < op->getNumOperands(); ++opIdx) { auto it = valueMapping.find(op->getOperand(opIdx)); if (it != valueMapping.end()) { Value v = it->second[stage]; assert(v); newOp->setOperand(opIdx, v); } // else, op at opIdx is a loop-invariant value } } // Update mapping of results // if (stage == numStages - 2) // continue; for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) { Value originalResult = op->getResult(dstIdx); // copy_async will update the value of its only use // TODO: load should not be used in the preheader? 
if (loads.contains(originalResult)) { break; // originalResult = loadsMapping[originalResult]; } setValueMapping(originalResult, newOp->getResult(dstIdx), stage); // update mapping for loop-carried values (args) for (OpOperand &operand : yieldOp->getOpOperands()) { if (operand.get() == op->getResult(dstIdx)) setValueMapping( forOp.getRegionIterArgs()[operand.getOperandNumber()], newOp->getResult(dstIdx), stage + 1); } } } // for (Operation *op : orderedDeps) pipelineIterIdx = builder.create( iv.getLoc(), pipelineIterIdx, builder.create(iv.getLoc(), 1, 32)); } // for (int stage = 0; stage < numStages - 1; ++stage) // async.wait & extract_slice builder.create(loads[0].getLoc(), loads.size() * (numStages - 2)); loopIterIdx = builder.create(iv.getLoc(), 0, 32); for (Value loadOp : loads) { auto sliceType = loadsMapping[loadOp].getType().cast(); sliceType = RankedTensorType::get(sliceType.getShape(), sliceType.getElementType(), loadsBufferType[loadOp].getEncoding()); Value extractSlice = builder.create( loadOp.getLoc(), sliceType, loadStageBuffer[loadOp][numStages - 1], SmallVector{int_attr(0), int_attr(0), int_attr(0)}, SmallVector{int_attr(1), int_attr(sliceType.getShape()[0]), int_attr(sliceType.getShape()[1])}, SmallVector{int_attr(1), int_attr(1), int_attr(1)}); loadsExtract[loadOp] = extractSlice; } // Bump up loopIterIdx, this is used for getting the correct slice for the // *next* iteration loopIterIdx = builder.create( loopIterIdx.getLoc(), loopIterIdx, builder.create(loopIterIdx.getLoc(), 1, 32)); } void LoopPipeliner::emitEpilogue() { // If there's any outstanding async copies, we need to wait for them. OpBuilder builder(forOp); OpBuilder::InsertionGuard g(builder); builder.setInsertionPointAfter(forOp); builder.create(forOp.getLoc(), 0); } scf::ForOp LoopPipeliner::createNewForOp() { OpBuilder builder(forOp); // Order of new args: // (original args) // (insertSliceAsync buffer at stage numStages - 1) for each load // (extracted tensor) for each load // (depArgs at stage numStages - 2) // (iv at stage numStages - 2) // (pipeline iteration index) // (loop iteration index) SmallVector newLoopArgs; // We need this to update operands for yield // original block arg => new arg's idx DenseMap depArgsIdx; for (auto v : forOp.getIterOperands()) newLoopArgs.push_back(v); size_t bufferIdx = newLoopArgs.size(); for (Value loadOp : loads) newLoopArgs.push_back(loadStageBuffer[loadOp].back()); size_t loadIdx = newLoopArgs.size(); for (Value loadOp : loads) newLoopArgs.push_back(loadsExtract[loadOp]); size_t depArgsBeginIdx = newLoopArgs.size(); for (BlockArgument depArg : depArgs) { depArgsIdx[depArg] = newLoopArgs.size(); newLoopArgs.push_back(valueMapping[depArg][numStages - 2]); } size_t nextIVIdx = newLoopArgs.size(); newLoopArgs.push_back(valueMapping[forOp.getInductionVar()][numStages - 2]); newLoopArgs.push_back(pipelineIterIdx); newLoopArgs.push_back(loopIterIdx); for (size_t i = 0; i < newLoopArgs.size(); ++i) assert(newLoopArgs[i]); // 1. signature of the new ForOp auto newForOp = builder.create( forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(), newLoopArgs); // 2. 
body of the new ForOp builder.setInsertionPointToStart(newForOp.getBody()); BlockAndValueMapping mapping; for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); // 2.1 clone the loop body, replace original args with args of the new ForOp // Insert async wait if necessary. for (Operation &op : forOp.getBody()->without_terminator()) { Operation *newOp = builder.clone(op, mapping); // update mapping of results for (unsigned dstIdx : llvm::seq(unsigned(0), op.getNumResults())) mapping.map(op.getResult(dstIdx), newOp->getResult(dstIdx)); } // 3. replace loads with block args (from prologue) for (size_t idx = 0; idx < loads.size(); ++idx) { Value load = loads[idx]; assert(load.hasOneUse() && "we assume that this load has one use (ConvertLayout)"); Value loadUse = load.getUsers().begin()->getResult(0); mapping.lookup(loadUse).replaceAllUsesWith( newForOp.getRegionIterArgs()[loadIdx + idx]); // delete old load and layout conversion mapping.lookup(loadUse).getDefiningOp()->erase(); mapping.lookup(load).getDefiningOp()->erase(); } // 4. prefetch the next iteration SmallVector orderedDeps; for (Operation &op : forOp.getLoopBody().front()) { if (depOps.contains(&op)) orderedDeps.push_back(&op); else if (loads.contains(op.getResult(0))) orderedDeps.push_back(&op); } assert(depOps.size() + loads.size() == orderedDeps.size() && "depOps contains invalid values"); BlockAndValueMapping nextMapping; DenseMap depArgsMapping; size_t argIdx = 0; for (BlockArgument arg : depArgs) { BlockArgument nextArg = newForOp.getRegionIterArgs()[argIdx + depArgsBeginIdx]; nextMapping.map(arg, nextArg); ++argIdx; } // Special handling for iv & loop condition Value nextIV = builder.create( newForOp.getInductionVar().getLoc(), newForOp.getRegionIterArgs()[nextIVIdx], newForOp.getStep()); Value nextLoopCond = builder.create(nextIV.getLoc(), arith::CmpIPredicate::slt, nextIV, newForOp.getUpperBound()); nextMapping.map(forOp.getInductionVar(), nextIV); // Slice index SmallVector nextBuffers; SmallVector extractSlices; pipelineIterIdx = newForOp.getRegionIterArgs()[nextIVIdx + 1]; Value insertSliceIndex = builder.create( nextIV.getLoc(), pipelineIterIdx, builder.create(nextIV.getLoc(), numStages, 32)); loopIterIdx = newForOp.getRegionIterArgs()[nextIVIdx + 2]; Value extractSliceIndex = builder.create( nextIV.getLoc(), loopIterIdx, builder.create(nextIV.getLoc(), numStages, 32)); extractSliceIndex = builder.create( extractSliceIndex.getLoc(), builder.getIndexType(), extractSliceIndex); for (Operation *op : orderedDeps) if (!loads.contains(op->getResult(0))) { Operation *nextOp = builder.clone(*op, nextMapping); auto originYield = cast(forOp.getBody()->getTerminator()); for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) { for (OpOperand &operand : originYield->getOpOperands()) { if (operand.get() == op->getResult(dstIdx)) { size_t originIdx = operand.getOperandNumber(); size_t newArgIdx = depArgsIdx[forOp.getRegionIterArgs()[originIdx]]; BlockArgument newArg = newForOp.getRegionIterArgs()[newArgIdx]; nextMapping.map(forOp.getRegionIterArgs()[originIdx], nextOp->getResult(dstIdx)); depArgsMapping[newArg] = nextOp->getResult(dstIdx); } } } } for (Operation *op : orderedDeps) { Operation *nextOp = nullptr; // Update loading mask if (loads.contains(op->getResult(0))) { auto loadOp = llvm::cast(op); Value mask = loadOp.mask(); Value newMask; if (mask) { Value splatCond = 
builder.create( mask.getLoc(), mask.getType(), nextLoopCond); newMask = builder.create( mask.getLoc(), splatCond, nextMapping.lookupOrDefault(mask)); // If mask is defined outside the loop, don't update the map more than // once if (!(forOp.isDefinedOutsideOfLoop(mask) && nextMapping.contains(mask))) nextMapping.map(mask, newMask); newMask = nextMapping.lookupOrDefault(loadOp.mask()); } else newMask = builder.create( loadOp.getLoc(), getI1SameShape(loadOp), nextLoopCond); Value insertAsyncOp = builder.create( op->getLoc(), loadsBuffer[loadOp].getType(), nextMapping.lookupOrDefault(loadOp.ptr()), newForOp.getRegionIterArgs()[bufferIdx + nextBuffers.size()], insertSliceIndex, newMask, nextMapping.lookupOrDefault(loadOp.other()), loadOp.cache(), loadOp.evict(), loadOp.isVolatile(), /*axis*/ 0); builder.create(op->getLoc()); nextBuffers.push_back(insertAsyncOp); auto sliceType = loadsMapping[loadOp].getType().cast(); sliceType = RankedTensorType::get(sliceType.getShape(), sliceType.getElementType(), loadsBufferType[loadOp].getEncoding()); nextOp = builder.create( op->getLoc(), sliceType, insertAsyncOp, SmallVector{extractSliceIndex, int_attr(0), int_attr(0)}, SmallVector{int_attr(1), int_attr(sliceType.getShape()[0]), int_attr(sliceType.getShape()[1])}, SmallVector{int_attr(1), int_attr(1), int_attr(1)}); extractSlices.push_back(nextOp->getResult(0)); // Update mapping of results for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) { nextMapping.map(op->getResult(dstIdx), nextOp->getResult(dstIdx)); // If this is a loop-carried value, update the mapping for yield auto originYield = cast(forOp.getBody()->getTerminator()); for (OpOperand &operand : originYield->getOpOperands()) { if (operand.get() == op->getResult(dstIdx)) { size_t originIdx = operand.getOperandNumber(); size_t newArgIdx = depArgsIdx[forOp.getRegionIterArgs()[originIdx]]; BlockArgument newArg = newForOp.getRegionIterArgs()[newArgIdx]; depArgsMapping[newArg] = nextOp->getResult(dstIdx); } } } } } { OpBuilder::InsertionGuard guard(builder); for (Operation &op : *newForOp.getBody()) { if (auto dotOp = llvm::dyn_cast(&op)) { builder.setInsertionPoint(&op); auto dotType = dotOp.getType().cast(); Value a = dotOp.a(); Value b = dotOp.b(); auto layoutCast = [&](Value dotOperand, int opIdx) -> Value { auto tensorType = dotOperand.getType().cast(); if (!tensorType.getEncoding().isa()) { auto newEncoding = ttg::DotOperandEncodingAttr::get( tensorType.getContext(), opIdx, dotType.getEncoding()); auto newType = RankedTensorType::get(tensorType.getShape(), tensorType.getElementType(), newEncoding); return builder.create(dotOperand.getLoc(), newType, dotOperand); } return dotOperand; }; a = layoutCast(a, 0); b = layoutCast(b, 1); dotOp->setOperand(0, a); dotOp->setOperand(1, b); } } } // async.wait & extract_slice Operation *asyncWait = builder.create( loads[0].getLoc(), loads.size() * (numStages - 2)); for (auto it = extractSlices.rbegin(); it != extractSlices.rend(); ++it) { // move extract_slice after asyncWait it->getDefiningOp()->moveAfter(asyncWait); } // Bump iteration count pipelineIterIdx = builder.create( nextIV.getLoc(), pipelineIterIdx, builder.create(nextIV.getLoc(), 1, 32)); loopIterIdx = builder.create( nextIV.getLoc(), loopIterIdx, builder.create(nextIV.getLoc(), 1, 32)); // Finally, the YieldOp, need to sync with the order of newLoopArgs SmallVector yieldValues; for (Value v : forOp.getBody()->getTerminator()->getOperands()) yieldValues.push_back(mapping.lookup(v)); for (Value nextBuffer : nextBuffers) 
yieldValues.push_back(nextBuffer); for (Value nextSlice : extractSlices) yieldValues.push_back(nextSlice); for (size_t i = depArgsBeginIdx; i < nextIVIdx; ++i) { auto arg = newForOp.getRegionIterArgs()[i]; assert(depArgsMapping.count(arg) && "Missing loop-carried value"); yieldValues.push_back(depArgsMapping[arg]); } yieldValues.push_back(nextIV); yieldValues.push_back(pipelineIterIdx); yieldValues.push_back(loopIterIdx); builder.setInsertionPointToEnd(newForOp.getBody()); builder.create(forOp.getBody()->getTerminator()->getLoc(), yieldValues); return newForOp; } // ref: mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp struct PipelinePass : public TritonGPUPipelineBase { PipelinePass() = default; PipelinePass(int numStages) { this->numStages = numStages; } void runOnOperation() override { int numStages = this->numStages; if (numStages <= 1) return; getOperation()->walk([&](scf::ForOp forOp) -> void { LoopPipeliner pipeliner(forOp, numStages); if (pipeliner.initialize().failed()) return; pipeliner.emitPrologue(); scf::ForOp newForOp = pipeliner.createNewForOp(); pipeliner.emitEpilogue(); // replace the original loop for (unsigned i = 0; i < forOp->getNumResults(); ++i) forOp->getResult(i).replaceAllUsesWith(newForOp->getResult(i)); forOp->erase(); }); } }; } // anonymous namespace std::unique_ptr mlir::createTritonGPUPipelinePass(int numStages) { return std::make_unique(numStages); } triton-2.0.0/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp000066400000000000000000000253321440023377100234060ustar00rootroot00000000000000//===----------------------------------------------------------------------===// // // This pass tries to prefetch operands (a and b) of tt.dot. // Those ConvertLayoutOps will be lowered to shared memory loads. // // For example: // %a: tensor<128x32xf16, #enc> // scf.for %iv = ... iter_args(%a_arg = %a, ...) { // %d = tt.dot %a_arg, %b, %c // ... // scf.yield %a_next, ... // } // // will be translated to // // %a: tensor<128x32xf16, #enc> // %a_tmp = tensor.extract_slice %a[0, 0] [128, 16] // %a_prefetch = triton_gpu.convert_layout %a_tmp // scf.for %iv = ... iter_args(%a_buf = %a, ..., %a_prefetch_arg = %a_prefetch) // { // %x = tt.dot %a_arg, %b, %c // %a_tmp_rem = tensor.extract_slice %a_buf[0, 16] [128, 16] // %a_prefetch_next = triton_gpu.convert_layout %a_tmp_rem // ... 
// scf.yield %next_a, ..., %a_prefetch_next // } //===----------------------------------------------------------------------===// #include "mlir/IR/BlockAndValueMapping.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" using namespace mlir; #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" namespace { class Prefetcher { /// cache the ForOp we are working on scf::ForOp forOp; /// cache the YieldOp of this ForOp scf::YieldOp yieldOp; /// // TODO: add a hook to infer prefetchWidth unsigned prefetchWidth = 16; /// dots to be prefetched SetVector dots; /// dot => dot operand DenseMap dot2aLoopArg; DenseMap dot2aHeaderDef; DenseMap dot2bLoopArg; DenseMap dot2bHeaderDef; DenseMap dot2aYield; DenseMap dot2bYield; /// operand => defining DenseMap operand2headPrefetch; LogicalResult isForOpOperand(Value v); Value generatePrefetch(Value v, unsigned opIdx, bool isPrologue, Attribute dotEncoding, OpBuilder &builder, llvm::Optional offsetK = llvm::None, llvm::Optional shapeK = llvm::None); public: Prefetcher() = delete; Prefetcher(scf::ForOp forOp) : forOp(forOp) { yieldOp = cast(forOp.getBody()->getTerminator()); } LogicalResult initialize(); void emitPrologue(); scf::ForOp createNewForOp(); }; Value Prefetcher::generatePrefetch(Value v, unsigned opIdx, bool isPrologue, Attribute dotEncoding, OpBuilder &builder, llvm::Optional offsetK, llvm::Optional shapeK) { // opIdx: 0 => a, 1 => b auto type = v.getType().cast(); SmallVector shape{type.getShape().begin(), type.getShape().end()}; SmallVector offset{0, 0}; Type elementType = type.getElementType(); auto intAttr = [&](int64_t val) { return builder.getI64IntegerAttr(val); }; // k => (prefetchWidth, k - prefetchWidth) int64_t kIdx = opIdx == 0 ? 1 : 0; offset[kIdx] = isPrologue ? 0 : prefetchWidth; shape[kIdx] = isPrologue ? prefetchWidth : (shape[kIdx] - prefetchWidth); if (shapeK) shape[kIdx] = *shapeK; if (offsetK) offset[kIdx] = *offsetK; Value newSmem = builder.create( v.getLoc(), // TODO: encoding? 
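      // The slice keeps the source tensor's (shared) encoding at this point;
      // the dot_operand encoding is only applied by the convert_layout
      // created right below, once the slice is ready to feed the dot.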
RankedTensorType::get(shape, elementType, type.getEncoding()), v, SmallVector{intAttr(offset[0]), intAttr(offset[1])}, SmallVector{intAttr(shape[0]), intAttr(shape[1])}, SmallVector{intAttr(1), intAttr(1)}); auto dotOperandEnc = triton::gpu::DotOperandEncodingAttr::get( builder.getContext(), opIdx, dotEncoding); Value prefetchSlice = builder.create( v.getLoc(), RankedTensorType::get(shape, elementType, dotOperandEnc), newSmem); return prefetchSlice; } LogicalResult Prefetcher::initialize() { Block *loop = forOp.getBody(); SmallVector dotsInFor; for (Operation &op : *loop) if (auto dotOp = dyn_cast(op)) dotsInFor.push_back(dotOp); if (dotsInFor.empty()) return failure(); // TODO: segfault (original for still has uses) // when used in flash attention that has 2 dots in the loop if (dotsInFor.size() > 1) return failure(); // returns source of cvt auto getPrefetchSrc = [](Value v) -> Value { if (auto cvt = v.getDefiningOp()) if (isSharedEncoding(cvt.getOperand())) return cvt.src(); return Value(); }; auto getIncomingOp = [this](Value v) -> Value { if (auto arg = v.dyn_cast()) if (arg.getOwner()->getParentOp() == forOp.getOperation()) return forOp.getOpOperandForRegionIterArg(arg).get(); return Value(); }; auto getYieldOp = [this](Value v) -> Value { auto arg = v.cast(); unsigned yieldIdx = arg.getArgNumber() - forOp.getNumInductionVars(); return yieldOp.getOperand(yieldIdx); }; for (triton::DotOp dot : dotsInFor) { auto kSize = dot.a().getType().cast().getShape()[1]; // works better with nvidia tensor cores unsigned elementWidth = dot.a().getType().cast().getElementTypeBitWidth(); prefetchWidth = 256 / elementWidth; // Skip prefetching if kSize is less than prefetchWidth if (kSize < prefetchWidth) continue; Value aSmem = getPrefetchSrc(dot.a()); Value bSmem = getPrefetchSrc(dot.b()); if (aSmem && bSmem) { Value aHeaderDef = getIncomingOp(aSmem); Value bHeaderDef = getIncomingOp(bSmem); // Only prefetch loop arg if (aHeaderDef && bHeaderDef) { dots.insert(dot); dot2aHeaderDef[dot] = aHeaderDef; dot2bHeaderDef[dot] = bHeaderDef; dot2aLoopArg[dot] = aSmem; dot2bLoopArg[dot] = bSmem; dot2aYield[dot] = getYieldOp(aSmem); dot2bYield[dot] = getYieldOp(bSmem); } } } return success(); } void Prefetcher::emitPrologue() { OpBuilder builder(forOp); for (Value dot : dots) { Attribute dotEncoding = dot.getType().cast().getEncoding(); Value aPrefetched = generatePrefetch(dot2aHeaderDef[dot], 0, true, dotEncoding, builder); operand2headPrefetch[dot.getDefiningOp().a()] = aPrefetched; Value bPrefetched = generatePrefetch(dot2bHeaderDef[dot], 1, true, dotEncoding, builder); operand2headPrefetch[dot.getDefiningOp().b()] = bPrefetched; } } scf::ForOp Prefetcher::createNewForOp() { OpBuilder builder(forOp); SmallVector loopArgs; for (auto v : forOp.getIterOperands()) loopArgs.push_back(v); for (Value dot : dots) { loopArgs.push_back( operand2headPrefetch[dot.getDefiningOp().a()]); loopArgs.push_back( operand2headPrefetch[dot.getDefiningOp().b()]); } auto newForOp = builder.create( forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(), loopArgs); auto largestPow2 = [](int64_t n) -> int64_t { while ((n & (n - 1)) != 0) n = n & (n - 1); return n; }; builder.setInsertionPointToStart(newForOp.getBody()); BlockAndValueMapping mapping; for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); for (Operation &op : forOp.getBody()->without_terminator()) { 
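    // For a dot selected for prefetching, the body below is rewritten into a
    // chain of dots over prefetchWidth-wide slices of K: the first dot of the
    // chain consumes the operands prefetched in the prologue (carried in as
    // extra loop args), and every remaining slice's extract_slice +
    // convert_layout is inserted *before* the previous dot so that the
    // shared-memory reads overlap with the preceding computation. The
    // accumulator is threaded through operand #2 of each dot in the chain.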
Operation *newOp = builder.clone(op, mapping); auto dot = dyn_cast(&op); if (dots.contains(dot)) { Attribute dotEncoding = dot.getType().cast().getEncoding(); // prefetched dot Operation *firstDot = builder.clone(*dot, mapping); if (Value a = operand2headPrefetch.lookup(dot.a())) firstDot->setOperand( 0, newForOp.getRegionIterArgForOpOperand(*a.use_begin())); if (Value b = operand2headPrefetch.lookup(dot.b())) firstDot->setOperand( 1, newForOp.getRegionIterArgForOpOperand(*b.use_begin())); // remaining part int64_t kOff = prefetchWidth; int64_t kRem = dot.a().getType().cast().getShape()[1] - prefetchWidth; Operation *prevDot = firstDot; while (kRem != 0) { // int64_t kShape = largestPow2(kRem); int64_t kShape = prefetchWidth; auto insertionPoint = builder.saveInsertionPoint(); builder.setInsertionPoint(prevDot); Value aRem = generatePrefetch(mapping.lookup(dot2aLoopArg[dot]), 0, false, dotEncoding, builder, kOff, kShape); Value bRem = generatePrefetch(mapping.lookup(dot2bLoopArg[dot]), 1, false, dotEncoding, builder, kOff, kShape); builder.restoreInsertionPoint(insertionPoint); newOp = builder.clone(*dot, mapping); newOp->setOperand(0, aRem); newOp->setOperand(1, bRem); newOp->setOperand(2, prevDot->getResult(0)); prevDot = newOp; kOff += kShape; kRem -= kShape; } } // update mapping of results for (unsigned dstIdx : llvm::seq(unsigned(0), op.getNumResults())) mapping.map(op.getResult(dstIdx), newOp->getResult(dstIdx)); } // prefetch next iteration SmallVector yieldValues; for (Value v : forOp.getBody()->getTerminator()->getOperands()) yieldValues.push_back(mapping.lookup(v)); for (Value dot : dots) { Attribute dotEncoding = dot.getType().cast().getEncoding(); yieldValues.push_back(generatePrefetch(mapping.lookup(dot2aYield[dot]), 0, true, dotEncoding, builder)); yieldValues.push_back(generatePrefetch(mapping.lookup(dot2bYield[dot]), 1, true, dotEncoding, builder)); } // Update ops of yield builder.create(yieldOp.getLoc(), yieldValues); return newForOp; } struct PrefetchPass : public TritonGPUPrefetchBase { void runOnOperation() override { getOperation()->walk([&](scf::ForOp forOp) { Prefetcher prefetcher(forOp); if (prefetcher.initialize().failed()) return; prefetcher.emitPrologue(); scf::ForOp newForOp = prefetcher.createNewForOp(); // replace the original loop for (unsigned i = 0; i < forOp->getNumResults(); ++i) forOp->getResult(i).replaceAllUsesWith(newForOp->getResult(i)); forOp->erase(); }); } }; } // anonymous namespace std::unique_ptr mlir::createTritonGPUPrefetchPass() { return std::make_unique(); } triton-2.0.0/lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp000066400000000000000000000073551440023377100257020ustar00rootroot00000000000000#include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Verifier.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/Passes.h" #include "mlir/Transforms/RegionUtils.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h" #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" using namespace 
mlir; static inline bool willIncreaseRegisterPressure(triton::gpu::ConvertLayoutOp op) { auto srcType = op.getOperand().getType().cast(); auto dstType = op.getResult().getType().cast(); auto srcEncoding = srcType.getEncoding(); auto dstEncoding = dstType.getEncoding(); if (srcEncoding.isa()) return true; if (dstEncoding.isa()) return true; return false; } class TritonGPUReorderInstructionsPass : public TritonGPUReorderInstructionsBase< TritonGPUReorderInstructionsPass> { public: TritonGPUReorderInstructionsPass() = default; void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp m = getOperation(); // Sink conversions into loops when they will increase // register pressure DenseMap opToMove; m.walk([&](triton::gpu::ConvertLayoutOp op) { if (!willIncreaseRegisterPressure(op)) return; auto user_begin = op->user_begin(); auto user_end = op->user_end(); if (std::distance(user_begin, user_end) != 1) return; if (user_begin->getParentOfType() == op->getParentOfType()) return; opToMove.insert({op, *user_begin}); }); for (auto &kv : opToMove) kv.first->moveBefore(kv.second); // Move convert(load) immediately after dependent load m.walk([&](triton::gpu::ConvertLayoutOp op) { auto dstType = op.getResult().getType().cast(); auto dstEncoding = dstType.getEncoding(); if (!dstEncoding.isa()) return; Operation *argOp = op.getOperand().getDefiningOp(); if (!argOp) return; op->moveAfter(argOp); }); // Move transpositions just after their definition opToMove.clear(); m.walk([&](triton::TransOp op) { Operation *argOp = op.getOperand().getDefiningOp(); if (!argOp) return; op->moveAfter(argOp); }); // Move `dot` operand so that conversions to opIdx=0 happens before // conversions to opIdx=1 m.walk([&](triton::gpu::ConvertLayoutOp op) { auto dstType = op.getResult().getType().cast(); auto dstEncoding = dstType.getEncoding().dyn_cast(); if (!dstEncoding) return; int opIdx = dstEncoding.getOpIdx(); if (opIdx != 0) return; if (op->getUsers().empty()) return; auto dotUser = dyn_cast(*op->user_begin()); if (!dotUser) return; auto BOp = dotUser.getOperand(1).getDefiningOp(); if (!BOp) return; op->moveBefore(BOp); }); return; } }; std::unique_ptr mlir::createTritonGPUReorderInstructionsPass() { return std::make_unique(); } triton-2.0.0/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp000066400000000000000000000076101440023377100255460ustar00rootroot00000000000000#include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h" #include "mlir/IR/BlockAndValueMapping.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include #include using namespace mlir; using namespace mlir::triton::gpu; // // TypeConverter // TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context, int numWarps) : context(context), numWarps(numWarps) { // TODO: how does MLIR pick the right conversion? 
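  // Conversion callbacks are tried in reverse registration order (most
  // recently added first), so the RankedTensorType conversion registered
  // below is attempted first and this identity conversion serves as the
  // catch-all fallback for types that need no rewriting.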
addConversion([](Type type) { return type; }); addConversion([this](RankedTensorType tensorType) -> RankedTensorType { // types with encoding are already in the right format // TODO: check for layout encodings specifically if (tensorType.getEncoding()) return tensorType; // pessimistic values for attributes: // - 1 element per thread // - order = arange(rank) ArrayRef shape = tensorType.getShape(); int rank = shape.size(); llvm::SmallVector order(rank); std::iota(order.begin(), order.end(), 0); llvm::SmallVector sizePerThread(rank, 1); Attribute encoding = triton::gpu::BlockedEncodingAttr::get( this->context, shape, sizePerThread, order, this->numWarps); return RankedTensorType::get(shape, tensorType.getElementType(), encoding); }); // // Materializations // // This will be called when (newArgType != origArgType) // This will create newArg, and map(origArg, newArg) addArgumentMaterialization([&](OpBuilder &builder, RankedTensorType tensorType, ValueRange inputs, Location loc) { llvm_unreachable("Argument rematerialization not implemented"); return llvm::None; }); // If the origValue still has live user(s), use this to // convert origValue to newValue addSourceMaterialization([&](OpBuilder &builder, RankedTensorType tensorType, ValueRange inputs, Location loc) { llvm_unreachable("Source rematerialization not implemented"); return llvm::None; }); // This will be called when (desiredType != newOperandType) // where, desiredType = typeConverter->convertType(origType) // NOTE: only for remapped values. addTargetMaterialization([&](OpBuilder &builder, RankedTensorType tensorType, ValueRange inputs, Location loc) { auto cast = builder.create(loc, tensorType, inputs); return Optional(cast.getResult()); // return Optional(cast.getResult(0)); // llvm_unreachable("Not implemented"); // return llvm::None; }); } // // TritonGPUConversion // TritonGPUConversionTarget::TritonGPUConversionTarget( MLIRContext &context, TritonGPUTypeConverter &typeConverter) : ConversionTarget(context) { // TODO: we should also verify ops of TritonGPUDialect addLegalDialect(); // Some ops from SCF are illegal addIllegalOp(); addDynamicallyLegalDialect([&](Operation *op) { if (typeConverter.isLegal(op)) return true; return false; }); // We have requirements for the data layouts addDynamicallyLegalOp([](triton::DotOp dotOp) -> bool { Attribute aEncoding = dotOp.a().getType().cast().getEncoding(); Attribute bEncoding = dotOp.b().getType().cast().getEncoding(); if (aEncoding && aEncoding.isa() && bEncoding && bEncoding.isa()) return true; return false; }); } triton-2.0.0/lib/Dialect/TritonGPU/Transforms/UpdateMmaForVolta.cpp000066400000000000000000000313161440023377100251770ustar00rootroot00000000000000#include "Utility.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" namespace mlir { namespace { using triton::DotOp; using triton::gpu::ConvertLayoutOp; using triton::gpu::DotOperandEncodingAttr; using triton::gpu::MmaEncodingAttr; using triton::gpu::SharedEncodingAttr; using triton::gpu::SliceEncodingAttr; // This pattern collects the wrong Mma those need to update and create the right // ones for each. 
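// Background: for MMAv1 (Volta) the encoding carries isARow/isBRow/vec4
// flags and a warpsPerCTA that depend on the final operand layouts and
// shapes. Earlier combine patterns may change those, leaving stale
// MmaEncodingAttrs behind; this pattern recomputes the correct attribute
// from the dot operands and records a wrong->correct mapping that
// UpdateMMAForMMAv1 then applies to every type referencing the stale
// encoding (dot results, convert_layout, expand_dims, constants,
// elementwise ops, and slice/dot_operand parents).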
// TODO[Superjomn]: RewirtePattern is not needed here, Rewrite this to a method class CollectMmaToUpdateForVolta : public mlir::RewritePattern { // Holds the mapping from old(wrong) mmaEncodingAttr to the new(correct) // mmaEncodingAttr. DenseMap &mmaToUpdate; public: CollectMmaToUpdateForVolta( mlir::MLIRContext *ctx, DenseMap &mmaToUpdate) : mlir::RewritePattern(triton::DotOp::getOperationName(), 1, ctx), mmaToUpdate(mmaToUpdate) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto dotOp = cast(op); auto *ctx = dotOp->getContext(); auto AT = dotOp.a().getType().cast(); auto BT = dotOp.b().getType().cast(); auto DT = dotOp.d().getType().cast(); auto shapeA = AT.getShape(); auto shapeB = BT.getShape(); if (!DT.getEncoding()) return failure(); auto mmaLayout = DT.getEncoding().dyn_cast(); if (!(mmaLayout && mmaLayout.isVolta())) return failure(); // Has processed. if (mmaToUpdate.count(mmaLayout)) return failure(); auto dotOperandA = AT.getEncoding().cast(); auto dotOperandB = BT.getEncoding().cast(); bool isARow = dotOperandA.getIsMMAv1Row().cast().getValue(); bool isBRow = dotOperandB.getIsMMAv1Row().cast().getValue(); auto [isARow_, isBRow_, isAVec4_, isBVec4_, mmaId] = mmaLayout.decodeVoltaLayoutStates(); bool isAVec4 = !isARow && (shapeA[isARow] <= 16); bool isBVec4 = isBRow && (shapeB[isBRow] <= 16); // The wpt of MMAv1 is also determined by isARow, isBRow and shape, and it // could only be set here for those states might be updated by previous // patterns in the Combine Pass. auto tgtWpt = getWarpsPerCTA(DT.getShape(), isARow, isBRow, isAVec4, isBVec4, product(mmaLayout.getWarpsPerCTA())); if (isARow == isARow_ && isBRow == isBRow_ && isAVec4 == isAVec4_ && isBVec4 == isBVec4_) { if (tgtWpt == mmaLayout.getWarpsPerCTA()) return failure(); } MmaEncodingAttr newMmaLayout; { // Recalculate the wpt, for here we could get the latest information, the // wpt should be updated. auto updatedWpt = getWarpsPerCTA(DT.getShape(), isARow, isBRow, isAVec4, isBVec4, product(mmaLayout.getWarpsPerCTA())); newMmaLayout = MmaEncodingAttr::get(ctx, mmaLayout.getVersionMajor(), updatedWpt, AT.getShape(), BT.getShape(), isARow, isBRow, mmaId); } // Collect the wrong MMA Layouts, and mark need to update. mmaToUpdate.try_emplace(mmaLayout, newMmaLayout); return failure(); } // Get the wpt for MMAv1 using more information. // Reference the original logic here // https://github.com/openai/triton/blob/0e4691e6dd91e001a8d33b71badf8b3314325459/lib/codegen/analysis/layout.cc#L223 SmallVector getWarpsPerCTA(ArrayRef shape, bool isARow, bool isBRow, bool isAVec4, bool isBVec4, int numWarps) const { // TODO[Superjomn]: Share code with // DotOpMmaV1ConversionHelper::AParam/BParam, since same code to compute the // rep,spw and fpw. SmallVector wpt({1, 1}); SmallVector wpt_nm1; SmallVector rep(2), spw(2); std::array fpw{{2, 2, 1}}; int packSize0 = (isARow || isAVec4) ? 1 : 2; rep[0] = 2 * packSize0; spw[0] = fpw[0] * 4 * rep[0]; int packSize1 = (isBRow && !isBVec4) ? 
2 : 1; rep[1] = 2 * packSize1; spw[1] = fpw[1] * 4 * rep[1]; do { wpt_nm1 = wpt; if (wpt[0] * wpt[1] < numWarps) wpt[0] = std::clamp(wpt[0] * 2, 1, shape[0] / spw[0]); if (wpt[0] * wpt[1] < numWarps) wpt[1] = std::clamp(wpt[1] * 2, 1, shape[1] / spw[1]); } while (wpt_nm1 != wpt); return wpt; } }; class UpdateMMAForMMAv1 : public mlir::RewritePattern { const DenseMap &mmaToUpdate; public: UpdateMMAForMMAv1( MLIRContext *context, const DenseMap &mmaToUpdate) : RewritePattern(MatchAnyOpTypeTag{}, 1, context), mmaToUpdate(mmaToUpdate) {} LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { // Nothing to update if (mmaToUpdate.empty()) return failure(); if (auto dotOp = llvm::dyn_cast(op)) return rewriteDotOp(op, rewriter); else if (auto cvtOp = llvm::dyn_cast(op)) return rewriteCvtOp(op, rewriter); else if (auto expandDimsOp = llvm::dyn_cast(op)) return rewriteExpandDimsOp(op, rewriter); else if (auto constOp = llvm::dyn_cast(op)) return rewriteConstantOp(op, rewriter); else return rewriteElementwiseOp(op, rewriter); return failure(); } LogicalResult rewriteDotOp(Operation *op, mlir::PatternRewriter &rewriter) const { auto dotOp = llvm::cast(op); auto tensorTy = dotOp->getResult(0).getType().dyn_cast(); if (!tensorTy) return failure(); auto mma = dotOp.d() .getType() .cast() .getEncoding() .dyn_cast(); if (!mma || !mmaToUpdate.count(mma)) return failure(); auto newTensorTy = getUpdatedType(tensorTy); rewriter.replaceOpWithNewOp(op, newTensorTy, dotOp.a(), dotOp.b(), dotOp.c(), dotOp.allowTF32()); return success(); } LogicalResult rewriteCvtOp(Operation *op, mlir::PatternRewriter &rewriter) const { auto cvt = llvm::cast(op); if (!needUpdate(cvt.getResult().getType())) return failure(); auto tensorTy = cvt.result().getType().dyn_cast(); auto newTensorTy = getUpdatedType(tensorTy); auto newOp = rewriter.replaceOpWithNewOp(op, newTensorTy, cvt.getOperand()); return success(); } LogicalResult rewriteExpandDimsOp(Operation *op, mlir::PatternRewriter &rewriter) const { auto expandDims = llvm::cast(op); auto srcTy = expandDims.src().getType(); auto resTy = expandDims.getResult().getType(); // the result type need to update if (!needUpdate(srcTy) && needUpdate(resTy)) { rewriter.replaceOpWithNewOp(op, expandDims.src(), expandDims.axis()); return success(); } return failure(); } LogicalResult rewriteConstantOp(Operation *op, mlir::PatternRewriter &rewriter) const { auto constant = llvm::cast(op); auto resTy = constant.getResult().getType(); if (!needUpdate(resTy)) return failure(); auto tensorTy = constant.getResult().getType().cast(); auto mma = tensorTy.getEncoding().dyn_cast(); auto dot = tensorTy.getEncoding().dyn_cast(); if (!mma && !dot) return failure(); auto newTensorTy = getUpdatedType(tensorTy); if (auto attr = constant.getValue().dyn_cast()) { auto newRet = SplatElementsAttr::get(newTensorTy, attr.getSplatValue()); rewriter.replaceOpWithNewOp(op, newRet); return success(); } return failure(); } LogicalResult rewriteElementwiseOp(Operation *op, mlir::PatternRewriter &rewriter) const { if (op->getNumOperands() != 1 || op->getNumResults() != 1) return failure(); auto srcTy = op->getOperand(0).getType(); auto resTy = op->getResult(0).getType(); if (needUpdate(resTy)) { // The op-inputs' types are not necessary to update, for some // replaceOpWithNewOp will help update them. 
op->getResult(0).setType( getUpdatedType(resTy.dyn_cast())); return success(); } return failure(); } RankedTensorType getUpdatedType(RankedTensorType type) const { if (!needUpdate(type)) return type; auto encoding = type.getEncoding(); if (auto mma = encoding.dyn_cast()) { auto newMma = mmaToUpdate.lookup(mma); return RankedTensorType::get(type.getShape(), type.getElementType(), newMma); } else if (auto slice = encoding.dyn_cast()) { if (auto mma = slice.getParent().dyn_cast()) { auto newMma = mmaToUpdate.lookup(mma); auto newSlice = SliceEncodingAttr::get(slice.getContext(), slice.getDim(), newMma); return RankedTensorType::get(type.getShape(), type.getElementType(), newSlice); } } else if (auto dotOp = encoding.dyn_cast()) { if (auto mma = dotOp.getParent().dyn_cast()) { auto newMma = mmaToUpdate.lookup(mma); auto newDotOp = DotOperandEncodingAttr::get(dotOp.getContext(), dotOp.getOpIdx(), newMma, dotOp.getIsMMAv1Row()); return RankedTensorType::get(type.getShape(), type.getElementType(), newDotOp); } } return type; } // Tell if this type contains a wrong MMA encoding and need to update. bool needUpdate(Type type) const { auto tensorTy = type.dyn_cast(); if (!tensorTy) return false; return needUpdate(tensorTy); } // Tell if this type contains a wrong MMA encoding and need to update. bool needUpdate(RankedTensorType type) const { auto encoding = type.getEncoding(); if (!encoding) return false; MmaEncodingAttr mma; if ((mma = encoding.dyn_cast())) { } else if (auto slice = encoding.dyn_cast()) { mma = slice.getParent().dyn_cast(); } else if (auto dotOp = encoding.dyn_cast()) { mma = dotOp.getParent().dyn_cast(); } return mma && mmaToUpdate.count(mma); } }; } // namespace #define GEN_PASS_CLASSES #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" class UpdateMmaForVoltaPass : public UpdateMmaForVoltaBase { public: UpdateMmaForVoltaPass() = default; void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp m = getOperation(); llvm::DenseMap mmaToUpdate; { mlir::RewritePatternSet patterns(context); patterns.add(context, mmaToUpdate); GreedyRewriteConfig config; config.enableRegionSimplification = false; // The pattern doesn't modify the IR if (applyPatternsAndFoldGreedily(m, std::move(patterns), config).failed()) signalPassFailure(); } if (!mmaToUpdate.empty()) { mlir::RewritePatternSet patterns(context); patterns.add(context, mmaToUpdate); mlir::GreedyRewriteConfig config; // Make sure the slice and dot_operand layouts' parent mma are updated // before updating DotOp or it will get a mismatch parent-encoding. 
config.useTopDownTraversal = true; if (applyPatternsAndFoldGreedily(m, std::move(patterns), config).failed()) signalPassFailure(); if (fixupLoops(m).failed()) signalPassFailure(); } } }; std::unique_ptr createTritonGPUUpdateMmaForVoltaPass() { return std::make_unique(); } } // namespace mlir triton-2.0.0/lib/Dialect/TritonGPU/Transforms/Utility.cpp000066400000000000000000000036711440023377100233130ustar00rootroot00000000000000#include "Utility.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" namespace mlir { namespace { class FixupLoop : public mlir::RewritePattern { public: explicit FixupLoop(mlir::MLIRContext *context) : mlir::RewritePattern(scf::ForOp::getOperationName(), 2, context) {} mlir::LogicalResult matchAndRewrite(mlir::Operation *op, mlir::PatternRewriter &rewriter) const override { auto forOp = cast(op); // Rewrite init argument SmallVector newInitArgs = forOp.getInitArgs(); bool shouldRematerialize = false; for (size_t i = 0; i < newInitArgs.size(); i++) { if (newInitArgs[i].getType() != forOp.getRegionIterArgs()[i].getType() || newInitArgs[i].getType() != forOp.getResultTypes()[i]) { shouldRematerialize = true; break; } } if (!shouldRematerialize) return failure(); scf::ForOp newForOp = rewriter.create( forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(), newInitArgs); newForOp->moveBefore(forOp); rewriter.setInsertionPointToStart(newForOp.getBody()); BlockAndValueMapping mapping; for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]); mapping.map(forOp.getInductionVar(), newForOp.getInductionVar()); for (Operation &op : forOp.getBody()->getOperations()) { rewriter.clone(op, mapping); } rewriter.replaceOp(forOp, newForOp.getResults()); return success(); } }; } // namespace LogicalResult fixupLoops(ModuleOp mod) { auto *ctx = mod.getContext(); mlir::RewritePatternSet patterns(ctx); patterns.add(ctx); if (applyPatternsAndFoldGreedily(mod, std::move(patterns)).failed()) return failure(); return success(); } } // namespace mlir triton-2.0.0/lib/Dialect/TritonGPU/Transforms/Utility.h000066400000000000000000000005041440023377100227500ustar00rootroot00000000000000#ifndef TRITON_LIB_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_ #define TRITON_LIB_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_ #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" namespace mlir { LogicalResult fixupLoops(ModuleOp mod); } // namespace mlir #endif // TRITON_LIB_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_ triton-2.0.0/lib/Target/000077500000000000000000000000001440023377100150255ustar00rootroot00000000000000triton-2.0.0/lib/Target/CMakeLists.txt000066400000000000000000000000571440023377100175670ustar00rootroot00000000000000add_subdirectory(LLVMIR) add_subdirectory(PTX) triton-2.0.0/lib/Target/LLVMIR/000077500000000000000000000000001440023377100160325ustar00rootroot00000000000000triton-2.0.0/lib/Target/LLVMIR/CMakeLists.txt000066400000000000000000000004011440023377100205650ustar00rootroot00000000000000add_mlir_translation_library(TritonLLVMIR LLVMIRTranslation.cpp LINK_COMPONENTS Core LINK_LIBS PUBLIC MLIRIR MLIRLLVMIR MLIRSCFToStandard MLIRSupport MLIRTargetLLVMIRExport ) triton-2.0.0/lib/Target/LLVMIR/LLVMIRTranslation.cpp000066400000000000000000000256271440023377100217760ustar00rootroot00000000000000#include "triton/Target/LLVMIR/LLVMIRTranslation.h" #include "mlir/Conversion/Passes.h" #include 
"mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" #include "mlir/ExecutionEngine/OptUtils.h" #include "mlir/IR/Dialect.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" #include "mlir/Transforms/Passes.h" #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h" #include "triton/Tools/Sys/GetEnv.hpp" #include "llvm/IR/Constants.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" #include "llvm/Support/SourceMgr.h" #include #include namespace mlir { namespace triton { // Describes NVVM Metadata. It is used to record the nvvm related meta // information from mlir module. struct NVVMMetadata { int maxntidx{-1}; bool isKernel{}; // Free to extend with other information. }; // Add the nvvm related metadata to LLVM IR. static void amendLLVMFunc(llvm::Function *func, const NVVMMetadata &metadata) { auto *module = func->getParent(); auto &ctx = func->getContext(); if (metadata.maxntidx > 0) { auto warps = llvm::ConstantInt::get(llvm::IntegerType::get(ctx, 32), llvm::APInt(32, metadata.maxntidx)); llvm::Metadata *md_args[] = {llvm::ValueAsMetadata::get(func), llvm::MDString::get(ctx, "maxntidx"), llvm::ValueAsMetadata::get(warps)}; module->getOrInsertNamedMetadata("nvvm.annotations") ->addOperand(llvm::MDNode::get(ctx, md_args)); } if (metadata.isKernel) { llvm::Metadata *mdArgs[] = { llvm::ValueAsMetadata::get(func), llvm::MDString::get(ctx, "kernel"), llvm::ValueAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 1))}; module->getOrInsertNamedMetadata("nvvm.annotations") ->addOperand(llvm::MDNode::get(ctx, mdArgs)); } } static void extractNVVMMetadata(mlir::ModuleOp module, llvm::DenseMap *dic) { for (auto op : module.getOps()) { NVVMMetadata meta; bool hasMetadata{}; // maxntid if (op->hasAttr("nvvm.maxntid")) { auto attr = op->getAttr("nvvm.maxntid"); meta.maxntidx = attr.dyn_cast().getInt(); hasMetadata = true; } // kernel if (op->hasAttr("nvvm.kernel")) { meta.isKernel = true; hasMetadata = true; } if (hasMetadata) dic->try_emplace(op.getNameAttr().strref(), std::move(meta)); } } static std::map getExternLibs(mlir::ModuleOp module) { std::map externLibs; SmallVector funcs; module.walk([&](LLVM::LLVMFuncOp func) { if (func.isExternal()) funcs.push_back(func); }); for (auto &func : funcs) { if (func.getOperation()->hasAttr("libname")) { auto name = func.getOperation()->getAttr("libname").dyn_cast(); auto path = func.getOperation()->getAttr("libpath").dyn_cast(); if (name) { std::string libName = name.str(); externLibs[libName] = path.str(); } } } if (module.getOperation()->hasAttr("triton_gpu.externs")) { auto dict = module.getOperation() ->getAttr("triton_gpu.externs") .dyn_cast(); for (auto &attr : dict) { externLibs[attr.getName().strref().trim().str()] = attr.getValue().dyn_cast().strref().trim().str(); } } if (!funcs.empty()) { static const std::string libdevice = "libdevice"; namespace fs = std::filesystem; // Search for libdevice relative to its library path if used from Python // Then native code is in `triton/_C/libtriton.so` and libdevice in // `triton/third_party/cuda/lib/libdevice.10.bc` static const auto this_library_path = [] { Dl_info fileinfo; if (dladdr(reinterpret_cast(&getExternLibs), &fileinfo) == 0) { return std::filesystem::path(); } return 
std::filesystem::path(fileinfo.dli_fname); }(); static const auto runtime_path = this_library_path.parent_path().parent_path() / "third_party" / "cuda" / "lib" / "libdevice.10.bc"; if (fs::exists(runtime_path)) { externLibs.try_emplace(libdevice, runtime_path.string()); } else { // When using the Math Dialect, it is possible that some ops (e.g., log) // are lowered to a function call. In this case, we need to link libdevice // using its default path: // [triton root dir]/python/triton/language/libdevice.10.bc // TODO(Keren): handle external linkage other than libdevice? static const auto this_file_path = std::filesystem::path(__FILE__); static const auto compiletime_path = this_file_path.parent_path() .parent_path() .parent_path() .parent_path() / "python" / "triton" / "third_party" / "cuda" / "lib" / "libdevice.10.bc"; if (!fs::exists(compiletime_path)) { std::string error_msg = "Can't find libdevice at neither " + runtime_path.string() + " nor " + compiletime_path.string(); llvm::report_fatal_error(error_msg.c_str()); } externLibs.try_emplace(libdevice, compiletime_path.string()); } } return externLibs; } static void linkLibdevice(llvm::Module &module) { // please check https://llvm.org/docs/NVPTXUsage.html#reflection-parameters // this will enable fast math path in libdevice // for example, when enable nvvm-reflect-ftz, sqrt.approx.f32 will change to // sqrt.approx.ftz.f32 auto &ctx = module.getContext(); llvm::Type *i32 = llvm::Type::getInt32Ty(ctx); llvm::Metadata *mdFour = llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(i32, 4)); llvm::Metadata *mdName = llvm::MDString::get(ctx, "nvvm-reflect-ftz"); llvm::Metadata *mdOne = llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(i32, 1)); llvm::MDNode *reflect = llvm::MDNode::get(ctx, {mdFour, mdName, mdOne}); module.addModuleFlag(reflect); } static bool linkExternLib(llvm::Module &module, llvm::StringRef name, llvm::StringRef path) { llvm::SMDiagnostic err; auto &ctx = module.getContext(); auto extMod = llvm::parseIRFile(path, err, ctx); if (!extMod) { llvm::errs() << "Failed to load " << path; return true; } extMod->setTargetTriple(module.getTargetTriple()); extMod->setDataLayout(module.getDataLayout()); if (llvm::Linker::linkModules(module, std::move(extMod), llvm::Linker::Flags::LinkOnlyNeeded)) { llvm::errs() << "Failed to link " << path; return true; } if (name == "libdevice") { linkLibdevice(module); } else { assert(false && "unknown extern lib: "); } return false; } std::unique_ptr translateLLVMToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module) { DialectRegistry registry; mlir::registerLLVMDialectTranslation(registry); mlir::registerNVVMDialectTranslation(registry); module->getContext()->appendDialectRegistry(registry); llvm::DenseMap nvvmMetadata; extractNVVMMetadata(module, &nvvmMetadata); auto llvmModule = mlir::translateModuleToLLVMIR(module, *llvmContext); if (!llvmModule) { llvm::errs() << "Failed to emit LLVM IR\n"; return nullptr; } // Link external libraries before perform optimizations // Note from libdevice users guide: // https://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html // The standard process for linking with libdevice is to first link it with // the target module, then run the standard LLVM optimization and code // generation passes. This allows the optimizers to inline and perform // analyses on the used library functions, and eliminate any used functions as // dead code. 
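  // Linking below uses LinkOnlyNeeded, so only the libdevice functions that
  // the module actually references are pulled in, and linkLibdevice() sets
  // the nvvm-reflect-ftz module flag so the optimizer selects the
  // flush-to-zero variants of those functions before codegen.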
auto externLibs = getExternLibs(module); for (auto &lib : externLibs) { if (linkExternLib(*llvmModule, lib.first, lib.second)) return nullptr; } auto optPipeline = mlir::makeOptimizingTransformer( /*optLevel=*/3, /*sizeLevel=*/0, /*targetMachine=*/nullptr); if (auto err = optPipeline(llvmModule.get())) { llvm::errs() << "Failed to optimize LLVM IR " << err << "\n"; return nullptr; } for (auto &func : llvmModule->functions()) { auto it = nvvmMetadata.find(func.getName()); if (it != nvvmMetadata.end()) amendLLVMFunc(&func, it->second); } return llvmModule; } std::unique_ptr translateTritonGPUToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module, int computeCapability) { mlir::PassManager pm(module->getContext()); applyPassManagerCLOptions(pm); auto printingFlags = mlir::OpPrintingFlags(); printingFlags.elideLargeElementsAttrs(16); pm.enableIRPrinting( /*shouldPrintBeforePass=*/nullptr, /*shouldPrintAfterPass=*/ [](mlir::Pass *pass, mlir::Operation *) { return ::triton::tools::getBoolEnv("MLIR_ENABLE_DUMP"); }, /*printModuleScope=*/false, /*printAfterOnlyOnChange=*/true, /*printAfterOnlyOnFailure*/ false, llvm::dbgs(), printingFlags); pm.addPass(createConvertTritonGPUToLLVMPass(computeCapability)); // Canonicalize to eliminate the remaining UnrealizedConversionCastOp pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::createCSEPass()); // Simplify the IR to improve readability. pm.addPass(mlir::createSymbolDCEPass()); pm.addPass(mlir::createCanonicalizerPass()); if (failed(pm.run(module))) { llvm::errs() << "Pass execution failed"; return nullptr; } auto llvmIR = translateLLVMToLLVMIR(llvmContext, module); if (!llvmIR) { llvm::errs() << "Translate to LLVM IR failed"; return nullptr; } return llvmIR; } void addExternalLibs(mlir::ModuleOp &module, const std::vector &names, const std::vector &paths) { if (names.empty() || names.size() != paths.size()) return; llvm::SmallVector attrs; for (size_t i = 0; i < names.size(); ++i) { auto name = StringAttr::get(module->getContext(), names[i]); auto path = StringAttr::get(module->getContext(), paths[i]); NamedAttribute attr(name, path); attrs.push_back(attr); } DictionaryAttr dict = DictionaryAttr::get(module->getContext(), attrs); module.getOperation()->setAttr("triton_gpu.externs", dict); } } // namespace triton } // namespace mlir triton-2.0.0/lib/Target/PTX/000077500000000000000000000000001440023377100155005ustar00rootroot00000000000000triton-2.0.0/lib/Target/PTX/CMakeLists.txt000066400000000000000000000002411440023377100202350ustar00rootroot00000000000000add_mlir_translation_library(TritonPTX PTXTranslation.cpp LINK_COMPONENTS Core LINK_LIBS PUBLIC TritonLLVMIR ) triton-2.0.0/lib/Target/PTX/PTXTranslation.cpp000066400000000000000000000070451440023377100211040ustar00rootroot00000000000000#include "triton/Target/PTX/PTXTranslation.h" #include "triton/Target/LLVMIR/LLVMIRTranslation.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" namespace triton { static void initLLVM() { LLVMInitializeNVPTXTargetInfo(); LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXTargetMC(); LLVMInitializeNVPTXAsmPrinter(); } static bool findAndReplace(std::string &str, const std::string &begin, const std::string &end, const std::string &target) { size_t startReplace = str.find(begin); if (startReplace == std::string::npos) return false; size_t endReplace = 
str.find(end, startReplace); if (endReplace == std::string::npos) return false; str.replace(startReplace, endReplace + 1 - startReplace, target); return true; } std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) { // LLVM version in use may not officially support target hardware. // Supported versions for LLVM 14 are here: // https://github.com/llvm/llvm-project/blob/f28c006a5895fc0e329fe15fead81e37457cb1d1/clang/include/clang/Basic/BuiltinsNVPTX.def int maxPTX = std::min(75, version); int maxCC = std::min(86, cc); // options auto options = llvm::cl::getRegisteredOptions(); auto *shortPtr = static_cast *>(options["nvptx-short-ptr"]); assert(shortPtr); shortPtr->setValue(true); std::string sm = "sm_" + std::to_string(maxCC); // max PTX version int ptxMajor = maxPTX / 10; int ptxMinor = maxPTX % 10; // create llvm::SmallVector buffer; std::string triple = "nvptx64-nvidia-cuda"; std::string proc = "sm_" + std::to_string(maxCC); std::string layout = ""; std::string features = ""; // std::string features = "+ptx" + std::to_string(maxPTX); initLLVM(); // verify and store llvm llvm::legacy::PassManager pm; pm.add(llvm::createVerifierPass()); pm.run(module); // module->print(llvm::outs(), nullptr); // create machine module.setTargetTriple(triple); std::string error; auto target = llvm::TargetRegistry::lookupTarget(module.getTargetTriple(), error); llvm::TargetOptions opt; opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; opt.UnsafeFPMath = false; opt.NoInfsFPMath = false; opt.NoNaNsFPMath = true; llvm::TargetMachine *machine = target->createTargetMachine( module.getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); // set data layout if (layout.empty()) module.setDataLayout(machine->createDataLayout()); else module.setDataLayout(layout); // emit machine code for (llvm::Function &f : module.functions()) f.addFnAttr(llvm::Attribute::AlwaysInline); llvm::legacy::PassManager pass; llvm::raw_svector_ostream stream(buffer); // emit machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile); pass.run(module); // post-process std::string result(buffer.begin(), buffer.end()); findAndReplace(result, ".version", "\n", ".version " + std::to_string(ptxMajor) + "." 
+ std::to_string(ptxMinor) + "\n"); findAndReplace(result, ".target", "\n", ".target " + sm + "\n"); while (findAndReplace(result, "\t// begin inline asm", "\n", "")) ; while (findAndReplace(result, "\t// end inline asm", "\n", "")) ; return result; } } // namespace triton triton-2.0.0/python/000077500000000000000000000000001440023377100143525ustar00rootroot00000000000000triton-2.0.0/python/MANIFEST.in000066400000000000000000000000111440023377100161000ustar00rootroot00000000000000graft srctriton-2.0.0/python/README.md000066400000000000000000000000001440023377100156170ustar00rootroot00000000000000triton-2.0.0/python/examples/000077500000000000000000000000001440023377100161705ustar00rootroot00000000000000triton-2.0.0/python/examples/copy_strided.py000066400000000000000000000010271440023377100212320ustar00rootroot00000000000000 import triton import triton.language as tl # triton kernel @triton.jit def kernel(X, stride_xm, Z, stride_zn, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): off_m = tl.arange(0, BLOCK_M) off_n = tl.arange(0, BLOCK_N) Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * 1 Zs = Z + off_m[:, None] * 1 + off_n[None, :] * stride_zn tl.store(Zs, tl.load(Xs)) ret = triton.compile(kernel, signature="*fp32,i32,*fp32,i32", constants={"BLOCK_M": 64, "BLOCK_N": 64}, output="ttgir") print(ret) triton-2.0.0/python/examples/empty.py000066400000000000000000000003261440023377100177010ustar00rootroot00000000000000import torch import triton import triton.language as tl @triton.jit def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr): pass X = torch.randn(1, device="cuda") pgm = kernel[(1,)](X, 1, 1, BLOCK=1024) triton-2.0.0/python/setup.cfg000066400000000000000000000001611440023377100161710ustar00rootroot00000000000000[metadata] description_file = README.md [pycodestyle] ignore = E501,E701,E731 [flake8] ignore = E501,E701,E731 triton-2.0.0/python/setup.py000066400000000000000000000212211440023377100160620ustar00rootroot00000000000000import distutils import os import platform import re import shutil import subprocess import sys import tarfile import tempfile import urllib.request from distutils.version import LooseVersion from typing import NamedTuple from setuptools import Extension, setup from setuptools.command.build_ext import build_ext # Taken from https://github.com/pytorch/pytorch/blob/master/tools/setup_helpers/env.py def check_env_flag(name: str, default: str = "") -> bool: return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"] def get_build_type(): if check_env_flag("DEBUG"): return "Debug" elif check_env_flag("REL_WITH_DEB_INFO"): return "RelWithDebInfo" elif check_env_flag("TRITON_REL_BUILD_WITH_ASSERTS"): return "TritonRelBuildWithAsserts" else: # TODO: change to release when stable enough return "TritonRelBuildWithAsserts" # --- third party packages ----- class Package(NamedTuple): package: str name: str url: str test_file: str include_flag: str lib_flag: str syspath_var_name: str # pybind11 def get_pybind11_package_info(): name = "pybind11-2.10.0" url = "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz" return Package("pybind11", name, url, "include/pybind11/pybind11.h", "PYBIND11_INCLUDE_DIR", "", "PYBIND11_SYSPATH") # llvm def get_llvm_package_info(): # download if nothing is installed system = platform.system() if system == "Darwin": system_suffix = "apple-darwin" elif system == "Linux": vglibc = tuple(map(int, platform.libc_ver()[1].split('.'))) vglibc = vglibc[0] * 100 + vglibc[1] linux_suffix = 'ubuntu-18.04' if vglibc > 
217 else 'centos-7' system_suffix = f"linux-gnu-{linux_suffix}" else: raise RuntimeError(f"unsupported system: {system}") use_assert_enabled_llvm = check_env_flag("TRITON_USE_ASSERT_ENABLED_LLVM", "False") release_suffix = "assert" if use_assert_enabled_llvm else "release" name = f'llvm+mlir-14.0.6-x86_64-{system_suffix}-{release_suffix}' url = f"https://github.com/ptillet/triton-llvm-releases/releases/download/llvm-14.0.6-f28c006a5895/{name}.tar.xz" return Package("llvm", name, url, "lib", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH") def get_thirdparty_packages(triton_cache_path): packages = [get_pybind11_package_info(), get_llvm_package_info()] thirdparty_cmake_args = [] for p in packages: package_root_dir = os.path.join(triton_cache_path, p.package) package_dir = os.path.join(package_root_dir, p.name) if p.syspath_var_name in os.environ: package_dir = os.environ[p.syspath_var_name] test_file_path = os.path.join(package_dir, p.test_file) if not os.path.exists(test_file_path): try: shutil.rmtree(package_root_dir) except Exception: pass os.makedirs(package_root_dir, exist_ok=True) print(f'downloading and extracting {p.url} ...') ftpstream = urllib.request.urlopen(p.url) file = tarfile.open(fileobj=ftpstream, mode="r|*") file.extractall(path=package_root_dir) if p.include_flag: thirdparty_cmake_args.append(f"-D{p.include_flag}={package_dir}/include") if p.lib_flag: thirdparty_cmake_args.append(f"-D{p.lib_flag}={package_dir}/lib") return thirdparty_cmake_args # ---- package data --- def download_and_copy_ptxas(): base_dir = os.path.dirname(__file__) src_path = "bin/ptxas" url = "https://conda.anaconda.org/nvidia/label/cuda-12.0.0/linux-64/cuda-nvcc-12.0.76-0.tar.bz2" dst_prefix = os.path.join(base_dir, "triton") dst_suffix = os.path.join("third_party", "cuda", src_path) dst_path = os.path.join(dst_prefix, dst_suffix) if not os.path.exists(dst_path): print(f'downloading and extracting {url} ...') ftpstream = urllib.request.urlopen(url) file = tarfile.open(fileobj=ftpstream, mode="r|*") with tempfile.TemporaryDirectory() as temp_dir: file.extractall(path=temp_dir) src_path = os.path.join(temp_dir, src_path) os.makedirs(os.path.split(dst_path)[0], exist_ok=True) shutil.copy(src_path, dst_path) return dst_suffix # ---- cmake extension ---- class CMakeExtension(Extension): def __init__(self, name, path, sourcedir=""): Extension.__init__(self, name, sources=[]) self.sourcedir = os.path.abspath(sourcedir) self.path = path class CMakeBuild(build_ext): user_options = build_ext.user_options + [('base-dir=', None, 'base directory of Triton')] def initialize_options(self): build_ext.initialize_options(self) self.base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) def finalize_options(self): build_ext.finalize_options(self) def run(self): try: out = subprocess.check_output(["cmake", "--version"]) except OSError: raise RuntimeError( "CMake must be installed to build the following extensions: " + ", ".join(e.name for e in self.extensions) ) if platform.system() == "Windows": cmake_version = LooseVersion(re.search(r"version\s*([\d.]+)", out.decode()).group(1)) if cmake_version < "3.1.0": raise RuntimeError("CMake >= 3.1.0 is required on Windows") for ext in self.extensions: self.build_extension(ext) def build_extension(self, ext): lit_dir = shutil.which('lit') triton_cache_path = os.path.join(os.environ["HOME"], ".triton") # lit is used by the test suite thirdparty_cmake_args = get_thirdparty_packages(triton_cache_path) extdir = 
os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path))) # create build directories if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) # python directories python_include_dir = distutils.sysconfig.get_python_inc() cmake_args = [ "-DLLVM_ENABLE_WERROR=ON", "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir, "-DTRITON_BUILD_TUTORIALS=OFF", "-DTRITON_BUILD_PYTHON_MODULE=ON", "-DPython3_EXECUTABLE:FILEPATH=" + sys.executable, "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON", "-DPYTHON_INCLUDE_DIRS=" + python_include_dir, ] if lit_dir is not None: cmake_args.append("-DLLVM_EXTERNAL_LIT=" + lit_dir) cmake_args.extend(thirdparty_cmake_args) # configuration cfg = get_build_type() build_args = ["--config", cfg] if platform.system() == "Windows": cmake_args += [f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] if sys.maxsize > 2**32: cmake_args += ["-A", "x64"] build_args += ["--", "/m"] else: import multiprocessing cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] build_args += ['-j' + str(2 * multiprocessing.cpu_count())] env = os.environ.copy() subprocess.check_call(["cmake", self.base_dir] + cmake_args, cwd=self.build_temp, env=env) subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=self.build_temp) download_and_copy_ptxas() setup( name="triton", version="2.0.0", author="Philippe Tillet", author_email="phil@openai.com", description="A language and compiler for custom Deep Learning operations", long_description="", packages=["triton", "triton/_C", "triton/language", "triton/tools", "triton/impl", "triton/ops", "triton/runtime", "triton/ops/blocksparse"], install_requires=[ "cmake", "filelock", "torch", "lit", ], package_data={"triton": ["third_party/**/*"]}, include_package_data=True, ext_modules=[CMakeExtension("triton", "triton/_C/")], cmdclass={"build_ext": CMakeBuild}, zip_safe=False, # for PyPI keywords=["Compiler", "Deep Learning"], url="https://github.com/openai/triton/", classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Topic :: Software Development :: Build Tools", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.6", ], test_suite="tests", extras_require={ "tests": [ "autopep8", "flake8", "isort", "numpy", "pytest", "scipy>=1.7.1", ], "tutorials": [ "matplotlib", "pandas", "tabulate", ], }, ) triton-2.0.0/python/src/000077500000000000000000000000001440023377100151415ustar00rootroot00000000000000triton-2.0.0/python/src/main.cc000066400000000000000000000004661440023377100164020ustar00rootroot00000000000000#include void init_superblocking(pybind11::module &m); void init_torch_utils(pybind11::module &m); void init_triton(pybind11::module &m); void init_cutlass(pybind11::module &m); PYBIND11_MODULE(libtriton, m) { m.doc() = "Python bindings to the C++ Triton API"; init_triton(m); } triton-2.0.0/python/src/triton.cc000066400000000000000000001777721440023377100170140ustar00rootroot00000000000000#include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" #include "mlir/Conversion/Passes.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" #include "mlir/Parser.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "triton/Analysis/Allocation.h" #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h" #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include 
"triton/Dialect/Triton/IR/Types.h" #include "triton/Dialect/Triton/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Target/LLVMIR/LLVMIRTranslation.h" #include "triton/Target/PTX/PTXTranslation.h" #include "triton/Tools/Sys/GetEnv.hpp" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SourceMgr.h" #include #include #include #include #include #include #include #include #include #include #include #include #include namespace py = pybind11; enum backend_t { HOST, CUDA, ROCM, }; void init_triton_runtime(py::module &&m) { // wrap backend_t py::enum_(m, "backend") .value("HOST", HOST) .value("CUDA", CUDA) // .value("ROCM", ROCM) .export_values(); } /*****************************************************************************/ /* Python bindings for triton::ir */ /*****************************************************************************/ void init_triton_ir(py::module &&m) { using ret = py::return_value_policy; using namespace pybind11::literals; py::enum_(m, "CACHE_MODIFIER") .value("NONE", mlir::triton::CacheModifier::NONE) .value("CA", mlir::triton::CacheModifier::CA) .value("CG", mlir::triton::CacheModifier::CG) .export_values(); py::enum_(m, "EVICTION_POLICY") .value("NORMAL", mlir::triton::EvictionPolicy::NORMAL) .value("EVICT_FIRST", mlir::triton::EvictionPolicy::EVICT_FIRST) .value("EVICT_LAST", mlir::triton::EvictionPolicy::EVICT_LAST) .export_values(); py::enum_(m, "REDUCE_OP") .value("ADD", mlir::triton::RedOp::ADD) .value("FADD", mlir::triton::RedOp::FADD) .value("MIN", mlir::triton::RedOp::MIN) .value("MAX", mlir::triton::RedOp::MAX) .value("UMIN", mlir::triton::RedOp::UMIN) .value("UMAX", mlir::triton::RedOp::UMAX) .value("ARGMIN", mlir::triton::RedOp::ARGMIN) .value("ARGMAX", mlir::triton::RedOp::ARGMAX) .value("ARGUMIN", mlir::triton::RedOp::ARGUMIN) .value("ARGUMAX", mlir::triton::RedOp::ARGUMAX) .value("FMIN", mlir::triton::RedOp::FMIN) .value("FMAX", mlir::triton::RedOp::FMAX) .value("ARGFMIN", mlir::triton::RedOp::ARGFMIN) .value("ARGFMAX", mlir::triton::RedOp::ARGFMAX) .value("XOR", mlir::triton::RedOp::XOR); py::enum_(m, "ATOMIC_OP") .value("ADD", mlir::triton::RMWOp::ADD) .value("FADD", mlir::triton::RMWOp::FADD) .value("AND", mlir::triton::RMWOp::AND) .value("OR", mlir::triton::RMWOp::OR) .value("XOR", mlir::triton::RMWOp::XOR) .value("XCHG", mlir::triton::RMWOp::XCHG) .value("MAX", mlir::triton::RMWOp::MAX) .value("MIN", mlir::triton::RMWOp::MIN) .value("UMIN", mlir::triton::RMWOp::UMIN) .value("UMAX", mlir::triton::RMWOp::UMAX); py::class_(m, "context") .def(py::init<>()) .def("load_triton", [](mlir::MLIRContext &self) { self.getOrLoadDialect(); // we load LLVM because the frontend uses LLVM.undef for // some placeholders self.getOrLoadDialect(); self.getOrLoadDialect(); self.getOrLoadDialect(); }); // .def(py::init([](){ // mlir::MLIRContext context; // context.getOrLoadDialect(); // // TODO: should we return a (raw/unique) pointer here? 
// return context; // })); // py::class_(m, "value") // .def("multiple_of", [](ir::value *self, int val) { // if (auto *instr = dynamic_cast(self)) { // instr->set_metadata(ir::metadata::multiple_of, val); // } else // throw std::runtime_error("multiple_of"); // }) // .def("max_contiguous", [](ir::value *self, int val) { // if (auto *instr = dynamic_cast(self)) { // instr->set_metadata(ir::metadata::max_contiguous, val); // } else // throw std::runtime_error("max_contiguous"); // }) // .def("set_fdiv_ieee_rounding", [](ir::value *self, bool val) { // if (auto *instr = dynamic_cast(self)) // instr->set_fdiv_ieee_rounding(val); // else // throw std::runtime_error("set_fdiv_ieee_rounding"); // }) // .def("ops", [](ir::value *self) { // if (auto *instr = dynamic_cast(self)) { // return instr->ops(); // } // throw std::runtime_error("cannot use ops()"); // }) // .def("replace_all_uses_with", &ir::value::replace_all_uses_with) // .def("erase_from_parent", [](ir::value *self) { // if (auto *instr = dynamic_cast(self)) // return instr->erase_from_parent(); // throw std::runtime_error("cannot use erase_from_parent"); // }) // .def_property("name", &ir::value::get_name, &ir::value::set_name) // .def_property_readonly("type", &ir::value::get_type); // // // Do we need under in TritonIR ? // // py::class_(m, "undef") // // .def("get", &ir::undef_value::get, ret::reference); py::class_(m, "type") .def("is_integer", &mlir::Type::isInteger) .def("is_fp16", &mlir::Type::isF16) .def("__str__", [](mlir::Type &self) { std::string str; llvm::raw_string_ostream os(str); self.print(os); return os.str(); }); py::class_(m, "function_type") .def("param_types", [](mlir::FunctionType &self) { return std::vector(self.getInputs().begin(), self.getInputs().end()); }); py::class_(m, "value") .def("set_attr", [](mlir::Value &self, std::string &name, mlir::Attribute &attr) -> void { if (mlir::Operation *definingOp = self.getDefiningOp()) definingOp->setAttr(name, attr); else { /* issue a warning */ } }) .def("get_context", &mlir::Value::getContext) .def("replace_all_uses_with", [](mlir::Value &self, mlir::Value &newValue) { self.replaceAllUsesWith(newValue); }) .def("get_type", &mlir::Value::getType); py::class_(m, "block_argument"); py::class_(m, "region") .def("get_parent_region", &mlir::Region::getParentRegion, ret::reference) .def("size", [](mlir::Region &self) { return self.getBlocks().size(); }) .def("empty", &mlir::Region::empty); py::class_(m, "block") .def("arg", [](mlir::Block &self, int index) -> mlir::BlockArgument { return self.getArgument(index); }) .def("add_argument", [](mlir::Block &self, mlir::Type ty) { auto loc = mlir::UnknownLoc::get(ty.getContext()); self.addArgument(ty, loc); }) .def("get_num_arguments", &mlir::Block::getNumArguments) .def("dump", &mlir::Block::dump) .def("move_before", &mlir::Block::moveBefore) .def("insert_before", &mlir::Block::insertBefore) .def("get_parent", &mlir::Block::getParent, ret::reference) .def("merge_block_before", [](mlir::Block &self, mlir::Block &dst) { // ref: RewriterBase::mergeBlocks() if (self.getNumArguments() != 0) throw std::runtime_error( "This block has arguments, don't merge"); dst.getOperations().splice(dst.begin(), self.getOperations()); self.dropAllUses(); self.erase(); }) .def("replace_use_in_block_with", [](mlir::Block &self, mlir::Value &v, mlir::Value &newVal) { v.replaceUsesWithIf(newVal, [&](mlir::OpOperand &operand) { mlir::Operation *user = operand.getOwner(); mlir::Block *currentBlock = user->getBlock(); while (currentBlock) { if (currentBlock 
== &self) return true; // Move up one level currentBlock = currentBlock->getParent()->getParentOp()->getBlock(); } return false; }); }) .def("__str__", [](mlir::Block &self) { std::string str; llvm::raw_string_ostream os(str); self.print(os); return str; }) .def("has_terminator", [](mlir::Block &self) { return !self.empty() && self.back().hasTrait(); }) .def("erase", [](mlir::Block &self) { self.erase(); }); // using eattr = ir::attribute_kind_t; // py::enum_(m, "attribute_kind") // .value("readonly", eattr::readonly) // .value("writeonly", eattr::writeonly) // .value("noalias", eattr::noalias) // .value("aligned", eattr::aligned) // .value("multiple_of", eattr::multiple_of) // .value("retune", eattr::retune) // .value("not_implemented", eattr::not_implemented); py::class_(m, "attribute"); py::class_(m, "integer_attr"); py::class_(m, "bool_attr"); // Ops py::class_(m, "OpState") .def("set_attr", [](mlir::OpState &self, std::string &name, mlir::Attribute &attr) -> void { self->setAttr(name, attr); }) .def( "get_num_results", [](mlir::OpState &self) -> unsigned { return self->getNumResults(); }) .def("get_result", [](mlir::OpState &self, unsigned idx) -> mlir::Value { return self->getResult(idx); }) .def( "get_region", [](mlir::OpState &self, unsigned idx) -> mlir::Region & { return self->getRegion(idx); }, ret::reference) .def( "get_body", [](mlir::scf::ForOp &self, unsigned idx) -> mlir::Block * { return self.getBody(idx); }, ret::reference) .def("dump", [](mlir::OpState &self) { self->dump(); }) .def("__str__", [](mlir::OpState &self) -> std::string { std::string str; llvm::raw_string_ostream os(str); self->print(os); return str; }) .def("append_operand", [](mlir::OpState &self, mlir::Value &val) { self->insertOperands(self->getNumOperands(), val); }) .def("verify", [](mlir::OpState &self) -> bool { return mlir::succeeded(mlir::verify(self.getOperation())); }); // scf Ops py::class_(m, "ForOp") .def("get_induction_var", &mlir::scf::ForOp::getInductionVar); py::class_(m, "IfOp") .def("get_then_block", &mlir::scf::IfOp::thenBlock, ret::reference) .def("get_else_block", &mlir::scf::IfOp::elseBlock, ret::reference) .def("get_then_yield", &mlir::scf::IfOp::thenYield) .def("get_else_yield", &mlir::scf::IfOp::elseYield); py::class_(m, "YieldOp"); py::class_(m, "WhileOp") .def("get_before", &mlir::scf::WhileOp::getBefore, ret::reference) .def("get_after", &mlir::scf::WhileOp::getAfter, ret::reference); py::class_(m, "ConditionOp"); // dynamic_attr is used to transfer ownership of the MLIR context to the // module py::class_(m, "module", py::dynamic_attr()) .def("dump", &mlir::ModuleOp::dump) .def("str", [](mlir::ModuleOp &self) -> std::string { std::string str; llvm::raw_string_ostream os(str); self.print(os); return str; }) .def("push_back", [](mlir::ModuleOp &self, mlir::FuncOp &funcOp) -> void { self.push_back(funcOp); }) .def("has_function", [](mlir::ModuleOp &self, std::string &funcName) -> bool { if (self.lookupSymbol(funcName)) return true; return false; }) .def("get_function", [](mlir::ModuleOp &self, std::string &funcName) -> mlir::FuncOp { return self.lookupSymbol(funcName); }) .def("get_single_function", [](mlir::ModuleOp &self) -> mlir::FuncOp { llvm::SmallVector funcs; self.walk([&](mlir::FuncOp func) { funcs.push_back(func); }); if (funcs.size() != 1) throw std::runtime_error("Expected a single function"); return funcs[0]; }); m.def("make_attr", [](const std::vector &values, mlir::MLIRContext &context) { return mlir::DenseIntElementsAttr::get( mlir::RankedTensorType::get( 
{static_cast(values.size())}, mlir::IntegerType::get(&context, 32)), values) .cast(); }); m.def( "parse_mlir_module", [](const std::string &inputFilename, mlir::MLIRContext &context) { // initialize registry // note: we initialize llvm for undef mlir::DialectRegistry registry; registry.insert(); context.appendDialectRegistry(registry); context.loadAllAvailableDialects(); // parse module mlir::OwningOpRef module( mlir::parseSourceFile(inputFilename, &context)); // locations are incompatible with ptx < 7.5 ! module->walk([](mlir::Operation *op) { op->setLoc(mlir::UnknownLoc::get(op->getContext())); }); if (!module) throw std::runtime_error("Parse MLIR file failed."); return module->clone(); }, ret::take_ownership); py::class_(m, "function") // .def_property_readonly("attrs", &ir::function::attrs) // .def("add_attr", &ir::function::add_attr); .def("args", [](mlir::FuncOp &self, unsigned idx) -> mlir::BlockArgument { return self.getArgument(idx); }) .def( "add_entry_block", [](mlir::FuncOp &self) -> mlir::Block * { return self.addEntryBlock(); }, ret::reference) .def( "set_arg_attr", [](mlir::FuncOp &self, int arg_no, const std::string &name, int val) { // set arg attributes "name" to value "val" auto attrTy = mlir::IntegerType::get(self.getContext(), 32); self.setArgAttr(arg_no, name, mlir::IntegerAttr::get(attrTy, val)); }, ret::reference) .def_property_readonly("type", &mlir::FuncOp::getType) .def("reset_type", &mlir::FuncOp::setType); py::class_(m, "InsertPoint"); py::class_(m, "builder", py::dynamic_attr()) .def(py::init()) // // getters .def_property_readonly("context", &mlir::OpBuilder::getContext, ret::reference) .def("create_module", [](mlir::OpBuilder &self) -> mlir::ModuleOp { auto loc = self.getUnknownLoc(); return self.create(loc); }) .def("ret", [](mlir::OpBuilder &self, std::vector &vals) -> void { auto loc = self.getUnknownLoc(); self.create(loc, vals); }) .def("call", [](mlir::OpBuilder &self, mlir::FuncOp &func, std::vector &args) -> mlir::OpState { auto loc = self.getUnknownLoc(); return self.create(loc, func, args); }) // insertion block/point .def("set_insertion_point_to_start", [](mlir::OpBuilder &self, mlir::Block &block) -> void { self.setInsertionPointToStart(&block); }) .def("set_insertion_point_to_end", [](mlir::OpBuilder &self, mlir::Block &block) { self.setInsertionPointToEnd(&block); }) .def("set_insertion_point_after", [](mlir::OpBuilder &self, mlir::Operation &op) { self.setInsertionPointAfter(&op); }) .def( "get_insertion_block", [](mlir::OpBuilder &self) -> mlir::Block * { return self.getInsertionBlock(); }, ret::reference) .def("get_insertion_point", &mlir::OpBuilder::saveInsertionPoint) .def("restore_insertion_point", &mlir::OpBuilder::restoreInsertionPoint) // .def("set_insert_point", [](ir::builder *self, // std::pair pt) { // ir::basic_block *bb = pt.first; // ir::instruction *instr = pt.second; // if (instr) { // if (bb != instr->get_parent()) // throw std::runtime_error("invalid insertion point, instr not in // bb"); // self->set_insert_point(instr); // } else { // assert(bb); // self->set_insert_point(bb); // } // }) // Attr .def("get_bool_attr", &mlir::OpBuilder::getBoolAttr) .def("get_int32_attr", &mlir::OpBuilder::getI32IntegerAttr) // Use arith.ConstantOp to create constants // Constants .def("get_int1", [](mlir::OpBuilder &self, bool v) -> mlir::Value { auto loc = self.getUnknownLoc(); return mlir::Value(self.create( loc, v, self.getI1Type())); }) .def("get_int8", [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { auto loc = 
self.getUnknownLoc(); return mlir::Value(self.create( loc, v, self.getI8Type())); }) .def("get_int32", [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { auto loc = self.getUnknownLoc(); return mlir::Value(self.create( loc, v, self.getI32Type())); }) .def("get_int64", [](mlir::OpBuilder &self, int64_t v) -> mlir::Value { auto loc = self.getUnknownLoc(); return mlir::Value(self.create( loc, v, self.getI64Type())); }) // bfloat16 cannot be initialized as it is treated as int16 for now //.def("get_bf16", // [](mlir::OpBuilder &self, float v) -> mlir::Value { // auto loc = self.getUnknownLoc(); // auto type = self.getBF16Type(); // return self.create( // loc, // mlir::APFloat(type.getFloatSemantics(), std::to_string(v)), // type); // }) .def("get_fp16", [](mlir::OpBuilder &self, float v) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, self.getF16FloatAttr(v)); }) .def("get_fp32", [](mlir::OpBuilder &self, float v) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, self.getF32FloatAttr(v)); }) .def("get_null_value", [](mlir::OpBuilder &self, mlir::Type type) -> mlir::Value { auto loc = self.getUnknownLoc(); if (auto floatTy = type.dyn_cast()) return self.create( loc, mlir::APFloat(floatTy.getFloatSemantics(), 0), floatTy); else if (auto intTy = type.dyn_cast()) return self.create(loc, 0, intTy); else throw std::runtime_error("Not implemented"); }) .def("get_all_ones_value", [](mlir::OpBuilder &self, mlir::Type type) -> mlir::Value { auto loc = self.getUnknownLoc(); uint64_t val = 0xFFFFFFFFFFFFFFFF; if (auto intTy = type.dyn_cast()) return self.create(loc, val, intTy); else throw std::runtime_error("Not implemented"); }) // Types .def("get_void_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getNoneType(); }) .def("get_int1_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getI1Type(); }) // or ret::copy? 
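      // Type getters: these defs expose the scalar types used by the frontend
      // (void/i1 above, i8..i64, Triton's fp8, fp16/bf16/fp32/fp64), plus
      // Triton pointer types, ranked tensor ("block") types, and function
      // types, so tl.* dtypes can be materialized from Python.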
.def("get_int8_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getI8Type(); }) .def("get_int16_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getType(16); }) .def( "get_int32_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getI32Type(); }) .def( "get_int64_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getI64Type(); }) .def("get_fp8_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getType(); }) .def( "get_half_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getF16Type(); }) .def("get_bf16_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getBF16Type(); }) .def( "get_float_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getF32Type(); }) .def( "get_double_ty", [](mlir::OpBuilder &self) -> mlir::Type { return self.getF64Type(); }) .def("get_ptr_ty", [](mlir::OpBuilder &self, mlir::Type &type, int addrSpace) -> mlir::Type { return mlir::triton::PointerType::get(type, addrSpace); }) .def("get_block_ty", [](mlir::OpBuilder &self, mlir::Type &elementType, std::vector &shape) -> mlir::Type { return mlir::RankedTensorType::get(shape, elementType); }) .def("get_function_ty", [](mlir::OpBuilder &self, std::vector inTypes, std::vector outTypes) -> mlir::Type { return self.getFunctionType(inTypes, outTypes); }) // Ops .def("get_or_insert_function", [](mlir::OpBuilder &self, mlir::ModuleOp &module, std::string &funcName, mlir::Type &funcType, std::string &visibility) -> mlir::FuncOp { if (mlir::Operation *funcOperation = module.lookupSymbol(funcName)) return llvm::dyn_cast(funcOperation); auto loc = self.getUnknownLoc(); if (auto funcTy = funcType.dyn_cast()) { llvm::SmallVector attrs = { mlir::NamedAttribute(self.getStringAttr("sym_visibility"), self.getStringAttr(visibility))}; return self.create(loc, funcName, funcTy, attrs); } throw std::runtime_error("invalid function type"); }) .def( "create_block", [](mlir::OpBuilder &self) -> mlir::Block * { mlir::Region *parent = self.getBlock()->getParent(); return self.createBlock(parent); }, ret::reference) .def( "create_block_with_parent", [](mlir::OpBuilder &self, mlir::Region &parent, std::vector &argTypes) -> mlir::Block * { auto argLoc = self.getUnknownLoc(); llvm::SmallVector argLocs(argTypes.size(), argLoc); return self.createBlock(&parent, {}, argTypes, argLocs); }, ret::reference) .def( "new_block", [](mlir::OpBuilder &self) -> mlir::Block * { return new mlir::Block(); }, ret::reference) // Unstructured control flow .def("create_cond_branch", [](mlir::OpBuilder &self, mlir::Value condition, mlir::Block *trueDest, mlir::Block *falseDest) { auto loc = self.getUnknownLoc(); self.create(loc, condition, trueDest, falseDest); return; }) .def("create_branch", [](mlir::OpBuilder &self, mlir::Block *dest, std::vector &args) { auto loc = self.getUnknownLoc(); self.create(loc, dest, args); return; }) // Structured control flow .def("create_for_op", [](mlir::OpBuilder &self, mlir::Value &lb, mlir::Value &ub, mlir::Value &step, std::vector &initArgs) -> mlir::scf::ForOp { auto loc = self.getUnknownLoc(); return self.create(loc, lb, ub, step, initArgs); }) .def("create_if_op", [](mlir::OpBuilder &self, std::vector &retTypes, mlir::Value &condition, bool withElse) -> mlir::scf::IfOp { auto loc = self.getUnknownLoc(); return self.create(loc, retTypes, condition, withElse); }) .def("create_yield_op", [](mlir::OpBuilder &self, std::vector &yields) -> mlir::scf::YieldOp { auto loc = self.getUnknownLoc(); return self.create(loc, yields); }) .def("create_while_op", [](mlir::OpBuilder 
&self, std::vector &retTypes, std::vector &initArgs) -> mlir::scf::WhileOp { auto loc = self.getUnknownLoc(); return self.create(loc, retTypes, initArgs); }) .def("create_condition_op", [](mlir::OpBuilder &self, mlir::Value &cond, std::vector &args) -> mlir::scf::ConditionOp { auto loc = self.getUnknownLoc(); return self.create(loc, cond, args); }) // miscellaneous .def("create_make_range", [](mlir::OpBuilder &self, int start, int end) -> mlir::Value { auto loc = self.getUnknownLoc(); auto retType = mlir::RankedTensorType::get({end - start}, self.getI32Type()); return self.create(loc, retType, start, end); }) // Cast instructions // Conversions for custom FP types (FP8) .def("create_fp_to_fp", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) // Conversions for standard LLVM builtin types .def("create_bitcast", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) .def("create_si_to_fp", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) .def("create_ui_to_fp", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) .def("create_fp_to_si", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) .def("create_fp_to_ui", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) .def("create_fp_ext", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) .def("create_fp_trunc", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, dstType, src); }) .def("create_int_cast", [](mlir::OpBuilder &self, mlir::Value &src, mlir::Type &dstType, bool isSigned) -> mlir::Value { auto loc = self.getUnknownLoc(); // get element type if necessary mlir::Type srcType = src.getType(); auto srcTensorType = srcType.dyn_cast(); auto dstTensorType = dstType.dyn_cast(); mlir::Type srcEltType = srcType; mlir::Type dstEltType = dstType; if (dstTensorType && srcTensorType) { dstEltType = dstTensorType.getElementType(); srcEltType = srcTensorType.getElementType(); } unsigned srcWidth = srcEltType.getIntOrFloatBitWidth(); unsigned dstWidth = dstEltType.getIntOrFloatBitWidth(); if (srcWidth == dstWidth) return self.create(loc, dstType, src); else if (srcWidth > dstWidth) return self.create(loc, dstType, src); else if (isSigned) return self.create(loc, dstType, src); else return self.create(loc, dstType, src); }) .def("create_to_index", [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, input, self.getIndexType()); }) .def("create_index_to_si", [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, input, self.getI32Type()); }) .def("create_fmul", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) 
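      // Arithmetic builders: create_fmul above and the defs that follow build
      // one binary operation each at an unknown source location. In the Triton
      // sources these wrap the corresponding MLIR arith-dialect ops (MulF,
      // DivF, RemF, AddF, SubF for floats; MulI, DivSI/DivUI, RemSI/RemUI,
      // AddI, SubI and the shift ops for integers).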
.def("create_fdiv", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_frem", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_fadd", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_fsub", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_mul", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_sdiv", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_udiv", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_srem", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_urem", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_add", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_sub", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return mlir::Value( self.create(loc, lhs, rhs)); }) .def("create_shl", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return mlir::Value( self.create(loc, lhs, rhs)); }) .def("create_lshr", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return mlir::Value( self.create(loc, lhs, rhs)); }) .def("create_ashr", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return mlir::Value( self.create(loc, lhs, rhs)); }) // AddPtr (similar to GEP) .def("create_addptr", [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &offset) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, ptr.getType(), ptr, offset); }) // Comparison (int) .def("create_icmpSLE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::sle, lhs, rhs); }) .def("create_icmpSLT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::slt, lhs, rhs); }) .def("create_icmpSGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::sge, lhs, rhs); }) .def("create_icmpSGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::sgt, lhs, rhs); }) .def("create_icmpULE", [](mlir::OpBuilder &self, 
mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::ule, lhs, rhs); }) .def("create_icmpULT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::ult, lhs, rhs); }) .def("create_icmpUGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::uge, lhs, rhs); }) .def("create_icmpUGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::ugt, lhs, rhs); }) .def("create_icmpEQ", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::eq, lhs, rhs); }) .def("create_icmpNE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpIPredicate::ne, lhs, rhs); }) // Comparison (float) .def("create_fcmpOLT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::OLT, lhs, rhs); }) .def("create_fcmpOGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::OGT, lhs, rhs); }) .def("create_fcmpOLE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::OLE, lhs, rhs); }) .def("create_fcmpOGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::OGE, lhs, rhs); }) .def("create_fcmpOEQ", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::OEQ, lhs, rhs); }) .def("create_fcmpONE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::ONE, lhs, rhs); }) .def("create_fcmpULT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::ULT, lhs, rhs); }) .def("create_fcmpUGT", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::UGT, lhs, rhs); }) .def("create_fcmpULE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::ULE, lhs, rhs); }) .def("create_fcmpUGE", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::UGE, lhs, rhs); }) .def("create_fcmpUEQ", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::UEQ, lhs, rhs); }) .def("create_fcmpUNE", [](mlir::OpBuilder &self, mlir::Value &lhs, 
mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, mlir::arith::CmpFPredicate::UNE, lhs, rhs); }) // // Logical .def("create_and", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_xor", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) .def("create_or", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, lhs, rhs); }) // Input/Output .def("create_load", [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::triton::CacheModifier cacheModifier, mlir::triton::EvictionPolicy evictionPolicy, bool isVolatile) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, ptrs, cacheModifier, evictionPolicy, isVolatile); }) .def("create_store", [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &value) -> void { auto loc = self.getUnknownLoc(); self.create(loc, ptrs, value); }) .def("create_masked_load", [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &mask, std::optional &other, mlir::triton::CacheModifier cacheModifier, mlir::triton::EvictionPolicy evictionPolicy, bool isVolatile) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, ptrs, mask, other.value_or(mlir::Value()), cacheModifier, evictionPolicy, isVolatile); }) .def("create_masked_store", [](mlir::OpBuilder &self, mlir::Value &ptrs, mlir::Value &val, mlir::Value &mask) -> void { auto loc = self.getUnknownLoc(); self.create(loc, ptrs, val, mask); }) .def("create_view", [](mlir::OpBuilder &self, mlir::Value &arg, std::vector &shape) -> mlir::Value { auto loc = self.getUnknownLoc(); auto argType = arg.getType() .dyn_cast() .getElementType(); return self.create( loc, mlir::RankedTensorType::get(shape, argType), arg); }) .def( "create_expand_dims", [](mlir::OpBuilder &self, mlir::Value &arg, int axis) -> mlir::Value { auto loc = self.getUnknownLoc(); auto argType = arg.getType().dyn_cast(); auto argEltType = argType.getElementType(); std::vector retShape = argType.getShape(); retShape.insert(retShape.begin() + axis, 1); return self.create( loc, mlir::RankedTensorType::get(retShape, argEltType), arg, axis); }) .def("create_cat", [](mlir::OpBuilder &self, mlir::Value &lhs, mlir::Value &rhs) -> mlir::Value { auto loc = self.getUnknownLoc(); auto lhsType = lhs.getType().dyn_cast(); auto rhsType = rhs.getType().dyn_cast(); if (!(lhsType.getShape().size() == 1 && rhsType.getShape().size() == 1)) throw std::runtime_error( "shape not supported by cat. 
Expecting rank-1 inputs"); std::vector shape{lhsType.getShape()[0] + rhsType.getShape()[0]}; return self.create( loc, mlir::RankedTensorType::get(shape, lhsType.getElementType()), lhs, rhs); }) .def("create_trans", [](mlir::OpBuilder &self, mlir::Value &arg) -> mlir::Value { auto loc = self.getUnknownLoc(); auto argType = arg.getType().dyn_cast(); auto argEltType = argType.getElementType(); std::vector retShape = argType.getShape(); std::reverse(retShape.begin(), retShape.end()); return self.create( loc, mlir::RankedTensorType::get(retShape, argEltType), arg); }) .def("create_broadcast", [](mlir::OpBuilder &self, mlir::Value &arg, std::vector &shape) -> mlir::Value { auto loc = self.getUnknownLoc(); if (auto argType = arg.getType().dyn_cast()) return self.createOrFold( loc, mlir::RankedTensorType::get(shape, argType.getElementType()), arg); throw std::runtime_error( "arg is not of RankedTensorType, use create_splat"); }) .def("create_splat", [](mlir::OpBuilder &self, mlir::Value &arg, std::vector &shape) -> mlir::Value { auto loc = self.getUnknownLoc(); auto argType = arg.getType(); auto ret = self.createOrFold( loc, mlir::RankedTensorType::get(shape, argType), arg); return ret; }) // // atomic .def("create_atomic_cas", [](mlir::OpBuilder &self, mlir::Value &ptr, mlir::Value &cmp, mlir::Value &val) -> mlir::Value { auto loc = self.getUnknownLoc(); mlir::Type dstType; if (auto srcTensorType = ptr.getType().dyn_cast()) { mlir::Type dstElemType = srcTensorType.getElementType() .cast() .getPointeeType(); dstType = mlir::RankedTensorType::get(srcTensorType.getShape(), dstElemType); } else { auto ptrType = mlir::getElementTypeOrSelf(ptr) .cast(); dstType = ptrType.getPointeeType(); } return self.create(loc, dstType, ptr, cmp, val); }) .def("create_atomic_rmw", [](mlir::OpBuilder &self, mlir::triton::RMWOp rmwOp, mlir::Value &ptr, mlir::Value &val, mlir::Value &mask) -> mlir::Value { auto loc = self.getUnknownLoc(); mlir::Type dstType; if (auto srcTensorType = ptr.getType().dyn_cast()) { mlir::Type dstElemType = srcTensorType.getElementType() .cast() .getPointeeType(); dstType = mlir::RankedTensorType::get(srcTensorType.getShape(), dstElemType); } else { auto ptrType = mlir::getElementTypeOrSelf(ptr) .cast(); dstType = ptrType.getPointeeType(); } return self.create(loc, dstType, rmwOp, ptr, val, mask); }) // External .def("create_external_elementwise", [](mlir::OpBuilder &self, const std::string &libName, const std::string &libPath, const std::string &symbol, std::vector &argList, mlir::Type retType) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, retType, argList, libName, libPath, symbol); }) // Built-in instruction .def("create_get_program_id", [](mlir::OpBuilder &self, int axis) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, self.getI32Type(), self.getI32IntegerAttr(axis)); }) .def("create_get_num_programs", [](mlir::OpBuilder &self, int axis) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create( loc, self.getI32Type(), self.getI32IntegerAttr(axis)); }) .def("create_dot", [](mlir::OpBuilder &self, mlir::Value &a, mlir::Value &b, mlir::Value &c, bool allowTF32) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, c.getType(), a, b, c, allowTF32); }) .def("create_exp", [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, val); }) .def("create_cos", [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { auto loc = self.getUnknownLoc(); 
return self.create(loc, val); }) .def("create_sin", [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, val); }) .def("create_log", [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, val); }) .def("create_sqrt", [](mlir::OpBuilder &self, mlir::Value &val) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, val); }) .def("create_reduce", [](mlir::OpBuilder &self, mlir::Value &operand, mlir::triton::RedOp redOp, int axis) -> mlir::Value { auto loc = self.getUnknownLoc(); auto inputTensorType = operand.getType().dyn_cast(); std::vector shape = inputTensorType.getShape(); shape.erase(shape.begin() + axis); bool withIndex = mlir::triton::ReduceOp::withIndex(redOp); mlir::Type resType = withIndex ? self.getI32Type() : inputTensorType.getElementType(); if (!shape.empty()) { resType = mlir::RankedTensorType::get(shape, resType); } return self.create(loc, resType, redOp, operand, axis); }) .def("create_ptr_to_int", [](mlir::OpBuilder &self, mlir::Value &val, mlir::Type &type) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, type, val); }) .def("create_int_to_ptr", [](mlir::OpBuilder &self, mlir::Value &val, mlir::Type &type) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, type, val); }) .def("create_select", [](mlir::OpBuilder &self, mlir::Value &condition, mlir::Value &trueValue, mlir::Value &falseValue) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create(loc, condition, trueValue, falseValue); }) .def("create_printf", [](mlir::OpBuilder &self, const std::string &prefix, const std::vector &values) -> void { auto loc = self.getUnknownLoc(); self.create( loc, mlir::StringAttr::get(self.getContext(), llvm::StringRef(prefix)), values); }) // Undef .def("create_undef", [](mlir::OpBuilder &self, mlir::Type &type) -> mlir::Value { auto loc = self.getUnknownLoc(); return self.create<::mlir::LLVM::UndefOp>(loc, type); }) // Force GPU barrier .def("create_barrier", [](mlir::OpBuilder &self) { auto loc = self.getUnknownLoc(); self.create(loc); }); py::class_(m, "pass_manager") .def(py::init()) .def("enable_debug", [](mlir::PassManager &self) { auto printingFlags = mlir::OpPrintingFlags(); printingFlags.elideLargeElementsAttrs(16); self.enableIRPrinting( /*shouldPrintBeforePass=*/nullptr, /*shouldPrintAfterPass=*/ [](mlir::Pass *pass, mlir::Operation *) { return ::triton::tools::getBoolEnv("MLIR_ENABLE_DUMP"); }, /*printModuleScope=*/false, /*printAfterOnlyOnChange=*/true, /*printAfterOnlyOnFailure*/ false, llvm::dbgs(), printingFlags); }) .def("run", [](mlir::PassManager &self, mlir::ModuleOp &mod) { // TODO: maybe dump module to file and print error for better // diagnostics if (mlir::failed(self.run(mod.getOperation()))) throw std::runtime_error("PassManager::run failed"); }) .def( "add_sccp_pass", [](mlir::PassManager &self) { self.addPass(mlir::createSCCPPass()); }) .def("add_coalesce_pass", [](mlir::PassManager &self) { self.addPass(mlir::createTritonGPUCoalescePass()); }) .def("add_symbol_dce_pass", [](mlir::PassManager &self) { self.addPass(mlir::createSymbolDCEPass()); }) .def("add_inliner_pass", [](mlir::PassManager &self) { self.addPass(mlir::createInlinerPass()); }) .def("add_canonicalizer_pass", [](mlir::PassManager &self) { self.addPass(mlir::createCanonicalizerPass()); }) .def("add_cse_pass", [](mlir::PassManager &self) { self.addPass(mlir::createCSEPass()); }) 
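      // The defs below register loop-invariant code motion plus the
      // Triton-specific pipeline stages: TritonIR combine, the
      // Triton->TritonGPU conversion, TritonGPU pipelining, prefetching and
      // layout optimizations, and the final TritonGPU->LLVM and SCF->CFG
      // lowerings driven from the Python compiler.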
.def("add_licm_pass", [](mlir::PassManager &self) { self.addPass(mlir::createLoopInvariantCodeMotionPass()); }) .def("add_triton_combine_pass", [](mlir::PassManager &self) { self.addPass(mlir::triton::createCombineOpsPass()); }) .def("add_convert_triton_to_tritongpu_pass", [](mlir::PassManager &self, int numWarps) { self.addPass( mlir::triton::createConvertTritonToTritonGPUPass(numWarps)); }) .def("add_tritongpu_pipeline_pass", [](mlir::PassManager &self, int numStages) { self.addPass(mlir::createTritonGPUPipelinePass(numStages)); }) .def("add_tritongpu_prefetch_pass", [](mlir::PassManager &self) { self.addPass(mlir::createTritonGPUPrefetchPass()); }) .def("add_tritongpu_combine_pass", [](mlir::PassManager &self, int computeCapability) { self.addPass( mlir::createTritonGPUCombineOpsPass(computeCapability)); }) .def("add_tritongpu_update_mma_for_volta_pass", [](mlir::PassManager &self) { self.addPass(mlir::createTritonGPUUpdateMmaForVoltaPass()); }) .def("add_tritongpu_reorder_instructions_pass", [](mlir::PassManager &self) { self.addPass(mlir::createTritonGPUReorderInstructionsPass()); }) .def("add_tritongpu_decompose_conversions_pass", [](mlir::PassManager &self) { self.addPass(mlir::createTritonGPUDecomposeConversionsPass()); }) .def("add_triton_gpu_to_llvm", [](mlir::PassManager &self) { self.addPass(mlir::triton::createConvertTritonGPUToLLVMPass()); }) .def("add_scf_to_cfg", [](mlir::PassManager &self) { self.addPass(mlir::createLowerToCFGPass()); }); } void init_triton_translation(py::module &m) { using ret = py::return_value_policy; m.def("get_shared_memory_size", [](mlir::ModuleOp mod) { auto shared = mod->getAttrOfType("triton_gpu.shared"); return shared.getInt(); }); m.def( "translate_triton_gpu_to_llvmir", [](mlir::ModuleOp op, int computeCapability) { py::gil_scoped_release allow_threads; llvm::LLVMContext llvmContext; auto llvmModule = ::mlir::triton::translateTritonGPUToLLVMIR( &llvmContext, op, computeCapability); if (!llvmModule) llvm::report_fatal_error("Failed to translate TritonGPU to LLVM IR."); std::string str; llvm::raw_string_ostream os(str); llvmModule->print(os, nullptr); os.flush(); return str; }, ret::take_ownership); m.def( "translate_llvmir_to_ptx", [](const std::string llvmIR, int capability, int version) -> std::string { py::gil_scoped_release allow_threads; // create LLVM module from C++ llvm::LLVMContext context; std::unique_ptr buffer = llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str()); llvm::SMDiagnostic error; std::unique_ptr module = llvm::parseIR(buffer->getMemBufferRef(), error, context); if (!module) { llvm::report_fatal_error( "failed to parse IR: " + error.getMessage() + "lineno: " + std::to_string(error.getLineNo())); } // translate module to PTX auto ptxCode = triton::translateLLVMIRToPTX(*module, capability, version); return ptxCode; }, ret::take_ownership); m.def("compile_ptx_to_cubin", [](const std::string &ptxCode, const std::string &ptxasPath, int capability) -> py::object { py::gil_scoped_release allow_threads; // compile ptx with ptxas llvm::SmallString<64> fsrc; llvm::SmallString<64> flog; llvm::sys::fs::createTemporaryFile("compile-ptx-src", "", fsrc); llvm::sys::fs::createTemporaryFile("compile-ptx-log", "", flog); std::string fbin = std::string(fsrc) + ".o"; llvm::FileRemover srcRemover(fsrc); llvm::FileRemover logRemover(flog); llvm::FileRemover binRemover(fbin); const char *_fsrc = fsrc.c_str(); const char *_flog = flog.c_str(); const char *_fbin = fbin.c_str(); std::ofstream ofs(_fsrc); ofs << ptxCode << std::endl; ofs.close(); 
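    // Invoke ptxas on the temporary .ptx file, targeting the requested
    // compute capability; stderr is redirected to a temporary log file so the
    // ptxas output can be surfaced in the error message if compilation fails.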
std::string cmd; int err; cmd = ptxasPath + " -v --gpu-name=sm_" + std::to_string(capability) + " " + _fsrc + " -o " + _fsrc + ".o 2> " + _flog; err = system(cmd.c_str()); if (err != 0) { std::ifstream _log(_flog); std::string log(std::istreambuf_iterator(_log), {}); throw std::runtime_error("Internal Triton PTX codegen error: \n" + log); } std::ifstream _cubin(_fbin, std::ios::binary); std::string cubin(std::istreambuf_iterator(_cubin), {}); _cubin.close(); py::bytes bytes(cubin); return std::move(bytes); }); m.def("add_external_libs", [](mlir::ModuleOp &op, const std::vector &names, const std::vector &paths) { ::mlir::triton::addExternalLibs(op, names, paths); }); } void init_triton(py::module &m) { py::module subm = m.def_submodule("triton"); // init_triton_codegen(subm.def_submodule("code_gen")); init_triton_runtime(subm.def_submodule("runtime")); init_triton_ir(subm.def_submodule("ir")); init_triton_translation(subm); } triton-2.0.0/python/test/000077500000000000000000000000001440023377100153315ustar00rootroot00000000000000triton-2.0.0/python/test/regression/000077500000000000000000000000001440023377100175115ustar00rootroot00000000000000triton-2.0.0/python/test/regression/test_performance.py000066400000000000000000000141211440023377100234220ustar00rootroot00000000000000import subprocess import sys import pytest import torch import triton import triton.language as tl from triton.testing import get_dram_gbps, get_max_tensorcore_tflops DEVICE_NAME = 'v100' ####################### # Utilities ####################### def nvsmi(attrs): attrs = ','.join(attrs) cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits'] out = subprocess.check_output(cmd) ret = out.decode(sys.stdout.encoding).split(',') ret = [int(x) for x in ret] return ret ####################### # Matrix Multiplication ####################### sm_clocks = {'v100': 1350, 'a100': 1350} mem_clocks = {'v100': 877, 'a100': 1215} matmul_data = { 'v100': { # square (256, 256, 256): {'float16': 0.027}, (512, 512, 512): {'float16': 0.158}, (1024, 1024, 1024): {'float16': 0.466}, (2048, 2048, 2048): {'float16': 0.695}, (4096, 4096, 4096): {'float16': 0.831}, (8192, 8192, 8192): {'float16': 0.849}, # tall-skinny (16, 1024, 1024): {'float16': 0.0128}, (16, 4096, 4096): {'float16': 0.0883}, (16, 8192, 8192): {'float16': 0.101}, (64, 1024, 1024): {'float16': 0.073}, (64, 4096, 4096): {'float16': 0.270}, (64, 8192, 8192): {'float16': 0.459}, (1024, 64, 1024): {'float16': 0.0692}, (4096, 64, 4096): {'float16': 0.264}, (8192, 64, 8192): {'float16': 0.452}, }, 'a100': { (256, 256, 256): {'float16': 0.010, 'float32': 0.0214, 'int8': 0.006}, (512, 512, 512): {'float16': 0.061, 'float32': 0.109, 'int8': 0.030}, (1024, 1024, 1024): {'float16': 0.287, 'float32': 0.331, 'int8': 0.169}, (2048, 2048, 2048): {'float16': 0.604, 'float32': 0.599, 'int8': 0.385}, (4096, 4096, 4096): {'float16': 0.842, 'float32': 0.862, 'int8': 0.711}, (8192, 8192, 8192): {'float16': 0.896, 'float32': 0.932, 'int8': 0.860}, # tall-skinny (16, 1024, 1024): {'float16': 0.0077, 'float32': 0.0127, 'int8': 0.005}, (16, 4096, 4096): {'float16': 0.0363, 'float32': 0.0457, 'int8': 0.0259}, (16, 8192, 8192): {'float16': 0.0564, 'float32': 0.0648, 'int8': 0.0431}, (64, 1024, 1024): {'float16': 0.0271, 'float32': 0.0509, 'int8': 0.0169}, (64, 4096, 4096): {'float16': 0.141, 'float32': 0.162, 'int8': 0.097}, (64, 8192, 8192): {'float16': 0.244, 'float32': 0.257, 'int8': 0.174}, (1024, 64, 1024): {'float16': 0.0263, 'float32': 0.0458, 'int8': 0.017}, 
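        # Each entry maps (M, N, K) -> expected fraction of peak tensor-core
        # throughput per dtype; test_matmul below checks the measured
        # utilization against these values to two decimal places.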
(4096, 64, 4096): {'float16': 0.135, 'float32': 0.177, 'int8': 0.102}, (8192, 64, 8192): {'float16': 0.216, 'float32': 0.230, 'int8': 0.177}, } # # deep reductions # (64 , 64 , 16384) : {'a100': 0.}, # (64 , 64 , 65536) : {'a100': 0.}, # (256 , 256 , 8192 ) : {'a100': 0.}, # (256 , 256 , 32768) : {'a100': 0.}, } @pytest.mark.parametrize('M, N, K, dtype_str', [(M, N, K, dtype_str) for M, N, K in matmul_data[DEVICE_NAME].keys() for dtype_str in ['float16']]) def test_matmul(M, N, K, dtype_str): if dtype_str in ['float32', 'int8'] and DEVICE_NAME != 'a100': pytest.skip('Only test float32 & int8 on a100') dtype = {'float16': torch.float16, 'float32': torch.float32, 'int8': torch.int8}[dtype_str] torch.manual_seed(0) ref_gpu_util = matmul_data[DEVICE_NAME][(M, N, K)][dtype_str] cur_sm_clock = nvsmi(['clocks.current.sm'])[0] ref_sm_clock = sm_clocks[DEVICE_NAME] max_gpu_perf = get_max_tensorcore_tflops(dtype, clock_rate=cur_sm_clock * 1e3) assert abs(cur_sm_clock - ref_sm_clock) < 10, f'GPU SMs must run at {ref_sm_clock} MHz' if dtype == torch.int8: a = torch.randint(-128, 127, (M, K), dtype=dtype, device='cuda') b = torch.randint(-128, 127, (N, K), dtype=dtype, device='cuda') b = b.t() # only test row-col layout else: a = torch.randn((M, K), dtype=dtype, device='cuda') b = torch.randn((K, N), dtype=dtype, device='cuda') fn = lambda: triton.ops.matmul(a, b) ms = triton.testing.do_bench(fn, percentiles=None, warmup=25, rep=1000) cur_gpu_perf = 2. * M * N * K / ms * 1e-9 cur_gpu_util = cur_gpu_perf / max_gpu_perf triton.testing.assert_almost_equal(cur_gpu_util, ref_gpu_util, decimal=2) ####################### # Element-Wise ####################### @triton.jit def _add(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis=0) block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements x = tl.load(x_ptr + offsets, mask=mask) y = tl.load(y_ptr + offsets, mask=mask) output = x + y tl.store(output_ptr + offsets, output, mask=mask) elementwise_data = { 'v100': { 1024 * 16: 0.0219, 1024 * 64: 0.0791, 1024 * 256: 0.243, 1024 * 1024: 0.530, 1024 * 4096: 0.796, 1024 * 16384: 0.905, 1024 * 65536: 0.939, }, 'a100': { 1024 * 16: 0.008, 1024 * 64: 0.034, 1024 * 256: 0.114, 1024 * 1024: 0.315, 1024 * 4096: 0.580, 1024 * 16384: 0.782, 1024 * 65536: 0.850, } } @pytest.mark.parametrize('N', elementwise_data[DEVICE_NAME].keys()) def test_elementwise(N): torch.manual_seed(0) ref_gpu_util = elementwise_data[DEVICE_NAME][N] cur_mem_clock = nvsmi(['clocks.current.memory'])[0] ref_mem_clock = mem_clocks[DEVICE_NAME] max_gpu_perf = get_dram_gbps() assert abs(cur_mem_clock - ref_mem_clock) < 10, f'GPU memory must run at {ref_mem_clock} MHz' z = torch.empty((N, ), dtype=torch.float16, device='cuda') x = torch.randn_like(z) y = torch.randn_like(z) grid = lambda args: (triton.cdiv(N, args['BLOCK_SIZE']), ) fn = lambda: _add[grid](x, y, z, N, BLOCK_SIZE=1024) ms = triton.testing.do_bench(fn, percentiles=None, warmup=25, rep=250) cur_gpu_perf = 3. 
* N * z.element_size() / ms * 1e-6 cur_gpu_util = cur_gpu_perf / max_gpu_perf triton.testing.assert_almost_equal(cur_gpu_util, ref_gpu_util, decimal=2) triton-2.0.0/python/test/unit/000077500000000000000000000000001440023377100163105ustar00rootroot00000000000000triton-2.0.0/python/test/unit/language/000077500000000000000000000000001440023377100200735ustar00rootroot00000000000000triton-2.0.0/python/test/unit/language/printf_helper.py000066400000000000000000000027041440023377100233110ustar00rootroot00000000000000import torch from torch.testing import assert_close import triton import triton.language as tl torch_type = { "bool": torch.bool, 'int8': torch.int8, 'uint8': torch.uint8, 'int16': torch.int16, "int32": torch.int32, 'int64': torch.long, 'float16': torch.float16, 'bfloat16': torch.bfloat16, "float32": torch.float32, "float64": torch.float64 } def get_tensor(shape, data_type, b_positive=False): x = None if data_type.startswith('int'): x = torch.arange(0, shape[0], dtype=torch_type[data_type], device='cuda') else: x = torch.arange(0, shape[0], dtype=torch_type[data_type], device='cuda') return x # @pytest.mark.parametrize('data_type', # [("int8"), # ('int16'), # ('int32'), # ("int64"), # ('float16'), # ("float32"), # ("float64")]) def printf(data_type): @triton.jit def kernel(X, Y, BLOCK: tl.constexpr): x = tl.load(X + tl.arange(0, BLOCK)) tl.printf("", x) tl.store(Y + tl.arange(0, BLOCK), x) shape = (128, ) # limit the range of integers so that the sum does not overflow x = get_tensor(shape, data_type) y = torch.zeros(shape, dtype=x.dtype, device="cuda") kernel[(1,)](x, y, BLOCK=shape[0]) assert_close(y, x) printf("float16") printf("int8") triton-2.0.0/python/test/unit/language/test_core.py000066400000000000000000002141461440023377100224440ustar00rootroot00000000000000# flake8: noqa: F821,F841 import itertools import os import re from typing import Optional, Union import numpy as np import pytest import torch from numpy.random import RandomState import triton import triton._C.libtriton.triton as _triton import triton.language as tl from triton.runtime.jit import JITFunction, TensorWrapper, reinterpret int_dtypes = ['int8', 'int16', 'int32', 'int64'] uint_dtypes = ['uint8', 'uint16', 'uint32', 'uint64'] float_dtypes = ['float16', 'float32', 'float64'] dtypes = int_dtypes + uint_dtypes + float_dtypes dtypes_with_bfloat16 = dtypes + ['bfloat16'] torch_dtypes = ['bool'] + int_dtypes + ['uint8'] + float_dtypes + ['bfloat16'] def _bitwidth(dtype: str) -> int: # ex.: "int64" -> 64 return int(re.search(r'(\d+)$', dtype).group(1)) def numpy_random(shape, dtype_str, rs: Optional[RandomState] = None, low=None, high=None): """ Override `rs` if you're calling this function twice and don't want the same result for both calls. """ if isinstance(shape, int): shape = (shape, ) if rs is None: rs = RandomState(seed=17) if dtype_str in int_dtypes + uint_dtypes: iinfo = np.iinfo(getattr(np, dtype_str)) low = iinfo.min if low is None else max(low, iinfo.min) high = iinfo.max if high is None else min(high, iinfo.max) dtype = getattr(np, dtype_str) x = rs.randint(low, high, shape, dtype=dtype) x[x == 0] = 1 # Hack. Never return zero so tests of division don't error out. 
return x elif dtype_str in float_dtypes: return rs.normal(0, 1, shape).astype(dtype_str) elif dtype_str == 'bfloat16': return (rs.normal(0, 1, shape).astype('float32').view('uint32') & np.uint32(0xffff0000)).view('float32') elif dtype_str in ['bool', 'int1', 'bool_']: return rs.normal(0, 1, shape) > 0.0 else: raise RuntimeError(f'Unknown dtype {dtype_str}') def to_triton(x: np.ndarray, device='cuda', dst_type=None) -> Union[TensorWrapper, torch.Tensor]: ''' Note: We need dst_type because the type of x can be different from dst_type. For example: x is of type `float32`, dst_type is `bfloat16`. If dst_type is None, we infer dst_type from x. ''' t = x.dtype.name if t in uint_dtypes: signed_type_name = t.lstrip('u') # e.g. "uint16" -> "int16" x_signed = x.astype(getattr(np, signed_type_name)) return reinterpret(torch.tensor(x_signed, device=device), getattr(tl, t)) else: if t == 'float32' and dst_type == 'bfloat16': return torch.tensor(x, device=device).bfloat16() return torch.tensor(x, device=device) def torch_dtype_name(dtype) -> str: if isinstance(dtype, triton.language.dtype): return dtype.name elif isinstance(dtype, torch.dtype): # 'torch.int64' -> 'int64' m = re.match(r'^torch\.(\w+)$', str(dtype)) return m.group(1) else: raise TypeError(f'not a triton or torch dtype: {type(dtype)}') def to_numpy(x): if isinstance(x, TensorWrapper): return x.base.cpu().numpy().astype(getattr(np, torch_dtype_name(x.dtype))) elif isinstance(x, torch.Tensor): if x.dtype is torch.bfloat16: return x.cpu().float().numpy() return x.cpu().numpy() else: raise ValueError(f"Not a triton-compatible tensor: {x}") def patch_kernel(template, to_replace): kernel = triton.JITFunction(template.fn) for key, value in to_replace.items(): kernel.src = kernel.src.replace(key, value) return kernel def check_type_supported(dtype): ''' skip test if dtype is not supported on the current device ''' cc = torch.cuda.get_device_capability() if cc[0] < 8 and (dtype is tl.bfloat16 or dtype == "bfloat16" or dtype is torch.bfloat16): pytest.skip("bfloat16 is only supported on NVGPU with cc >= 80") @pytest.mark.parametrize("dtype_x", list(dtypes) + ["bfloat16"]) def test_empty_kernel(dtype_x, device='cuda'): SIZE = 128 @triton.jit def kernel(X, SIZE: tl.constexpr): pass check_type_supported(dtype_x) x = to_triton(numpy_random(SIZE, dtype_str=dtype_x), device=device, dst_type=dtype_x) kernel[(1, )](x, SIZE=SIZE, num_warps=4) # generic test functions def _test_unary(dtype_x, expr, numpy_expr=None, device='cuda'): check_type_supported(dtype_x) # early return if dtype_x is not supported SIZE = 128 # define the kernel / launch-grid @triton.jit def kernel(Z, X, SIZE: tl.constexpr): off = tl.arange(0, SIZE) x = tl.load(X + off) z = GENERATE_TEST_HERE tl.store(Z + off, z) kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': expr}) # inputs x = numpy_random(SIZE, dtype_str=dtype_x) if 'log' in expr: x = np.abs(x) + 0.01 # reference result z_ref = eval(expr if numpy_expr is None else numpy_expr) # triton result x_tri = to_triton(x, device=device, dst_type=dtype_x) z_tri = to_triton(np.empty_like(z_ref), device=device, dst_type=dtype_x) kernel[(1, )](z_tri, x_tri, SIZE=SIZE, num_warps=4) # compare np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) def _binary_op_dtype_override(a: str, b: str) -> Optional[np.dtype]: """ Given two dtype strings, returns the numpy dtype Triton thinks binary operations on the two types should return. Returns None if the return value matches numpy. 
This is generally needed because Triton and pytorch return narrower floating point types than numpy in mixed operations, and because Triton follows C/C++ semantics around mixed signed/unsigned operations, and numpy/pytorch do not. """ overrides = { ('float16', 'int16'): np.float16, ('float16', 'int32'): np.float16, ('float16', 'int64'): np.float16, ('float16', 'uint16'): np.float16, ('float16', 'uint32'): np.float16, ('float16', 'uint64'): np.float16, ('int8', 'uint8'): np.uint8, ('int8', 'uint16'): np.uint16, ('int8', 'uint32'): np.uint32, ('int8', 'uint64'): np.uint64, ('int16', 'uint16'): np.uint16, ('int16', 'uint32'): np.uint32, ('int16', 'uint64'): np.uint64, ('int32', 'uint32'): np.uint32, ('int32', 'uint64'): np.uint64, ('int64', 'uint64'): np.uint64, } key = (a, b) if a < b else (b, a) return overrides.get(key) def _test_binary(dtype_x, dtype_y, expr, numpy_expr=None, mode_x='real', mode_y='real', device='cuda', y_low=None, y_high=None): check_type_supported(dtype_x) # early return if dtype_x is not supported check_type_supported(dtype_y) SIZE = 128 # define the kernel / launch-grid @triton.jit def kernel(Z, X, Y, SIZE: tl.constexpr): off = tl.arange(0, SIZE) x = tl.load(X + off) y = tl.load(Y + off) z = GENERATE_TEST_HERE tl.store(Z + off, z) kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': expr}) # inputs rs = RandomState(17) x = numpy_random(SIZE, dtype_str=dtype_x, rs=rs) y = numpy_random(SIZE, dtype_str=dtype_y, rs=rs, low=y_low, high=y_high) if mode_x == 'nan': x[:] = float('nan') if mode_y == 'nan': y[:] = float('nan') # reference result z_ref = eval(expr if numpy_expr is None else numpy_expr) dtype_z = _binary_op_dtype_override(dtype_x, dtype_y) if dtype_z is not None: z_ref = z_ref.astype(dtype_z) # triton result x_tri = to_triton(x, device=device, dst_type=dtype_x) y_tri = to_triton(y, device=device, dst_type=dtype_y) z_tri = to_triton(np.empty(SIZE, dtype=z_ref.dtype), device=device) kernel[(1, )](z_tri, x_tri, y_tri, SIZE=SIZE, num_warps=4) np.testing.assert_allclose(z_ref, to_numpy(z_tri), err_msg=expr, rtol=0.01) def _mod_operation_ill_conditioned(dtype_x, dtype_y) -> bool: # The result of x % y is ill-conditioned if x % y is much smaller than x. # pytorch/CUDA has slightly different (probably better) rounding on # remainders than stock LLVM. We currently don't expect to match it # bit-for-bit. return (dtype_x, dtype_y) in [ ('int32', 'bfloat16'), ('int32', 'float16'), ('int32', 'float32'), ('int64', 'bfloat16'), ('int64', 'float16'), ('int64', 'float32'), ('int64', 'float64'), ('uint16', 'bfloat16'), ('uint16', 'float16'), ('uint16', 'float32'), ('uint32', 'bfloat16'), ('uint32', 'float16'), ('uint32', 'float32'), ('uint64', 'bfloat16'), ('uint64', 'float16'), ('uint64', 'float32'), ('uint64', 'float64'), ] # --------------- # test binary ops # --------------- @pytest.mark.parametrize("dtype_x, dtype_y, op", [ (dtype_x, dtype_y, op) for op in ['+', '-', '*', '/', '%'] for dtype_x in dtypes_with_bfloat16 for dtype_y in dtypes_with_bfloat16 ]) def test_bin_op(dtype_x, dtype_y, op, device='cuda'): expr = f' x {op} y' if op == '%' and dtype_x in int_dtypes + uint_dtypes and dtype_y in int_dtypes + uint_dtypes: # LLVM has 'numpy.fmod', not 'numpy.remainder', semantics on integer remainders. numpy_expr = 'np.fmod(x, y)' elif op in ('/', '%') and dtype_x in ('int16', 'float16', 'bfloat16') and dtype_y in ('int16', 'float16', 'bfloat16'): # Triton promotes 16-bit floating-point / and % to 32-bit because there # are no native div or FRem operations on float16. 
Since we have to # convert anyway, we may as well take the accuracy bump. numpy_expr = f'x.astype(np.float32) {op} y.astype(np.float32)' elif (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' else: numpy_expr = None if op == '%' and _mod_operation_ill_conditioned(dtype_x, dtype_y): with pytest.raises(AssertionError, match='Not equal to tolerance'): _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) elif (op in ('%', '/') and ((dtype_x in int_dtypes and dtype_y in uint_dtypes) or (dtype_x in uint_dtypes and dtype_y in int_dtypes))): with pytest.raises(triton.CompilationError) as exc_info: _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) assert re.match('Cannot use .* because they have different signedness', str(exc_info.value.__cause__)) else: _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) @pytest.mark.parametrize("dtype_x, dtype_y", [(dtype_x, dtype_y) for dtype_x in int_dtypes for dtype_y in int_dtypes] + [(dtype_x, dtype_y) for dtype_x in uint_dtypes for dtype_y in uint_dtypes] ) def test_floordiv(dtype_x, dtype_y, device='cuda'): # Triton has IEEE, not numpy/torch, semantics for %, and those carry # through to //, so we have to use a nonstandard expression to get a # reference result for //. expr = 'x // y' numpy_expr = '((x - np.fmod(x, y)) / y)' _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) # --------------- # test bitwise ops # --------------- @pytest.mark.parametrize("dtype_x, dtype_y, op", [ (dtype_x, dtype_y, op) for op in ['&', '|', '^'] for dtype_x in dtypes + dtypes_with_bfloat16 for dtype_y in dtypes + dtypes_with_bfloat16 ]) def test_bitwise_op(dtype_x, dtype_y, op, device='cuda'): expr = f'x {op} y' if (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' else: numpy_expr = None if 'float' in dtype_x + dtype_y: with pytest.raises(triton.CompilationError) as exc_info: _test_binary(dtype_x, dtype_y, expr, numpy_expr='np.array([])', device=device) # The CompilationError must have been caused by a C++ exception with this text. 
assert re.match('invalid operands of type', str(exc_info.value.__cause__)) else: _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device) @pytest.mark.parametrize("dtype_x, dtype_y, op", [ (dtype_x, dtype_y, op) for op in ['<<', '>>'] for dtype_x in int_dtypes + uint_dtypes for dtype_y in int_dtypes + uint_dtypes ]) def test_shift_op(dtype_x, dtype_y, op, device='cuda'): expr = f'x {op} y' bw = max(_bitwidth(dtype_x), _bitwidth(dtype_y)) if dtype_x.startswith('int'): dtype_z = f'int{bw}' else: dtype_z = f'uint{bw}' numpy_expr = f'x.astype(np.{dtype_z}) {op} y.astype(np.{dtype_z})' _test_binary(dtype_x, dtype_y, expr, numpy_expr, device=device, y_low=0, y_high=65) # --------------- # test compare ops # --------------- ops = ['==', '!=', '>', '<', '>=', '<='] @pytest.mark.parametrize("dtype_x, dtype_y, op, mode_x, mode_y", # real [ (dtype_x, dtype_y, op, 'real', 'real') for op in ops for dtype_x in dtypes for dtype_y in dtypes ] + # NaNs [('float32', 'float32', op, mode_x, mode_y) for op in ops for mode_x, mode_y in [('nan', 'real'), ('real', 'nan'), ('nan', 'nan')] ]) def test_compare_op(dtype_x, dtype_y, op, mode_x, mode_y, device='cuda'): expr = f'x {op} y' if (dtype_x in uint_dtypes and dtype_y in int_dtypes and _bitwidth(dtype_x) >= _bitwidth(dtype_y)): numpy_expr = f'x.astype(np.{dtype_x}) {op} y.astype(np.{dtype_x})' elif (dtype_y in uint_dtypes and dtype_x in int_dtypes and _bitwidth(dtype_y) >= _bitwidth(dtype_x)): numpy_expr = f'x.astype(np.{dtype_y}) {op} y.astype(np.{dtype_y})' else: numpy_expr = None _test_binary(dtype_x, dtype_y, expr, numpy_expr, mode_x=mode_x, mode_y=mode_y, device=device) # --------------- # test where # --------------- @pytest.mark.parametrize("dtype", dtypes_with_bfloat16 + ["*int32"]) def test_where(dtype): select_ptrs = False if dtype == "*int32": dtype = "int64" select_ptrs = True check_type_supported(dtype) @triton.jit def where_kernel(cond_ptr, a_ptr, b_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr, TEST_POINTERS: tl.constexpr): offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements decide = tl.load(cond_ptr + offsets, mask=mask) if TEST_POINTERS: a = tl.load(a_ptr + offsets, mask=mask).to(tl.pi32_t) b = tl.load(b_ptr + offsets, mask=mask).to(tl.pi32_t) else: a = tl.load(a_ptr + offsets, mask=mask) b = tl.load(b_ptr + offsets, mask=mask) output = tl.where(decide, a, b) tl.store(output_ptr + offsets, output, mask=mask) SIZE = 1_000 rs = RandomState(17) cond = numpy_random(SIZE, 'bool', rs) x = numpy_random(SIZE, dtype_str=dtype, rs=rs) y = numpy_random(SIZE, dtype_str=dtype, rs=rs) z = np.where(cond, x, y) cond_tri = to_triton(cond, device='cuda') x_tri = to_triton(x, device='cuda', dst_type=dtype) y_tri = to_triton(y, device='cuda', dst_type=dtype) z_tri = to_triton(np.empty(SIZE, dtype=z.dtype), device='cuda', dst_type=dtype) grid = lambda meta: (triton.cdiv(SIZE, meta['BLOCK_SIZE']),) where_kernel[grid](cond_tri, x_tri, y_tri, z_tri, SIZE, BLOCK_SIZE=1024, TEST_POINTERS=select_ptrs) assert (z == to_numpy(z_tri)).all() def test_where_broadcast(): @triton.jit def where_kernel(cond_ptr, a_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): xoffsets = tl.arange(0, BLOCK_SIZE)[:, None] yoffsets = tl.arange(0, BLOCK_SIZE)[None, :] mask = tl.load(cond_ptr + yoffsets) vals = tl.load(a_ptr + yoffsets + BLOCK_SIZE * xoffsets) res = tl.where(mask, vals, 0.) 
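# Descriptive comment (added for clarity; not in the original file): at this
# point `res` holds `vals` where the broadcast row mask is true and 0.
# elsewhere — `mask` was loaded with shape (1, BLOCK_SIZE) while `vals` has
# shape (BLOCK_SIZE, BLOCK_SIZE), so tl.where broadcasts the mask across rows
# before selecting, mirroring the np.where reference used later in the test.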
tl.store(out_ptr + yoffsets + BLOCK_SIZE * xoffsets, res) @triton.jit def where_scalar_condition(a_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): xoffsets = tl.arange(0, BLOCK_SIZE)[:, None] yoffsets = tl.arange(0, BLOCK_SIZE)[None, :] mask = 0 vals = tl.load(a_ptr + yoffsets + BLOCK_SIZE * xoffsets) res = tl.where(mask, vals, 0.) tl.store(out_ptr + yoffsets + BLOCK_SIZE * xoffsets, res) SIZE = 32 dtype = 'float32' rs = RandomState(17) x = numpy_random((SIZE, SIZE), dtype_str=dtype, rs=rs) mask = numpy_random(SIZE, 'bool', rs=rs) z = np.where(mask, x, 0) cond_tri = to_triton(mask, device="cuda") x_tri = to_triton(x, device='cuda', dst_type=dtype) z_tri = to_triton(np.empty((SIZE, SIZE), dtype=z.dtype), device='cuda', dst_type=dtype) where_kernel[(1,)](cond_tri, x_tri, z_tri, SIZE) assert (z == to_numpy(z_tri)).all() where_scalar_condition[(1,)](x_tri, z_tri, SIZE) z = np.where(0, x, 0) assert (z == to_numpy(z_tri)).all() # --------------- # test unary ops # --------------- @pytest.mark.parametrize("dtype_x, expr", [ (dtype_x, ' -x') for dtype_x in dtypes_with_bfloat16 ] + [ (dtype_x, ' ~x') for dtype_x in int_dtypes ]) def test_unary_op(dtype_x, expr, device='cuda'): _test_unary(dtype_x, expr, device=device) # ---------------- # test math ops # ---------------- @pytest.mark.parametrize("expr", [ 'exp', 'log', 'cos', 'sin' ]) def test_math_op(expr, device='cuda'): _test_unary('float32', f'tl.{expr}(x)', f'np.{expr}(x) ', device=device) # ---------------- # test indexing # ---------------- def make_ptr_str(name, shape): rank = len(shape) offsets = [] stride = 1 for i in reversed(range(rank)): idx = ', '.join([':' if ii == i else 'None' for ii in range(rank)]) offsets += [f'tl.arange(0, {shape[i]})[{idx}]*{stride}'] stride *= shape[i] return f"{name} + {' + '.join(offsets)}" # TODO: handle `%4 = triton_gpu.convert_layout %3 : (tensor<32xi32, #blocked0>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>`` @pytest.mark.parametrize("expr, dtype_str", [ (f'x[{s}]', d) for s in ['None, :', ':, None', 'None, :, :', ':, :, None'] for d in ['int32', 'uint32', 'uint16'] ]) def test_index1d(expr, dtype_str, device='cuda'): rank_x = expr.count(':') rank_y = expr.count(',') + 1 shape_x = [32 for _ in range(rank_x)] shape_z = [32 for _ in range(rank_y)] shape_z_rank_mismatch = [32 for _ in range(rank_y + 1)] shape_z_dim_mismatch = [64 for _ in range(rank_y)] # Triton kernel @triton.jit def kernel(Z, X, SIZE: tl.constexpr): m = tl.arange(0, SIZE) n = tl.arange(0, SIZE) x = tl.load(X_PTR_EXPR) z = GENERATE_TEST_HERE tl.store(Z_PTR_EXPR, z) def generate_kernel(shape_x, shape_z): to_replace = { 'X_PTR_EXPR': make_ptr_str('X', shape_x), 'Z_PTR_EXPR': make_ptr_str('Z', shape_z), 'GENERATE_TEST_HERE': expr, } return patch_kernel(kernel, to_replace) kernel_match = generate_kernel(shape_x, shape_z) kernel_dim_mismatch = generate_kernel(shape_x, shape_z_dim_mismatch) kernel_rank_mismatch = generate_kernel(shape_x, shape_z_rank_mismatch) # torch result x = numpy_random(shape_x, dtype_str=dtype_str) y = np.zeros(shape_z, dtype=getattr(np, dtype_str)) z_ref = eval(expr) + y # triton result z_tri = to_triton(np.empty_like(z_ref), device=device) x_tri = to_triton(x) kernel_match[(1, )](z_tri, x_tri, num_warps=1, SIZE=shape_x[0]) # compare assert (z_ref == to_numpy(z_tri)).all() def catch_compilation_error(kernel): try: kernel[(1, )](z_tri, x_tri, num_warps=1, SIZE=shape_x[0]) except triton.CompilationError as e: np.testing.assert_(True) except BaseException: np.testing.assert_(False) 
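# Illustrative note (added; not part of the original test file): for a
# hypothetical call make_ptr_str('X', [32, 32]), the helper defined earlier in
# this module walks the shape in reverse, emitting the innermost axis with
# stride 1 and the outer axis with stride 32, so it returns the
# pointer-expression string
#     "X + tl.arange(0, 32)[None, :]*1 + tl.arange(0, 32)[:, None]*32"
# which patch_kernel then splices into the kernel template in place of
# X_PTR_EXPR / Z_PTR_EXPR before compilation.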
catch_compilation_error(kernel_dim_mismatch) catch_compilation_error(kernel_rank_mismatch) # --------------- # test tuples # --------------- @triton.jit def fn(a, b): return a + b, \ a - b, \ a * b def test_tuples(): device = 'cuda' @triton.jit def with_fn(X, Y, A, B, C): x = tl.load(X) y = tl.load(Y) a, b, c = fn(x, y) tl.store(A, a) tl.store(B, b) tl.store(C, c) @triton.jit def without_fn(X, Y, A, B, C): x = tl.load(X) y = tl.load(Y) a, b, c = x + y, x - y, x * y tl.store(A, a) tl.store(B, b) tl.store(C, c) x = torch.tensor([1.3], device=device, dtype=torch.float32) y = torch.tensor([1.9], device=device, dtype=torch.float32) a_tri = torch.tensor([0], device=device, dtype=torch.float32) b_tri = torch.tensor([0], device=device, dtype=torch.float32) c_tri = torch.tensor([0], device=device, dtype=torch.float32) for kernel in [with_fn, without_fn]: kernel[(1, )](x, y, a_tri, b_tri, c_tri, num_warps=1) a_ref, b_ref, c_ref = x + y, x - y, x * y assert a_tri == a_ref assert b_tri == b_ref assert c_tri == c_ref # --------------- # test atomics # --------------- @pytest.mark.parametrize("op, dtype_x_str, mode", itertools.chain.from_iterable([ [ ('add', 'float16', mode), ('add', 'uint32', mode), ('add', 'int32', mode), ('add', 'float32', mode), ('max', 'uint32', mode), ('max', 'int32', mode), ('max', 'float32', mode), ('min', 'uint32', mode), ('min', 'int32', mode), ('min', 'float32', mode), ] for mode in ['all_neg', 'all_pos', 'min_neg', 'max_pos']])) def test_atomic_rmw(op, dtype_x_str, mode, device='cuda'): capability = torch.cuda.get_device_capability() if capability[0] < 7: if dtype_x_str == 'float16': pytest.skip("Only test atomic float16 ops on devices with sm >= 70") n_programs = 5 # triton kernel @triton.jit def kernel(X, Z): pid = tl.program_id(0) x = tl.load(X + pid) old = GENERATE_TEST_HERE kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.atomic_{op}(Z, x)'}) numpy_op = {'add': np.sum, 'max': np.max, 'min': np.min}[op] max_neutral = float('-inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).min min_neutral = float('inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).max neutral = {'add': 0, 'max': max_neutral, 'min': min_neutral}[op] # triton result rs = RandomState(17) x = np.array([2**i for i in range(n_programs)], dtype=getattr(np, dtype_x_str)) if mode == 'all_neg': x = -np.abs(x) if mode == 'all_pos': x = np.abs(x) if mode == 'min_neg': idx = rs.randint(n_programs, size=(1, )).item() x[idx] = -np.max(np.abs(x)) - 1 if mode == 'max_pos': idx = rs.randint(n_programs, size=(1, )).item() x[idx] = np.max(np.abs(x)) + 1 x_tri = to_triton(x, device=device) z_tri = to_triton(np.array([neutral], dtype=getattr(np, dtype_x_str)), device=device) kernel[(n_programs, )](x_tri, z_tri) # torch result z_ref = numpy_op(x).astype(getattr(np, dtype_x_str)) # compare exact = op not in ['add'] if exact: assert z_ref.item() == to_numpy(z_tri).item() else: np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) @pytest.mark.parametrize("shape, axis", [(shape, axis) for shape in [(2, 2), (2, 8), (8, 2), (8, 8), (32, 32)] for axis in [0, 1]]) def test_tensor_atomic_rmw(shape, axis, device="cuda"): shape0, shape1 = shape # triton kernel @triton.jit def kernel(Z, X, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.constexpr): off0 = tl.arange(0, SHAPE0) off1 = tl.arange(0, SHAPE1) x = tl.load(X + off0[:, None] * SHAPE1 + off1[None, :]) z = tl.sum(x, axis=AXIS) if AXIS == 1: tl.atomic_add(Z + off0, z) else: tl.atomic_add(Z + off1, z) rs = 
RandomState(17) x = numpy_random((shape0, shape1), dtype_str="float32", rs=rs) # reference result z_ref = np.sum(x, axis=axis, keepdims=False) # triton result x_tri = to_triton(x, device=device) z_shape = (shape0, ) if axis == 1 else (shape1, ) z_tri = to_triton(np.zeros(z_shape, dtype="float32"), device=device) kernel[(1,)](z_tri, x_tri, axis, shape0, shape1) np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=1e-4) def test_atomic_cas(): # 1. make sure that atomic_cas changes the original value (Lock) @triton.jit def change_value(Lock): tl.atomic_cas(Lock, 0, 1) Lock = torch.zeros((1,), device='cuda', dtype=torch.int32) change_value[(1,)](Lock) assert (Lock[0] == 1) # 2. only one block enters the critical section @triton.jit def serialized_add(data, Lock): ptrs = data + tl.arange(0, 128) while tl.atomic_cas(Lock, 0, 1) == 1: pass tl.store(ptrs, tl.load(ptrs) + 1.0) # release lock tl.atomic_xchg(Lock, 0) Lock = torch.zeros((1,), device='cuda', dtype=torch.int32) data = torch.zeros((128,), device='cuda', dtype=torch.float32) ref = torch.full((128,), 64.0) serialized_add[(64,)](data, Lock) triton.testing.assert_almost_equal(data, ref) # --------------- # test cast # --------------- @pytest.mark.parametrize("dtype_x, dtype_z, bitcast", [ (dtype_x, dtype_z, False) for dtype_x in dtypes for dtype_z in dtypes ] + [ ('float32', 'bfloat16', False), ('bfloat16', 'float32', False), ('float32', 'int32', True), ('float32', 'int1', False), ] + [ (f'uint{x}', f'int{x}', True) for x in [8, 16, 32, 64] ] + [ (f'int{x}', f'uint{x}', True) for x in [8, 16, 32, 64] ]) def test_cast(dtype_x, dtype_z, bitcast, device='cuda'): # bfloat16 on cc < 80 will not be tested check_type_supported(dtype_x) check_type_supported(dtype_z) # This is tricky because numpy doesn't have bfloat, and torch doesn't have uints. 
x0 = 43 if dtype_x in int_dtypes else 43.5 if dtype_x in float_dtypes and dtype_z == 'int1': x0 = 0.5 if dtype_x.startswith('bfloat'): x_tri = torch.tensor([x0], dtype=getattr(torch, dtype_x), device=device) else: x = np.array([x0], dtype=getattr(np, dtype_x)) x_tri = to_triton(x) # triton kernel @triton.jit def kernel(X, Z, BITCAST: tl.constexpr): x_ptr = X + tl.arange(0, 1) z_ptr = Z + tl.arange(0, 1) x = tl.load(x_ptr) z = x.to(Z.dtype.element_ty, bitcast=BITCAST) tl.store(z_ptr, z) dtype_z_np = dtype_z if dtype_z != 'int1' else 'bool_' # triton result if dtype_z.startswith('bfloat'): z_tri = torch.empty((1,), dtype=getattr(torch, dtype_z), device=device) else: z_tri = to_triton(np.empty((1, ), dtype=getattr(np, dtype_z_np)), device=device) kernel[(1, )](x_tri, z_tri, BITCAST=bitcast) # torch result if dtype_z.startswith('bfloat') or dtype_x.startswith('bfloat'): assert bitcast is False z_ref = x_tri.to(z_tri.dtype) assert z_tri == z_ref else: if bitcast: z_ref = x.view(getattr(np, dtype_z_np)) else: z_ref = x.astype(getattr(np, dtype_z_np)) assert to_numpy(z_tri) == z_ref @pytest.mark.parametrize("dtype_str", list(torch_dtypes)) def test_store_constant(dtype_str): check_type_supported(dtype_str) """Tests that boolean True is stored as 1""" @triton.jit def kernel(output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements output = GENERATE_TEST_HERE tl.store(output_ptr + offsets, output, mask=mask) triton_dtype_str = 'uint8' if dtype_str == 'bool' else dtype_str kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.zeros([BLOCK_SIZE], dtype=tl.{triton_dtype_str}) + 1'}) block_size = 128 ref = torch.ones([block_size], dtype=getattr(torch, dtype_str), device='cuda') output = torch.zeros([block_size], dtype=getattr(torch, dtype_str), device='cuda') kernel[(1,)](output, block_size, BLOCK_SIZE=block_size) assert torch.all(output == ref) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_f8_xf16_roundtrip(dtype): """Tests that converting an f8 to f16 and back to f8 doesn't change its value""" check_type_supported(dtype) @triton.jit def copy_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements input = tl.load(input_ptr + offsets, mask=mask) output = input tl.store(output_ptr + offsets, output, mask=mask) f8_tensor = torch.tensor(range(-128, 128), dtype=torch.int8, device='cuda') f8 = triton.reinterpret(f8_tensor, tl.float8) n_elements = f8_tensor.numel() xf16 = torch.empty_like(f8_tensor, dtype=dtype) grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) copy_kernel[grid](f8, xf16, n_elements, BLOCK_SIZE=1024) f8_output_tensor = torch.empty_like(xf16, dtype=torch.int8) f8_output = triton.reinterpret(f8_output_tensor, tl.float8) copy_kernel[grid](xf16, f8_output, n_elements, BLOCK_SIZE=1024) assert torch.all(f8_tensor == f8_output_tensor) def test_f16_to_f8_rounding(): """Takes all float16s, converts them to float8 and back to float16. Checks that the absolute error is the minimum over all float8. 
Or the same explanation a bit mathier: for all f16 |f16 - fromf8(tof8(f16))| == min over all f8 |f16 - fromf8(f8)|""" @triton.jit def copy_kernel(input_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr): offsets = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements input = tl.load(input_ptr + offsets, mask=mask) output = input tl.store(output_ptr + offsets, output, mask=mask) # torch.view with a dtype isn't supported in triton's torch yet so use numpy's view f16_input_np = ( np.array( range(-int(2 ** (16 - 1)), int(2 ** (16 - 1))), dtype=np.int16, ) .view(np.float16) ) f16_input = torch.tensor(f16_input_np, dtype=torch.float16, device='cuda') n_elements = f16_input.numel() f8_output_tensor = torch.empty_like(f16_input, dtype=torch.int8) f8_output = triton.reinterpret(f8_output_tensor, tl.float8) grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) copy_kernel[grid](f16_input, f8_output, n_elements, BLOCK_SIZE=1024) f16_output = torch.empty_like(f16_input, dtype=torch.float16) copy_kernel[grid](f8_output, f16_output, n_elements, BLOCK_SIZE=1024) abs_error = torch.abs(f16_input - f16_output) all_f8_vals_tensor = torch.tensor(range(2 ** 8), dtype=torch.uint8, device='cuda') all_f8_vals = triton.reinterpret(all_f8_vals_tensor, tl.float8) all_f8_vals_in_f16 = torch.empty_like(all_f8_vals_tensor, dtype=torch.float16) copy_kernel[grid](all_f8_vals, all_f8_vals_in_f16, n_elements=256, BLOCK_SIZE=1024) all_finite_f8_vals_in_f16 = all_f8_vals_in_f16[ torch.isfinite(all_f8_vals_in_f16) ] min_error = torch.min( torch.abs( f16_input.reshape((-1, 1)) - all_finite_f8_vals_in_f16.reshape((1, -1)) ), dim=1, )[0] # 1.9375 is float8 max mismatch = torch.logical_and( abs_error != min_error, torch.logical_and(torch.isfinite(f16_input), torch.abs(f16_input) < 1.9375) ) assert torch.all( torch.logical_not(mismatch) ), f"f16_input[mismatch]={f16_input[mismatch]} f16_output[mismatch]={f16_output[mismatch]} abs_error[mismatch]={abs_error[mismatch]} min_error[mismatch]={min_error[mismatch]}" # --------------- # test reduce # --------------- def get_reduced_dtype(dtype_str, op): if op in ('argmin', 'argmax'): return 'int32' if dtype_str in ['int8', 'uint8', 'int16', 'uint16']: return 'int32' if dtype_str == 'bfloat16': return 'float32' return dtype_str @pytest.mark.parametrize("op, dtype_str, shape", [(op, dtype, shape) for op in ['min', 'max', 'sum', 'argmin', 'argmax'] for dtype in dtypes_with_bfloat16 for shape in [32, 64, 128, 512]]) def test_reduce1d(op, dtype_str, shape, device='cuda'): check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested # triton kernel @triton.jit def kernel(X, Z, BLOCK: tl.constexpr): x = tl.load(X + tl.arange(0, BLOCK)) tl.store(Z, GENERATE_TEST_HERE) kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'}) # input rs = RandomState(17) # limit the range of integers so that the sum does not overflow x = numpy_random((shape,), dtype_str=dtype_str, rs=rs) x_tri = to_triton(x, device=device) numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min, 'argmin': np.argmin, 'argmax': np.argmax}[op] # numpy result z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str z_tri_dtype_str = z_dtype_str if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16': z_dtype_str = 'float32' z_ref = numpy_op(x).astype(getattr(np, z_dtype_str)) # trunc mantissa for a fair comparison of accuracy z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32') z_tri_dtype_str = 'bfloat16' else: z_ref 
= numpy_op(x).astype(getattr(np, z_dtype_str)) # triton result z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs), device=device, dst_type=z_tri_dtype_str) kernel[(1,)](x_tri, z_tri, BLOCK=shape) z_tri = to_numpy(z_tri) # compare if op == 'sum': np.testing.assert_allclose(z_ref, z_tri, rtol=0.01) else: if op in ('argmin', 'argmax'): # argmin and argmax can have multiple valid indices. # so instead we compare the values pointed by indices np.testing.assert_equal(x[z_ref], x[z_tri]) else: np.testing.assert_equal(z_ref, z_tri) # TODO: [Qingyi] Fix argmin / argmax reduce_configs1 = [ (op, dtype, (1, 1024), axis) for dtype in dtypes_with_bfloat16 for op in ['min', 'max', 'sum'] for axis in [1] ] # shape (128, 256) and (32, 1024) are not enabled on sm86 because the required shared memory # exceeds the limit of 99KB reduce2d_shapes = [(2, 32), (4, 32), (4, 128)] # TODO: fix and uncomment # , (32, 64), (64, 128)] if 'V100' in torch.cuda.get_device_name(0): reduce2d_shapes += [(128, 256) and (32, 1024)] reduce_configs2 = [ (op, 'float32', shape, axis) for op in ['min', 'max', 'sum'] for shape in reduce2d_shapes for axis in [0, 1] ] @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2) def test_reduce2d(op, dtype_str, shape, axis, device='cuda'): check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested # triton kernel @triton.jit def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr): range_m = tl.arange(0, BLOCK_M) range_n = tl.arange(0, BLOCK_N) x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :]) z = GENERATE_TEST_HERE if AXIS == 1: tl.store(Z + range_m, z) else: tl.store(Z + range_n, z) kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'}) # input rs = RandomState(17) # limit the range of integers so that the sum does not overflow x = numpy_random(shape, dtype_str=dtype_str, rs=rs) x_tri = to_triton(x) numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min, 'argmin': np.argmin, 'argmax': np.argmax}[op] z_dtype_str = get_reduced_dtype(dtype_str, op) z_tri_dtype_str = z_dtype_str # numpy result if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16': z_dtype_str = 'float32' z_tri_dtype_str = 'bfloat16' z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str)) # trunc mantissa for a fair comparison of accuracy z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32') else: z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str)) # triton result z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs), device=device, dst_type=z_tri_dtype_str) kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis) z_tri = to_numpy(z_tri) # compare if op == 'sum': np.testing.assert_allclose(z_ref, z_tri, rtol=0.01) else: if op in ('argmin', 'argmax'): # argmin and argmax can have multiple valid indices. 
# so instead we compare the values pointed by indices z_ref_index = np.expand_dims(z_ref, axis=axis) z_tri_index = np.expand_dims(z_tri, axis=axis) z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis) z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis) np.testing.assert_equal(z_ref_value, z_tri_value) else: np.testing.assert_equal(z_ref, z_tri) # --------------- # test permute # --------------- @pytest.mark.parametrize("dtype_str, shape, perm", [(dtype, shape, perm) # TODO: bfloat16 for dtype in ['float16', 'float32'] for shape in [(64, 64), (128, 128)] for perm in [(1, 0)]]) def test_permute(dtype_str, shape, perm, device='cuda'): check_type_supported(dtype_str) # bfloat16 on cc < 80 will not be tested # triton kernel @triton.jit def kernel(X, stride_xm, stride_xn, Z, stride_zm, stride_zn, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr): off_m = tl.arange(0, BLOCK_M) off_n = tl.arange(0, BLOCK_N) Xs = X + off_m[:, None] * stride_xm + off_n[None, :] * stride_xn Zs = Z + off_m[:, None] * stride_zm + off_n[None, :] * stride_zn tl.store(Zs, tl.load(Xs)) # input x = numpy_random(shape, dtype_str=dtype_str) # triton result z_tri = to_triton(np.empty_like(x), device=device, dst_type=dtype_str) z_tri_contiguous = to_triton(np.empty_like(x), device=device, dst_type=dtype_str) x_tri = to_triton(x, device=device, dst_type=dtype_str) pgm = kernel[(1, 1)](x_tri, x_tri.stride(0), x_tri.stride(1), z_tri, z_tri.stride(1), z_tri.stride(0), BLOCK_M=shape[0], BLOCK_N=shape[1]) pgm_contiguous = kernel[(1, 1)](x_tri, x_tri.stride(1), x_tri.stride(0), z_tri_contiguous, z_tri_contiguous.stride(0), z_tri_contiguous.stride(1), BLOCK_M=shape[0], BLOCK_N=shape[1]) # numpy result z_ref = x.transpose(*perm) # compare triton.testing.assert_almost_equal(z_tri, z_ref) triton.testing.assert_almost_equal(z_tri_contiguous, z_ref) # parse ptx to make sure ld/st are vectorized ptx = pgm.asm['ptx'] assert 'ld.global.v4' in ptx assert 'st.global.v4' in ptx ptx = pgm_contiguous.asm['ptx'] assert 'ld.global.v4' in ptx assert 'st.global.v4' in ptx # --------------- # test dot # --------------- @pytest.mark.parametrize("M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype", [(*shape, 4, False, False, epilogue, allow_tf32, dtype) for shape in [(64, 64, 64)] for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot'] for allow_tf32 in [True, False] for dtype in ['float16', 'float32'] if not (allow_tf32 and (dtype in ['float16']))] + [(*shape_nw, col_a, col_b, 'none', allow_tf32, dtype) for shape_nw in [[128, 256, 32, 8], [128, 16, 32, 4], [32, 128, 64, 4], [128, 128, 64, 4], [64, 128, 128, 4], [32, 128, 64, 2], [128, 128, 64, 2], [64, 128, 128, 2]] for allow_tf32 in [True] for col_a in [True, False] for col_b in [True, False] for dtype in ['int8', 'float16', 'float32']]) def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, allow_tf32, dtype, device='cuda'): capability = torch.cuda.get_device_capability() if capability[0] < 7: pytest.skip("Only test tl.dot() on devices with sm >= 70") if capability[0] < 8: if dtype == 'int8': pytest.skip("Only test int8 on devices with sm >= 80") elif dtype == 'float32' and allow_tf32: pytest.skip("Only test tf32 on devices with sm >= 80") if capability[0] == 7: if (M, N, K, num_warps) == (128, 256, 32, 8): pytest.skip("shared memory out of resource") torch.backends.cuda.matmul.allow_tf32 = allow_tf32 # triton kernel @triton.jit def kernel(X, stride_xm, stride_xk, Y, stride_yk, stride_yn, W, stride_wn, stride_wl, Z, stride_zm, stride_zn, 
BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, ADD_MATRIX: tl.constexpr, ADD_ROWS: tl.constexpr, ADD_COLS: tl.constexpr, ALLOW_TF32: tl.constexpr, DO_SOFTMAX: tl.constexpr, CHAIN_DOT: tl.constexpr, COL_A: tl.constexpr, COL_B: tl.constexpr): off_m = tl.arange(0, BLOCK_M) off_n = tl.arange(0, BLOCK_N) off_l = tl.arange(0, BLOCK_N) off_k = tl.arange(0, BLOCK_K) Xs = X + off_m[:, None] * stride_xm + off_k[None, :] * stride_xk Ys = Y + off_k[:, None] * stride_yk + off_n[None, :] * stride_yn Ws = W + off_n[:, None] * stride_wn + off_l[None, :] * stride_wl Zs = Z + off_m[:, None] * stride_zm + off_n[None, :] * stride_zn x = tl.load(Xs) y = tl.load(Ys) z = tl.dot(x, y, allow_tf32=ALLOW_TF32) if ADD_MATRIX: z += tl.load(Zs) if ADD_ROWS: ZRs = Z + off_m * stride_zm z += tl.load(ZRs)[:, None] if ADD_COLS: ZCs = Z + off_n * stride_zn z += tl.load(ZCs)[None, :] if DO_SOFTMAX: max = tl.max(z, 1) z = z - max[:, None] num = tl.exp(z) den = tl.sum(num, 1) z = num / den[:, None] if CHAIN_DOT: w = tl.load(Ws) z = tl.dot(z.to(w.dtype), w) tl.store(Zs, z) # input rs = RandomState(17) if col_a: x = numpy_random((K, M), dtype_str=dtype, rs=rs).T else: x = numpy_random((M, K), dtype_str=dtype, rs=rs) if col_b: y = numpy_random((N, K), dtype_str=dtype, rs=rs).T else: y = numpy_random((K, N), dtype_str=dtype, rs=rs) w = numpy_random((N, N), dtype_str=dtype, rs=rs) if 'int' not in dtype: x *= .1 y *= .1 if dtype == 'float32' and allow_tf32: x = (x.view('uint32') & np.uint32(0xffffe000)).view('float32') y = (y.view('uint32') & np.uint32(0xffffe000)).view('float32') w = (w.view('uint32') & np.uint32(0xffffe000)).view('float32') x_tri = to_triton(x, device=device) y_tri = to_triton(y, device=device) w_tri = to_triton(w, device=device) # triton result if dtype == 'int8': z = 1 + numpy_random((M, N), dtype_str='int32', rs=rs) else: z = 1 + numpy_random((M, N), dtype_str=dtype, rs=rs) * .1 z_tri = to_triton(z, device=device) if epilogue == 'trans': z_tri = torch.as_strided(z_tri, (M, N), z_tri.stride()[::-1]) pgm = kernel[(1, 1)](x_tri, x_tri.stride(0), x_tri.stride(1), y_tri, y_tri.stride(0), y_tri.stride(1), w_tri, w_tri.stride(0), w_tri.stride(1), z_tri, z_tri.stride(0), z_tri.stride(1), COL_A=col_a, COL_B=col_b, BLOCK_M=M, BLOCK_K=K, BLOCK_N=N, ADD_MATRIX=epilogue == 'add-matrix', ADD_ROWS=epilogue == 'add-rows', ADD_COLS=epilogue == 'add-cols', DO_SOFTMAX=epilogue == 'softmax', CHAIN_DOT=epilogue == 'chain-dot', ALLOW_TF32=allow_tf32, num_warps=num_warps) # torch result if dtype == 'int8': z_ref = np.matmul(x.astype(np.float32), y.astype(np.float32())).astype(np.int32) else: z_ref = np.matmul(x, y) if epilogue == 'add-matrix': z_ref += z if epilogue == 'add-rows': z_ref += z[:, 0][:, None] if epilogue == 'add-cols': z_ref += z[0, :][None, :] if epilogue == 'softmax': num = np.exp(z_ref - np.max(z_ref, axis=-1, keepdims=True)) denom = np.sum(num, axis=-1, keepdims=True) z_ref = num / denom if epilogue == 'chain-dot': z_ref = np.matmul(z_ref, w) # compare # print(z_ref[:,0], z_tri[:,0]) if dtype == 'float32': # XXX: Somehow there's a larger difference when we use float32 np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3) else: np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01) # make sure ld/st are vectorized ptx = pgm.asm['ptx'] assert 'ld.global.v4' in ptx assert 'st.global.v4' in ptx if dtype == 'float32' and allow_tf32: assert 'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' in ptx elif dtype == 'float32' and allow_tf32: assert 
'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' not in ptx elif dtype == 'int8': assert 'mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32' in ptx @pytest.mark.parametrize("dtype_str", ['float32', 'float16']) def test_dot_without_load(dtype_str): @triton.jit def _kernel(out): a = GENERATE_TEST_HERE b = GENERATE_TEST_HERE c = tl.dot(a, b) out_ptr = out + tl.arange(0, 32)[:, None] * 32 + tl.arange(0, 32)[None, :] tl.store(out_ptr, c) kernel = patch_kernel(_kernel, {'GENERATE_TEST_HERE': f"tl.full((32, 32), 1.0, tl.{dtype_str})"}) a = torch.ones((32, 32), dtype=getattr(torch, dtype_str), device="cuda") b = torch.ones((32, 32), dtype=getattr(torch, dtype_str), device="cuda") out_ref = torch.matmul(a, b) out = torch.zeros((32, 32), dtype=getattr(torch, dtype_str), device="cuda") kernel[(1,)](out) assert torch.all(out == out_ref) # --------------- # test arange # --------------- @pytest.mark.parametrize("start", [0, 1, 7, 16]) def test_arange(start, device='cuda'): BLOCK = 128 z_tri = torch.empty(BLOCK, dtype=torch.int32, device=device) @triton.jit def _kernel(z, BLOCK: tl.constexpr, START: tl.constexpr, END: tl.constexpr): off = tl.arange(0, BLOCK) val = tl.arange(START, END) tl.store(z + off, val) _kernel[(1,)](z_tri, START=start, END=start + BLOCK, BLOCK=BLOCK) z_ref = torch.arange(start, BLOCK + start, dtype=torch.int32, device=device) triton.testing.assert_almost_equal(z_tri, z_ref) # --------------- # test load # --------------- @pytest.mark.parametrize("dtype_str, size, size_diff", [(dtype_str, size, size_diff) for dtype_str in torch_dtypes for size in [128, 512] for size_diff in [0, 1, 2, 3, 4]]) def test_masked_load(dtype_str, size, size_diff, device='cuda'): dtype = getattr(torch, dtype_str) check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested input_size = size - size_diff output_size = size if dtype_str == 'bool': input = torch.randint(0, 2, (input_size,), dtype=dtype, device=device) elif dtype_str in int_dtypes or dtype_str in uint_dtypes: input = torch.randint(0, 127, (input_size,), dtype=dtype, device=device) else: input = torch.rand(input_size, dtype=dtype, device=device) output = torch.zeros((output_size,), dtype=dtype, device=device) @triton.jit def _kernel(in_ptr, out_ptr, in_size: tl.constexpr, out_size: tl.constexpr): in_offsets = tl.arange(0, out_size) # Load inputs. x = GENERATE_TEST_HERE # Store output output_offsets = tl.arange(0, out_size) tl.store(out_ptr + output_offsets, x) mask_str = "mask=in_offsets < in_size, other=1" if size_diff > 0 else "None" kernel = patch_kernel(_kernel, {'GENERATE_TEST_HERE': f"tl.load(in_ptr + in_offsets, {mask_str})"}) kernel[(1,)](input, output, input_size, output_size) reference_out = torch.cat((input, torch.ones((size_diff,), dtype=dtype, device=device))) triton.testing.allclose(output, reference_out) # Testing masked loads with an intermate copy to shared memory run. 
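# Illustrative note on the masked-load test above (added; not in the original
# file): when size_diff > 0, patch_kernel expands GENERATE_TEST_HERE to
#     tl.load(in_ptr + in_offsets, mask=in_offsets < in_size, other=1)
# so out-of-range lanes read the padding value 1, matching the ones that
# reference_out concatenates onto the input; with size_diff == 0 the argument
# string is just "None" and the load is unmasked.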
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) def test_masked_load_shared_memory(dtype, device='cuda'): check_type_supported(dtype) # bfloat16 on cc < 80 will not be tested M = 32 N = 32 K = 16 in1 = torch.rand((M, K), dtype=dtype, device=device) in2 = torch.rand((K, N), dtype=dtype, device=device) out = torch.zeros((M, N), dtype=dtype, device=device) @triton.jit def _kernel(in1_ptr, in2_ptr, output_ptr, in_stride, in2_stride, out_stride, in_numel, in2_numel, out_numel, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr): M_offsets = tl.arange(0, M) N_offsets = tl.arange(0, N) K_offsets = tl.arange(0, K) in_offsets = M_offsets[:, None] * in_stride + K_offsets[None, :] in2_offsets = K_offsets[:, None] * in2_stride + N_offsets[None, :] # Load inputs. x = tl.load(in1_ptr + in_offsets, mask=in_offsets < in_numel) w = tl.load(in2_ptr + in2_offsets, mask=in2_offsets < in2_numel) # Without a dot product the memory doesn't get promoted to shared. o = tl.dot(x, w) # Store output output_offsets = M_offsets[:, None] * out_stride + N_offsets[None, :] tl.store(output_ptr + output_offsets, o, mask=output_offsets < in2_numel) pgm = _kernel[(1,)](in1, in2, out, in1.stride()[0], in2.stride()[0], out.stride()[0], in1.numel(), in2.numel(), out.numel(), M=M, N=N, K=K) reference_out = torch.matmul(in1, in2) triton.testing.allclose(out, reference_out) @pytest.mark.parametrize("cache", ["", ".ca", ".cg"]) def test_load_cache_modifier(cache): src = torch.empty(128, device='cuda') dst = torch.empty(128, device='cuda') @triton.jit def _kernel(dst, src, CACHE: tl.constexpr): offsets = tl.arange(0, 128) x = tl.load(src + offsets, cache_modifier=CACHE) tl.store(dst + offsets, x) pgm = _kernel[(1,)](dst, src, CACHE=cache) ptx = pgm.asm['ptx'] if cache == '': assert 'ld.global.ca' not in ptx assert 'ld.global.cg' not in ptx if cache == '.cg': assert 'ld.global.cg' in ptx assert 'ld.global.ca' not in ptx if cache == '.ca': assert 'ld.global.ca' in ptx assert 'ld.global.cg' not in ptx @pytest.mark.parametrize("N", [16, 10, 11, 1024]) def test_vectorization(N): src = torch.empty(1024, device='cuda') dst = torch.empty(1024, device='cuda') @triton.jit def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr): offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) x = tl.load(src + offsets, mask=offsets < N) tl.store(dst + offsets, x, mask=offsets < N) pgm = _kernel[(1,)](dst, src, N=N, BLOCK_SIZE=src.shape[0]) ptx = pgm.asm["ptx"] if N % 16 == 0: assert "ld.global.v4.b32" in ptx else: assert "ld.global.b32" in ptx # triton.testing.assert_almost_equal(dst, src[:N]) # --------------- # test store # --------------- # --------------- # test if # --------------- # --------------- # test for # --------------- # --------------- # test while # --------------- # --------------- # test default # --------------- # TODO: can't be local to test_default @triton.jit def _impl(value=10): return value def test_default(): value = 5 ret0 = torch.zeros(1, dtype=torch.int32, device='cuda') ret1 = torch.zeros(1, dtype=torch.int32, device='cuda') @triton.jit def _kernel(ret0, ret1, value): tl.store(ret0, _impl()) tl.store(ret1, _impl(value)) _kernel[(1,)](ret0, ret1, value) assert ret0.item() == 10 assert ret1.item() == value # --------------- # test noop # ---------------- def test_noop(device='cuda'): @triton.jit def kernel(x): pass x = to_triton(numpy_random((1,), dtype_str='int32'), device=device) kernel[(1, )](x) @pytest.mark.parametrize("device", ['cuda', 'cpu']) def test_pointer_arguments(device): 
@triton.jit def kernel(x): pass x = torch.empty(1024, device=device) result = True try: kernel[(1,)](x) except ValueError: result = True if device == 'cpu' else False assert result @pytest.mark.parametrize("value, value_type", [ (-1, 'i32'), (0, 'i32'), (-2**31, 'i32'), (2**31 - 1, 'i32'), (2**31, 'u32'), (2**32 - 1, 'u32'), (2**32, 'i64'), (2**63 - 1, 'i64'), (-2**63, 'i64'), (2**63, 'u64'), (2**64 - 1, 'u64') ]) def test_value_specialization(value: int, value_type: str, device='cuda') -> None: spec_type = None def cache_hook(*args, **kwargs): nonlocal spec_type spec_type = kwargs["compile"]["signature"][0] JITFunction.cache_hook = cache_hook @triton.jit def kernel(VALUE, X): pass x = torch.tensor([3.14159], device='cuda') pgm = kernel[(1, )](value, x) JITFunction.cache_hook = None assert spec_type == value_type # -------------------- # value specialization # -------------------- @pytest.mark.parametrize( "value, overflow", [(2**64 - 1, False), (2**64, True), (-2**63, False), (-2**63 - 1, True)] ) def test_value_specialization_overflow(value: int, overflow: bool, device='cuda') -> None: @triton.jit def kernel(VALUE, X): pass x = torch.tensor([3.14159], device='cuda') if overflow: with pytest.raises(OverflowError): kernel[(1, )](value, x) else: kernel[(1, )](value, x) # ---------------- # test constexpr # ---------------- @pytest.mark.parametrize("op", ['+', '-', '*', '/', '%', '<', '>']) @pytest.mark.parametrize("is_lhs_constexpr", [False, True]) @pytest.mark.parametrize("is_rhs_constexpr", [True, False]) def test_bin_op_constexpr(op, is_lhs_constexpr, is_rhs_constexpr): @triton.jit def kernel(Z, X, Y): x = tl.load(X) y = tl.load(Y) z = GENERATE_TEST_HERE tl.store(Z, z) x_str = "3.14" if is_lhs_constexpr else "x" y_str = "4.13" if is_rhs_constexpr else "y" kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f"{x_str} {op} {y_str}"}) x = numpy_random((1,), dtype_str="float32") y = numpy_random((1,), dtype_str="float32") z = np.array(eval(f"{x_str} {op} {y_str}")) x_tri = to_triton(x) y_tri = to_triton(y) z_tri = to_triton(np.empty((1,), dtype=z.dtype)) kernel[(1,)](z_tri, x_tri, y_tri) np.testing.assert_allclose(z, to_numpy(z_tri)) def test_constexpr_shape(): @triton.jit def kernel(X): off = tl.arange(0, 128 + 128) tl.store(X + off, off) x_tri = to_triton(np.empty((256, ), dtype=np.int32)) kernel[(1,)](x_tri) np.testing.assert_equal(to_numpy(x_tri), np.arange(0, 256)) def test_constexpr_scalar_shape(): @triton.jit def kernel(X, s): off = tl.arange(0, 256) val = off % (256 // s) tl.store(X + off, val) x_tri = to_triton(np.empty((256, ), dtype=np.int32)) kernel[(1,)](x_tri, 32) np.testing.assert_equal(to_numpy(x_tri), np.arange(0, 256) % 8) # ------------- # test call # ------------- @triton.jit def val_multiplier(val, i): return val * i @triton.jit def vecmul_kernel(ptr, n_elements, rep): pid = tl.program_id(axis=0) offsets = pid * 128 + tl.arange(0, 128) mask = offsets < n_elements vec = tl.load(ptr + offsets, mask=mask) for i in range(1, rep): vec = val_multiplier(vec, i) tl.store(ptr + offsets, vec, mask=mask) def test_call(): @triton.jit def kernel(ptr, n_elements, num1, num2): vecmul_kernel(ptr, n_elements, num1) vecmul_kernel(ptr, n_elements, num2) size = 1024 rand_val = numpy_random((size,), dtype_str="float32") rand_val_tri = to_triton(rand_val, device='cuda') kernel[(size // 128,)](rand_val_tri, size, 3, 5) ans = rand_val * 1 * 2 * 1 * 2 * 3 * 4 np.testing.assert_equal(to_numpy(rand_val_tri), ans) # ------------- # test if # ------------- def test_if(): @triton.jit def 
kernel(Cond, XTrue, XFalse, Ret): pid = tl.program_id(0) cond = tl.load(Cond) if pid % 2: tl.store(Ret, tl.load(XTrue)) else: tl.store(Ret, tl.load(XFalse)) cond = torch.ones(1, dtype=torch.int32, device='cuda') x_true = torch.tensor([3.14], dtype=torch.float32, device='cuda') x_false = torch.tensor([1.51], dtype=torch.float32, device='cuda') ret = torch.empty(1, dtype=torch.float32, device='cuda') kernel[(1,)](cond, x_true, x_false, ret) def test_num_warps_pow2(): dst = torch.empty(128, device='cuda') @triton.jit def _kernel(dst): pass with pytest.raises(AssertionError, match='must be a power of 2'): _kernel[(1,)](dst=dst, num_warps=3) _kernel[(1,)](dst=dst, num_warps=1) _kernel[(1,)](dst=dst, num_warps=2) _kernel[(1,)](dst=dst, num_warps=4) # ------------- # test extern # ------------- @pytest.mark.parametrize("dtype_str, expr, lib_path", [('int32', 'libdevice.ffs', ''), ('float32', 'libdevice.log2', ''), ('float32', 'libdevice.pow', tl.libdevice.LIBDEVICE_PATH), ('float64', 'libdevice.norm4d', '')]) def test_libdevice_tensor(dtype_str, expr, lib_path): @triton.jit def kernel(X, Y, BLOCK: tl.constexpr): x = tl.load(X + tl.arange(0, BLOCK)) y = GENERATE_TEST_HERE tl.store(Y + tl.arange(0, BLOCK), y) shape = (128, ) rs = RandomState(17) # limit the range of integers so that the sum does not overflow x = numpy_random(shape, dtype_str=dtype_str, rs=rs) if expr == 'libdevice.log2': kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.broadcast_to(tl.libdevice.log2(5.0), x.shape)'}) y_ref = np.log2(5.0) elif expr == 'libdevice.ffs': kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.libdevice.ffs(x)'}) y_ref = np.zeros(shape, dtype=x.dtype) for i in range(shape[0]): y_ref[i] = (int(x[i]) & int(-x[i])).bit_length() elif expr == 'libdevice.pow': # numpy does not allow negative factors in power, so we use abs() x = np.abs(x) kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.libdevice.pow(x, x)'}) y_ref = np.power(x, x) elif expr == 'libdevice.norm4d': kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.libdevice.norm4d(x, x, x, x)'}) y_ref = np.sqrt(4 * np.power(x, 2)) x_tri = to_triton(x) # triton result y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda') kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path}) # compare if expr == 'libdevice.ffs': np.testing.assert_equal(y_ref, to_numpy(y_tri)) else: np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01) @pytest.mark.parametrize("dtype_str, expr, lib_path", [('float32', 'libdevice.pow', '')]) def test_libdevice_scalar(dtype_str, expr, lib_path): @triton.jit def kernel(X, Y, BLOCK: tl.constexpr): x = X y = GENERATE_TEST_HERE tl.store(Y + tl.arange(0, BLOCK), y) shape = (128, ) rs = RandomState(17) # limit the range of integers so that the sum does not overflow x = numpy_random((1,), dtype_str=dtype_str, rs=rs) y_ref = np.zeros(shape, dtype=x.dtype) # numpy does not allow negative factors in power, so we use abs() x = np.abs(x) kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': 'tl.libdevice.pow(x, x)'}) y_ref[:] = np.power(x, x) # triton result x_tri = to_triton(x)[0].item() y_tri = to_triton(numpy_random((shape[0],), dtype_str=dtype_str, rs=rs), device='cuda') kernel[(1,)](x_tri, y_tri, BLOCK=shape[0], extern_libs={'libdevice': lib_path}) # compare np.testing.assert_allclose(y_ref, to_numpy(y_tri), rtol=0.01) # ----------------------- # test control flow # ----------------------- def test_if_else(): @triton.jit def kernel(Cond, TrueVal, FalseVal, Out): if 
tl.load(Cond): val = tl.load(TrueVal) else: val = tl.load(FalseVal) tl.store(Out, val) out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') true_val = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') false_val = to_triton(np.full((1,), 2, dtype=np.int32), device='cuda') cond = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') # True cond[0] = True kernel[(1,)](cond, true_val, false_val, out) assert to_numpy(out)[0] == true_val[0] # False cond[0] = False kernel[(1,)](cond, true_val, false_val, out) assert to_numpy(out)[0] == false_val[0] def test_if_return(): @triton.jit def kernel(ExitEarly, Out): if tl.load(ExitEarly): tl.store(Out, 0) return tl.store(Out, 1) out = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') exit_early = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') # exit early path taken exit_early[0] = 1 kernel[(1,)](exit_early, out) assert to_numpy(out)[0] == 0 # exit early path not taken exit_early[0] = 0 kernel[(1,)](exit_early, out) assert to_numpy(out)[0] == 1 @pytest.mark.parametrize("_cond1", [True, False]) @pytest.mark.parametrize("_cond2", [True, False]) @pytest.mark.parametrize("_cond3", [True, False]) def test_nested_if_else_return(_cond1, _cond2, _cond3): @triton.jit def kernel(Cond1, Cond2, Cond3, Val1, Val2, Val3, Out): val = 0 if tl.load(Cond1): if tl.load(Cond2): val = tl.load(Val1) else: return else: if tl.load(Cond3): val = tl.load(Val2) else: val = tl.load(Val3) tl.store(Out, val) out = to_triton(np.full((1,), -1, dtype=np.int32), device='cuda') cond1 = to_triton(np.full((1,), _cond1, dtype=np.int32), device='cuda') cond2 = to_triton(np.full((1,), _cond2, dtype=np.int32), device='cuda') cond3 = to_triton(np.full((1,), _cond3, dtype=np.int32), device='cuda') val1 = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') val2 = to_triton(np.full((1,), 2, dtype=np.int32), device='cuda') val3 = to_triton(np.full((1,), 3, dtype=np.int32), device='cuda') kernel[(1,)](cond1, cond2, cond3, val1, val2, val3, out) targets = { (True, True, True): val1[0], (True, True, False): val1[0], (True, False, True): out[0], (True, False, False): out[0], (False, True, True): val2[0], (False, True, False): val3[0], (False, False, True): val2[0], (False, False, False): val3[0], } assert out[0] == targets[(_cond1, _cond2, _cond3)] def test_while(): @triton.jit def kernel(InitI, Bound, CutOff, OutI, OutJ): init_i = tl.load(InitI) curr_i = init_i j = 0 while curr_i == init_i and j < tl.load(Bound): curr_i = curr_i + (j == tl.load(CutOff)) j += 1 tl.store(OutI, curr_i) tl.store(OutJ, j) out_i = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') out_j = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') init_i = to_triton(np.full((1,), 1, dtype=np.int32), device='cuda') bound = to_triton(np.full((1,), 10, dtype=np.int32), device='cuda') cut_off = to_triton(np.full((1,), 5, dtype=np.int32), device='cuda') kernel[(1,)](init_i, bound, cut_off, out_i, out_j) assert out_i[0] == init_i[0] + 1 assert out_j[0] == cut_off[0] + 1 # def test_for_if(): # @triton.jit # def kernel(bound, cutoff, M, N): # m = 0 # n = 0 # for i in range(bound): # if i > cutoff: # m = m + 1 # else: # n = n + 1 # tl.store(M, m) # tl.store(N, n) # m = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') # n = to_triton(np.zeros((1,), dtype=np.int32), device='cuda') # kernel[(1,)](10, 7, m, n) # print(m[0]) # print(n[0]) # ----------------------- # test layout conversions # ----------------------- # TODO: backend hsould be tested separately class MmaLayout: def 
__init__(self, version, warps_per_cta): self.version = version self.warps_per_cta = str(warps_per_cta) def __str__(self): return f"#triton_gpu.mma<{{versionMajor={self.version[0]}, versionMinor={self.version[1]}, warpsPerCTA={self.warps_per_cta}}}>" class BlockedLayout: def __init__(self, size_per_thread, threads_per_warp, warps_per_cta, order): self.sz_per_thread = str(size_per_thread) self.threads_per_warp = str(threads_per_warp) self.warps_per_cta = str(warps_per_cta) self.order = str(order) def __str__(self): return f"#triton_gpu.blocked<{{sizePerThread={self.sz_per_thread}, threadsPerWarp={self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, order={self.order}}}>" layouts = [ # MmaLayout(version=1, warps_per_cta=[1, 4]), MmaLayout(version=(2, 0), warps_per_cta=[1, 4]), # MmaLayout(version=1, warps_per_cta=[4, 1]), MmaLayout(version=(2, 0), warps_per_cta=[4, 1]), BlockedLayout([1, 8], [2, 16], [4, 1], [1, 0]), BlockedLayout([1, 4], [4, 8], [2, 2], [1, 0]), BlockedLayout([1, 1], [1, 32], [2, 2], [1, 0]), BlockedLayout([8, 1], [16, 2], [1, 4], [0, 1]), BlockedLayout([4, 1], [8, 4], [2, 2], [0, 1]), BlockedLayout([1, 1], [32, 1], [2, 2], [0, 1]), BlockedLayout([4, 4], [1, 32], [4, 1], [1, 0]) ] @pytest.mark.parametrize("shape", [(128, 128)]) @pytest.mark.parametrize("dtype", ['float16']) @pytest.mark.parametrize("src_layout", layouts) @pytest.mark.parametrize("dst_layout", layouts) def test_convert2d(dtype, shape, src_layout, dst_layout, device='cuda'): if str(src_layout) == str(dst_layout): pytest.skip() if 'mma' in str(src_layout) and 'mma' in str(dst_layout): pytest.skip() ir = f""" #src = {src_layout} #dst = {dst_layout} """ + """ module attributes {"triton_gpu.num-warps" = 4 : i32} { func public @kernel_0d1d(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { %cst = arith.constant dense<128> : tensor<128x1xi32, #src> %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #src}>> %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #src}>> %2 = tt.splat %arg0 : (!tt.ptr) -> tensor<128x128x!tt.ptr, #src> %4 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #src}>>) -> tensor<128x1xi32, #src> %5 = arith.muli %4, %cst : tensor<128x1xi32, #src> %6 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #src}>>) -> tensor<1x128xi32, #src> %7 = tt.broadcast %6 : (tensor<1x128xi32, #src>) -> tensor<128x128xi32, #src> %8 = tt.broadcast %5 : (tensor<128x1xi32, #src>) -> tensor<128x128xi32, #src> %9 = arith.addi %8, %7 : tensor<128x128xi32, #src> %10 = tt.addptr %2, %9 : tensor<128x128x!tt.ptr, #src>, tensor<128x128xi32, #src> %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x128xf16, #src> %3 = tt.splat %arg1 : (!tt.ptr) -> tensor<128x128x!tt.ptr, #dst> %12 = triton_gpu.convert_layout %9 : (tensor<128x128xi32, #src>) -> tensor<128x128xi32, #dst> %13 = triton_gpu.convert_layout %11 : (tensor<128x128xf16, #src>) -> tensor<128x128xf16, #dst> %14 = tt.addptr %3, %12 : tensor<128x128x!tt.ptr, #dst>, tensor<128x128xi32, #dst> tt.store %14, %13 : tensor<128x128xf16, #dst> return } } """ x = to_triton(numpy_random(shape, dtype_str=dtype)) z = torch.empty_like(x) # write the IR to a temporary file using mkstemp import tempfile with tempfile.NamedTemporaryFile(mode='w', suffix='.ttgir') as f: f.write(ir) f.flush() kernel = 
triton.compile(f.name) kernel[(1, 1, 1)](x.data_ptr(), z.data_ptr()) assert torch.equal(z, x) def test_load_scalar_with_mask(): @triton.jit def kernel(Input, Index, Out, N: int): index = tl.load(Index) scalar = tl.load(Input + index, mask=index < N, other=0) tl.store(Out, scalar, mask=index < N) Index = torch.tensor([0], dtype=torch.int32, device='cuda') Input = torch.tensor([0], dtype=torch.int32, device='cuda') Out = torch.empty_like(Index, device='cuda') kernel[(1,)](Input, Index, Out, Index.numel()) assert Out.data[0] == 0 triton-2.0.0/python/test/unit/language/test_printf.py000066400000000000000000000011251440023377100230050ustar00rootroot00000000000000import os import subprocess import sys dir_path = os.path.dirname(os.path.realpath(__file__)) printf_path = os.path.join(dir_path, "printf_helper.py") def test_printf(): proc = subprocess.Popen([sys.executable, printf_path], stdout=subprocess.PIPE, shell=False) (outs, err) = proc.communicate() outs = outs.split() new_lines = set() for line in outs: try: value = int(float(line)) new_lines.add(value) except Exception as e: print(e) for i in range(128): assert i in new_lines assert len(new_lines) == 128 triton-2.0.0/python/test/unit/language/test_random.py000066400000000000000000000140611440023377100227660ustar00rootroot00000000000000import numpy as np import pytest import scipy.stats import torch import triton import triton.language as tl ##################################### # Reference Philox Implementation ##################################### class PhiloxConfig: def __init__(self, PHILOX_ROUND_A, PHILOX_ROUND_B, PHILOX_KEY_A, PHILOX_KEY_B, DTYPE): self.PHILOX_ROUND_A = np.array(PHILOX_ROUND_A, dtype=DTYPE) self.PHILOX_ROUND_B = np.array(PHILOX_ROUND_B, dtype=DTYPE) self.PHILOX_KEY_A = np.array(PHILOX_KEY_A, dtype=DTYPE) self.PHILOX_KEY_B = np.array(PHILOX_KEY_B, dtype=DTYPE) self.DTYPE = DTYPE # This is better for GPU PHILOX_32 = PhiloxConfig( PHILOX_KEY_A=0x9E3779B9, PHILOX_KEY_B=0xBB67AE85, PHILOX_ROUND_A=0xD2511F53, PHILOX_ROUND_B=0xCD9E8D57, DTYPE=np.uint32, ) # This is what numpy implements PHILOX_64 = PhiloxConfig( PHILOX_KEY_A=0x9E3779B97F4A7C15, PHILOX_KEY_B=0xBB67AE8584CAA73B, PHILOX_ROUND_A=0xD2E7470EE14C6C93, PHILOX_ROUND_B=0xCA5A826395121157, DTYPE=np.uint64, ) class CustomPhilox4x: def __init__(self, seed, config): self._config = config seed = self._into_pieces(seed) self._key = np.array(seed[:2], dtype=self._dtype) self._counter = np.array((0, 0) + seed[2:], dtype=self._dtype) @property def _dtype(self): return self._config.DTYPE def _into_pieces(self, n, pad=4): res = [] while len(res) < pad: res.append(np.array(n, dtype=self._dtype)) n >>= (np.dtype(self._dtype).itemsize * 8) assert n == 0 return tuple(res) def _multiply_low_high(self, a, b): low = a * b high = int(a) * int(b) high = np.array(high >> (np.dtype(self._dtype).itemsize * 8), dtype=self._dtype) return low, high def _single_round(self, counter, key): lo0, hi0 = self._multiply_low_high(self._config.PHILOX_ROUND_A, counter[0]) lo1, hi1 = self._multiply_low_high(self._config.PHILOX_ROUND_B, counter[2]) ret0 = hi1 ^ counter[1] ^ key[0] ret1 = lo1 ret2 = hi0 ^ counter[3] ^ key[1] ret3 = lo0 return np.array([ret0, ret1, ret2, ret3], dtype=self._dtype) def _raise_key(self, key): pk = [self._config.PHILOX_KEY_A, self._config.PHILOX_KEY_B] return key + np.array(pk, dtype=self._dtype) def random_raw(self): counter = self._counter key = self._key for _ in range(10): counter = self._single_round(counter, key) key = self._raise_key(key) self.advance(1) return counter def 
advance(self, n_steps): self._counter[0] += n_steps assert self._counter[0] < 2**32, "FIXME: doesn't work for large offsets" class CustomPhilox(CustomPhilox4x): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.buffer = [] def random_raw(self): if len(self.buffer) == 0: self.buffer = list(super().random_raw())[::-1] return int(self.buffer.pop()) ##################################### # Unit Tests ##################################### BLOCK = 1024 # test generation of random uint32 @pytest.mark.parametrize('size, seed', [(size, seed) for size in ['10', '4,53', '10000'] for seed in [0, 42, 124, 54, 0xffffffff, 0xdeadbeefcafeb0ba]] ) def test_randint(size, seed, device='cuda'): size = list(map(int, size.split(','))) @triton.jit def kernel(X, N, seed): offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) rand = tl.randint(seed, offset) tl.store(X + offset, rand, mask=offset < N) # triton result x = torch.empty(size, dtype=torch.int32, device=device) N = x.numel() grid = (triton.cdiv(N, BLOCK),) kernel[grid](x, N, seed) out_tri = x.cpu().numpy().astype(np.uint32).flatten().tolist() # reference result gen = CustomPhilox4x(seed, config=PHILOX_32) out_ref = [gen.random_raw()[0] for _ in out_tri] assert out_tri == out_ref # test uniform PRNG @pytest.mark.parametrize('size, seed', [(size, seed) for size in [1000000] for seed in [0, 42, 124, 54]] ) def test_rand(size, seed, device='cuda'): @triton.jit def kernel(X, N, seed): offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) rand = tl.rand(seed, offset) tl.store(X + offset, rand, mask=offset < N) # triton result x = torch.empty(size, dtype=torch.float32, device=device) N = x.numel() grid = (triton.cdiv(N, BLOCK),) kernel[grid](x, N, seed) assert all((x >= 0) & (x <= 1)) assert scipy.stats.kstest(x.tolist(), 'uniform', args=(0, 1)).statistic < 0.01 # test normal PRNG @pytest.mark.parametrize('size, seed', [(size, seed) for size in [1000000] for seed in [0, 42, 124, 54]] ) def test_randn(size, seed, device='cuda'): @triton.jit def kernel(X, N, seed): offset = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK) rand = tl.randn(seed, offset) tl.store(X + offset, rand, mask=offset < N) # triton result x = torch.empty(size, dtype=torch.float32, device=device) N = x.numel() grid = (triton.cdiv(N, BLOCK),) kernel[grid](x, N, seed) assert abs(x.mean()) < 1e-2 assert abs(x.std() - 1) < 1e-2 # tl.rand() should never produce >=1.0 def test_rand_limits(): @triton.jit def kernel(input, output, n: tl.constexpr): idx = tl.arange(0, n) x = tl.load(input + idx) y = tl.random.uint32_to_uniform_float(x) tl.store(output + idx, y) min_max_int32 = torch.tensor([ torch.iinfo(torch.int32).min, torch.iinfo(torch.int32).max, ], dtype=torch.int32, device='cuda') output = torch.empty(2, dtype=torch.float32, device='cuda') kernel[(1,)](min_max_int32, output, 2) assert output[0] == output[1] assert 1.0 - torch.finfo(torch.float32).eps <= output[0].item() < 1.0 triton-2.0.0/python/test/unit/operators/000077500000000000000000000000001440023377100203265ustar00rootroot00000000000000triton-2.0.0/python/test/unit/operators/test_blocksparse.py000066400000000000000000000157421440023377100242600ustar00rootroot00000000000000import pytest import torch import triton @pytest.mark.parametrize("MODE", ["sdd", "dds", "dsd"]) @pytest.mark.parametrize("TRANS_A", [False, True]) @pytest.mark.parametrize("TRANS_B", [False, True]) @pytest.mark.parametrize("BLOCK", [16, 32, 64]) # TODO: float32 fails @pytest.mark.parametrize("DTYPE", [torch.float16]) def test_matmul(MODE, 
TRANS_A, TRANS_B, BLOCK, DTYPE, Z=3, H=2, M=512, N=384, K=256): seed = 0 torch.manual_seed(seed) is_sdd = MODE == "sdd" is_dsd = MODE == "dsd" is_dds = MODE == "dds" do_sparsify = lambda x: triton.testing.sparsify_tensor(x, layout, BLOCK) do_mask = lambda x: triton.testing.mask_tensor(x, layout, BLOCK) # create inputs # create op a_shape = (Z, H, K, M) if TRANS_A else (Z, H, M, K) b_shape = (Z, H, N, K) if TRANS_B else (Z, H, K, N) c_shape = (Z, H, M, N) shape = { "sdd": (M, N), "dsd": (a_shape[2], a_shape[3]), "dds": (b_shape[2], b_shape[3]), }[MODE] layout = torch.randint(2, (H, shape[0] // BLOCK, shape[1] // BLOCK)) layout[1, 2, :] = 0 layout[1, :, 1] = 0 # create data a_ref, a_tri = triton.testing.make_pair(a_shape, alpha=.1, dtype=DTYPE) b_ref, b_tri = triton.testing.make_pair(b_shape, alpha=.1, dtype=DTYPE) dc_ref, dc_tri = triton.testing.make_pair(c_shape, dtype=DTYPE) # compute [torch] dc_ref = do_mask(dc_ref) if is_sdd else dc_ref a_ref = do_mask(a_ref) if is_dsd else a_ref b_ref = do_mask(b_ref) if is_dds else b_ref a_ref.retain_grad() b_ref.retain_grad() c_ref = torch.matmul(a_ref.transpose(2, 3) if TRANS_A else a_ref, b_ref.transpose(2, 3) if TRANS_B else b_ref) c_ref.backward(dc_ref) c_ref = do_sparsify(c_ref) if is_sdd else c_ref da_ref = do_sparsify(a_ref.grad) if is_dsd else a_ref.grad db_ref = do_sparsify(b_ref.grad) if is_dds else b_ref.grad # triton result dc_tri = do_sparsify(dc_tri) if is_sdd else dc_tri a_tri = do_sparsify(a_tri) if is_dsd else a_tri b_tri = do_sparsify(b_tri) if is_dds else b_tri a_tri.retain_grad() b_tri.retain_grad() op = triton.ops.blocksparse.matmul(layout, BLOCK, MODE, trans_a=TRANS_A, trans_b=TRANS_B, device="cuda") c_tri = triton.testing.catch_oor(lambda: op(a_tri, b_tri), pytest) triton.testing.catch_oor(lambda: c_tri.backward(dc_tri), pytest) da_tri = a_tri.grad db_tri = b_tri.grad # compare triton.testing.assert_almost_equal(c_ref, c_tri) triton.testing.assert_almost_equal(da_ref, da_tri) triton.testing.assert_almost_equal(db_ref, db_tri) configs = [ (16, 256), (32, 576), (64, 1871), (128, 2511), ] @pytest.mark.parametrize("is_dense", [False, True]) @pytest.mark.parametrize("BLOCK, WIDTH", configs) def test_softmax(BLOCK, WIDTH, is_dense, Z=2, H=2, is_causal=True, scale=0.4): # set seed torch.random.manual_seed(0) Z, H, M, N = 2, 3, WIDTH, WIDTH # initialize layout # make sure each row has at least one non-zero element layout = torch.randint(2, (H, M // BLOCK, N // BLOCK)) if is_dense: layout[:] = 1 else: layout[1, 2, :] = 0 layout[1, :, 1] = 0 # initialize data a_shape = (Z, H, M, N) a_ref, a_tri = triton.testing.make_pair(a_shape) dout_ref, dout_tri = triton.testing.make_pair(a_shape) # compute [torch] a_ref = triton.testing.mask_tensor(a_ref, layout, BLOCK, value=float("-inf")) a_ref.retain_grad() at_mask = torch.ones((M, N), device="cuda") if is_causal: at_mask = torch.tril(at_mask) M = at_mask[None, None, :, :] + torch.zeros_like(a_ref) a_ref[M == 0] = float("-inf") out_ref = torch.softmax(a_ref * scale, -1) out_ref.backward(dout_ref) out_ref = triton.testing.sparsify_tensor(out_ref, layout, BLOCK) da_ref = triton.testing.sparsify_tensor(a_ref.grad, layout, BLOCK) # compute [triton] a_tri = triton.testing.sparsify_tensor(a_tri, layout, BLOCK) a_tri.retain_grad() dout_tri = triton.testing.sparsify_tensor(dout_tri, layout, BLOCK) op = triton.ops.blocksparse.softmax(layout, BLOCK, device="cuda", is_dense=is_dense) out_tri = op(a_tri, scale=scale, is_causal=is_causal) out_tri.backward(dout_tri) da_tri = a_tri.grad # compare 
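    # Both the forward output and the input gradient are compared in block-sparse
    # (sparsified) form: the reference values were sparsified with the same layout
    # above, so only the blocks kept by `layout` take part in the check.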
triton.testing.assert_almost_equal(out_tri, out_ref) triton.testing.assert_almost_equal(da_tri, da_ref) @pytest.mark.parametrize("block", [16, 32, 64]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_attention_fwd_bwd( block, dtype, input_scale=1.0, scale=1 / 8.0, n_ctx=256, batch_size=2, n_heads=2, ): capability = torch.cuda.get_device_capability() if capability[0] < 7: pytest.skip("Only test tl.dot() on devices with sm >= 70") # inputs qkv_shape = (batch_size, n_heads, n_ctx, 64) qkvs = [ torch.nn.Parameter(input_scale * torch.randn(qkv_shape), requires_grad=True).to(dtype).cuda() for _ in range(3) ] # Triton: n_blocks = n_ctx // block layout = torch.tril(torch.ones([n_heads, n_blocks, n_blocks], dtype=torch.long)) query, key, value = [x.clone() for x in qkvs] query.retain_grad() key.retain_grad() value.retain_grad() attn_out = triton_attention(layout, block, query=query, key=key, value=value, scale=scale) # ad hoc loss loss = (attn_out ** 2).mean() loss.backward() grads = [query.grad, key.grad, value.grad] # Torch version: torch_q, torch_k, torch_v = [x.clone() for x in qkvs] attn_mask = torch.ones([n_ctx, n_ctx], device="cuda", dtype=dtype) attn_mask = torch.tril(attn_mask, diagonal=0) attn_mask = 1e6 * (-1 + (attn_mask.reshape((1, 1, n_ctx, n_ctx)).cuda())) torch_q.retain_grad() torch_k.retain_grad() torch_v.retain_grad() scores = scale * torch.einsum("bhsd,bhtd->bhst", torch_q, torch_k) scores = scores + attn_mask probs = torch.softmax(scores, dim=-1) torch_attn_out = torch.einsum("bhst,bhtd->bhsd", probs, torch_v) # ad hoc loss torch_loss = (torch_attn_out ** 2).mean() torch_loss.backward() torch_grads = [torch_q.grad, torch_k.grad, torch_v.grad] # comparison # print(f"Triton loss {loss} and torch loss {torch_loss}. 
Also checking grads...") triton.testing.assert_almost_equal(loss, torch_loss) for g1, g2 in zip(grads, torch_grads): triton.testing.assert_almost_equal(g1, g2) @pytest.mark.parametrize("block", [16, 32, 64]) def triton_attention( layout, block: int, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, scale: float, ): sparse_dot_sdd_nt = triton.ops.blocksparse.matmul(layout, block, "sdd", trans_a=False, trans_b=True, device=value.device) sparse_dot_dsd_nn = triton.ops.blocksparse.matmul(layout, block, "dsd", trans_a=False, trans_b=False, device=value.device) sparse_softmax = triton.ops.blocksparse.softmax(layout, block, device=value.device) w = sparse_dot_sdd_nt(query, key) w = sparse_softmax(w, scale=scale, is_causal=True) a = sparse_dot_dsd_nn(w, value) return a triton-2.0.0/python/test/unit/operators/test_cross_entropy.py000066400000000000000000000026371440023377100246600ustar00rootroot00000000000000import pytest import torch import triton @pytest.mark.parametrize("M, N, dtype, mode", [ (M, N, dtype, mode) for M in [1024, 821] for N in [512, 857, 1871, 2089, 8573, 31000] for dtype in ['float16', 'float32'] for mode in ['forward', 'backward'] ] ) def test_op(M, N, dtype, mode): capability = torch.cuda.get_device_capability() if capability[0] < 8 and dtype == "bfloat16": pytest.skip("Only test bfloat16 on devices with sm >= 80") dtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16, 'float32': torch.float32}[dtype] # create inputs x = torch.randn(M, N, dtype=dtype, device='cuda', requires_grad=True) idx = 4 + torch.ones(M, dtype=torch.int64, device='cuda') # forward pass tt_y = triton.ops.cross_entropy(x, idx) th_y = torch.nn.CrossEntropyLoss(reduction="none")(x, idx) if mode == 'forward': triton.testing.assert_almost_equal(th_y, tt_y) # backward pass elif mode == 'backward': dy = torch.randn_like(tt_y) # triton backward tt_y.backward(dy) tt_dx = x.grad.clone() # torch backward x.grad.zero_() th_y.backward(dy) th_dx = x.grad.clone() triton.testing.assert_almost_equal(th_dx, tt_dx) triton-2.0.0/python/test/unit/operators/test_flash_attention.py000066400000000000000000000033421440023377100251230ustar00rootroot00000000000000import pytest import torch import triton @pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(4, 48, 1024, 64)]) def test_op(Z, H, N_CTX, D_HEAD, dtype=torch.float16): capability = torch.cuda.get_device_capability() if capability[0] < 8: pytest.skip("Flash attention only supported for compute capability < 80") torch.manual_seed(20) q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() sm_scale = 0.2 dout = torch.randn_like(q) # reference implementation M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) p = torch.matmul(q, k.transpose(2, 3)) * sm_scale for z in range(Z): for h in range(H): p[:, :, M == 0] = float("-inf") p = torch.softmax(p.float(), dim=-1).half() # p = torch.exp(p) ref_out = torch.matmul(p, v) ref_out.backward(dout) ref_dv, v.grad = v.grad.clone(), None ref_dk, k.grad = k.grad.clone(), None ref_dq, q.grad = q.grad.clone(), None # # triton implementation tri_out = triton.ops.attention(q, k, v, sm_scale) # print(ref_out) # print(tri_out) tri_out.backward(dout) tri_dv, v.grad = v.grad.clone(), None tri_dk, k.grad = k.grad.clone(), None tri_dq, q.grad = 
q.grad.clone(), None # compare triton.testing.assert_almost_equal(ref_out, tri_out) triton.testing.assert_almost_equal(ref_dv, tri_dv) triton.testing.assert_almost_equal(ref_dk, tri_dk) triton.testing.assert_almost_equal(ref_dq, tri_dq) triton-2.0.0/python/test/unit/operators/test_matmul.py000066400000000000000000000117401440023377100232410ustar00rootroot00000000000000import itertools import pytest import torch import triton @pytest.mark.parametrize( "BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE", itertools.chain( *[ [ # 1 warp (16, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), (32, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), (16, 32, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE), (16, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), (32, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), (16, 32, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE), (16, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), (64, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), (16, 64, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE), # 2 warp (64, 32, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE), (32, 64, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE), (64, 32, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE), (32, 64, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE), (128, 32, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE), (32, 128, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE), # 4 warp (128, 64, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE), (64, 128, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE), (128, 32, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE), (32, 128, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE), (128, 32, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE), (32, 128, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE), # 8 warp (128, 256, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE), (256, 128, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE), (256, 128, 32, 1, 8, 2, None, None, None, AT, BT, DTYPE), # split-k (64, 64, 16, 2, 4, 2, None, None, None, AT, BT, DTYPE), (64, 64, 16, 4, 4, 2, None, None, None, AT, BT, DTYPE), (64, 64, 16, 8, 4, 2, None, None, None, AT, BT, DTYPE), # variable input (128, 128, 32, 1, 4, 2, 1024, 1024, 1024, AT, BT, DTYPE), (128, 128, 32, 1, 4, 2, 384, 128, 640, AT, BT, DTYPE), (128, 128, 32, 1, 4, 2, 107, 233, 256, AT, BT, DTYPE), (128, 128, 32, 1, 4, 2, 107, 233, 311, AT, BT, DTYPE), ] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True] ], # n-stage *[ [ (16, 16, 16, 1, 1, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), (64, 32, 64, 1, 2, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), (128, 64, 16, 1, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), (256, 128, 32, 1, 8, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), (128, 128, 32, 1, 4, STAGES, 384, 128, 640, AT, BT, DTYPE), # split-k (64, 64, 16, 8, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE), (64, 64, 16, 8, 4, STAGES, 1024, 1024, 32, AT, BT, DTYPE), ] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True] for STAGES in [2, 3, 4] ] ), ) def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE): capability = torch.cuda.get_device_capability() if capability[0] < 7: pytest.skip("Only test tl.dot() on devices with sm >= 70") if capability[0] < 8 and DTYPE == "bfloat16": pytest.skip("Only test bfloat16 on devices with sm >= 80") if DTYPE == "bfloat16" and SPLIT_K != 1: pytest.skip("bfloat16 matmuls don't allow split_k for now") torch.manual_seed(0) # nuke kernel decorators -- will set meta-parameters manually 
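    # The autotuning decorators on triton.ops._matmul.kernel are bypassed here: a single
    # triton.Config with the requested BLOCK_M/BLOCK_N/BLOCK_K/SPLIT_K, num_warps and
    # num_stages is pinned below, so each parametrization exercises exactly one
    # configuration. The pre_hook zeroes C only when SPLIT_K > 1, presumably because
    # split-K accumulates partial products into C.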
kwargs = {'BLOCK_M': BLOCK_M, 'BLOCK_N': BLOCK_N, 'BLOCK_K': BLOCK_K, 'SPLIT_K': SPLIT_K} pre_hook = None if SPLIT_K == 1 else lambda nargs: nargs['C'].zero_() configs = [triton.Config(kwargs=kwargs, num_warps=NWARP, num_stages=NSTAGE, pre_hook=pre_hook)] kernel = triton.ops._matmul.kernel kernel.configs = configs # kernel.run = kernel.run.run.run # get matrix shape M = BLOCK_M if M is None else M N = BLOCK_N if N is None else N K = BLOCK_K * SPLIT_K if K is None else K # allocate/transpose inputs DTYPE = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[DTYPE] a = .1 * torch.randn((K, M) if AT else (M, K), device="cuda", dtype=DTYPE) b = .1 * torch.randn((N, K) if BT else (K, N), device="cuda", dtype=DTYPE) a = a.t() if AT else a b = b.t() if BT else b # run test th_c = torch.matmul(a, b) tt_c = triton.testing.catch_oor(lambda: triton.ops.matmul(a, b), pytest) triton.testing.assert_almost_equal(th_c, tt_c) triton-2.0.0/python/test/unit/runtime/000077500000000000000000000000001440023377100177735ustar00rootroot00000000000000triton-2.0.0/python/test/unit/runtime/test_cache.py000066400000000000000000000121661440023377100224550ustar00rootroot00000000000000import multiprocessing import os import re import shutil from collections import namedtuple import pytest import torch import triton import triton.language as tl from triton.runtime.jit import JITFunction tmpdir = ".tmp" @triton.jit def function_1(i): i = i + 1 i = function_2(i) return i @triton.jit def function_2(i): i = i + 1 return i @triton.jit def kernel(X, i, BLOCK: tl.constexpr): i = i + 1 i = function_1(i) tl.store(X, i) @triton.jit(do_not_specialize=["i"]) def kernel_nospec(X, i, BLOCK: tl.constexpr): i = i + 1 i = function_1(i) tl.store(X, i) def apply_src_change(target, old, new): kernel.hash = None function_1.hash = None function_2.hash = None function_1.src = function_1.src.replace(old, new) target.src = target.src.replace(old, new) ret = target.cache_key target.src = target.src.replace(new, old) return ret def test_nochange(): baseline = kernel.cache_key updated = apply_src_change(kernel, 'i + 1', 'i + 1') assert baseline == updated def test_toplevel_change(): baseline = kernel.cache_key updated = apply_src_change(kernel, 'i + 1', 'i + 2') assert baseline != updated def test_nested1_change(): baseline = kernel.cache_key updated = apply_src_change(function_1, 'i + 1', 'i + 2') assert baseline != updated def reset_tmp_dir(): os.environ["TRITON_CACHE_DIR"] = tmpdir if os.path.exists(tmpdir): shutil.rmtree(tmpdir) def test_reuse(): counter = 0 def inc_counter(*args, **kwargs): nonlocal counter counter += 1 JITFunction.cache_hook = inc_counter reset_tmp_dir() x = torch.empty(1, dtype=torch.int32, device='cuda') for i in range(10): kernel[(1,)](x, 1, BLOCK=1024) assert counter == 1 @pytest.mark.parametrize('mode', ['enable', 'disable']) def test_specialize(mode): counter = 0 def inc_counter(*args, **kwargs): nonlocal counter counter += 1 JITFunction.cache_hook = inc_counter reset_tmp_dir() x = torch.empty(1, dtype=torch.int32, device='cuda') function = {'enable': kernel, 'disable': kernel_nospec}[mode] target = {'enable': 3, 'disable': 1}[mode] for i in [1, 2, 4, 8, 16, 32]: function[(1,)](x, i, BLOCK=512) assert counter == target @pytest.mark.parametrize("value, value_type", [ (-1, 'i32'), (0, 'i32'), (1, 'i32'), (-2**31, 'i32'), (2**31 - 1, 'i32'), (2**32, 'i64'), (2**63 - 1, 'i64'), (-2**63, 'i64'), (2**31, 'u32'), (2**32 - 1, 'u32'), (2**63, 'u64'), (2**64 - 1, 'u64') ]) def 
test_value_specialization(value: int, value_type: str, device='cuda') -> None: @triton.jit def kernel(VALUE, X): pass cache_str = None def get_cache_str(*args, **kwargs): nonlocal cache_str cache_str = kwargs["repr"] triton.JITFunction.cache_hook = get_cache_str reset_tmp_dir() x = torch.tensor([3.14159], device='cuda') kernel[(1, )](value, x) triton.JITFunction.cache_hook = None cache_str_match = re.match(r".*VALUE: (\w+).*", cache_str) spec_type = None if cache_str_match is None else cache_str_match.group(1) assert spec_type == value_type def test_constexpr_not_callable() -> None: @triton.jit def kernel(X, c: tl.constexpr): tl.store(X, 2) x = torch.empty(1, dtype=torch.int32, device='cuda') error = False try: kernel[(1, )](x, c="str") except BaseException: error = True assert error is False # try and catch try: kernel[(1, )](x, c=tl.abs) except BaseException: error = True assert error is True def test_jit_warmup_cache() -> None: @triton.jit def kernel_add(a, b, o, N: tl.constexpr): idx = tl.arange(0, N) tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx)) args = [ torch.randn(32, dtype=torch.float32, device="cuda"), torch.randn(32, dtype=torch.float32, device="cuda"), torch.randn(32, dtype=torch.float32, device="cuda"), 32, ] assert len(kernel_add.cache) == 0 kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,)) assert len(kernel_add.cache) == 1 kernel_add.warmup(*args, grid=(1,)) assert len(kernel_add.cache) == 1 kernel_add.warmup(*args, grid=(1,)) assert len(kernel_add.cache) == 1 def test_compile_in_subproc() -> None: @triton.jit def kernel_sub(a, b, o, N: tl.constexpr): idx = tl.arange(0, N) tl.store(o + idx, tl.load(a + idx) - tl.load(b + idx) * 777) major, minor = torch.cuda.get_device_capability(0) cc = major * 10 + minor config = namedtuple("instance_descriptor", [ "divisible_by_16", "equal_to_1"])( tuple(range(4)), ()) proc = multiprocessing.Process( target=triton.compile, kwargs=dict( fn=kernel_sub, signature={0: "*fp32", 1: "*fp32", 2: "*fp32"}, device=0, constants={3: 32}, configs=[config], warm_cache_only=True, cc=cc, )) proc.start() proc.join() assert proc.exitcode == 0 triton-2.0.0/python/triton/000077500000000000000000000000001440023377100156715ustar00rootroot00000000000000triton-2.0.0/python/triton/_C/000077500000000000000000000000001440023377100162125ustar00rootroot00000000000000triton-2.0.0/python/triton/_C/include000077700000000000000000000000001440023377100220412../../../include/ustar00rootroot00000000000000triton-2.0.0/python/triton/__init__.py000066400000000000000000000016711440023377100200070ustar00rootroot00000000000000"""isort:skip_file""" __version__ = '2.0.0' # --------------------------------------- # Note: import order is significant here. # TODO: torch needs to be imported first # or pybind11 shows `munmap_chunk(): invalid pointer` import torch # noqa: F401 # submodules from . import impl from .utils import ( cdiv, MockTensor, next_power_of_2, reinterpret, TensorWrapper, ) from .runtime import ( autotune, Config, heuristics, JITFunction, KernelInterface, ) from .runtime.jit import jit from .compiler import compile, CompilationError from . import language from . import testing from . 
import ops __all__ = [ "autotune", "cdiv", "CompilationError", "compile", "Config", "heuristics", "impl", "jit", "JITFunction", "KernelInterface", "language", "MockTensor", "next_power_of_2", "ops", "reinterpret", "runtime", "TensorWrapper", "testing", ] triton-2.0.0/python/triton/compiler.py000066400000000000000000002156251440023377100200700ustar00rootroot00000000000000from __future__ import annotations import ast import contextlib import functools import hashlib import io import json import os import re import shutil import subprocess import sys import sysconfig import tempfile import warnings from collections import namedtuple from pathlib import Path from sysconfig import get_paths from typing import Any, Callable, Dict, Tuple, Union import setuptools import torch from filelock import FileLock import triton import triton._C.libtriton.triton as _triton from . import impl from .tools.disasm import extract def str_to_ty(name): if name[0] == "*": ty = str_to_ty(name[1:]) return triton.language.pointer_type(ty) tys = { "fp8": triton.language.float8, "fp16": triton.language.float16, "bf16": triton.language.bfloat16, "fp32": triton.language.float32, "fp64": triton.language.float64, "i1": triton.language.int1, "i8": triton.language.int8, "i16": triton.language.int16, "i32": triton.language.int32, "i64": triton.language.int64, "u8": triton.language.uint8, "u16": triton.language.uint16, "u32": triton.language.uint32, "u64": triton.language.uint64, "B": triton.language.int1, } return tys[name] def mangle_ty(ty): if ty.is_ptr(): return 'P' + mangle_ty(ty.element_ty) if ty.is_int(): return 'i' + str(ty.int_bitwidth) if ty.is_fp8(): return 'fp8' if ty.is_fp16(): return 'fp16' if ty.is_bf16(): return 'bf16' if ty.is_fp32(): return 'fp32' if ty.is_fp64(): return 'fp64' if ty.is_block(): elt = mangle_ty(ty.scalar) shape = '_'.join(map(str, ty.shape)) return f'{elt}S{shape}S' if ty.is_void(): return 'V' assert False, "Unsupported type" def mangle_fn(name, arg_tys, constants): # doesn't mangle ret type, which must be a function of arg tys mangled_arg_names = '_'.join([mangle_ty(ty) for ty in arg_tys]) mangled_constants = '_'.join([f'{i}c{repr(constants[i])}' for i in sorted(constants)]) mangled_constants = mangled_constants.replace('.', '_d_') mangled_constants = mangled_constants.replace("'", '_sq_') ret = f'{name}__{mangled_arg_names}__{mangled_constants}' return ret class enter_sub_region: def __init__(self, generator: CodeGenerator): self.generator = generator def __enter__(self): # record lscope & local_defs in the parent scope self.liveins = self.generator.lscope.copy() self.prev_defs = self.generator.local_defs.copy() self.generator.local_defs = {} self.insert_block = self.generator.builder.get_insertion_block() self.insert_point = self.generator.builder.get_insertion_point() return self.liveins, self.insert_block def __exit__(self, *args, **kwargs): self.generator.builder.restore_insertion_point(self.insert_point) self.generator.lscope = self.liveins self.generator.local_defs = self.prev_defs class CodeGenerator(ast.NodeVisitor): def __init__(self, context, prototype, gscope, attributes, constants, function_name, module=None, is_kernel=False, function_types=dict()): self.builder = _triton.ir.builder(context) self.module = self.builder.create_module() if module is None else module self.function_ret_types = function_types self.prototype = prototype self.gscope = gscope self.lscope = dict() self.attributes = attributes self.constants = constants self.function_name = function_name self.is_kernel = 
is_kernel self.last_node = None self.builtins = { 'range': range, 'min': triton.language.minimum, 'float': float, 'int': int, 'print': print, 'isinstance': isinstance, 'getattr': getattr, } self.scf_stack = [] # SSA-construction # name => triton.language.tensor self.local_defs: Dict[str, triton.language.tensor] = {} self.global_uses: Dict[str, triton.language.tensor] = {} def get_value(self, name): ''' This function: 1. make sure `name` is defined 2. if `name` is triton.language.tensor, get stored tensor by calling `self._get_tensor()` ''' # search node.id in local scope ret = None if name in self.lscope: ret = self.lscope[name] if name not in self.local_defs: self.global_uses[name] = ret # search node.id in global scope elif name in self.gscope: ret = self.gscope[name] # search node.id in builtins elif name in self.builtins: ret = self.builtins[name] else: raise ValueError(f'{name} is not defined') return ret def set_value(self, name: str, value: Union[triton.language.tensor, triton.language.constexpr]) -> None: ''' This function: called by visit_Assign() & visit_FuncDef() to store left value (lvalue) 1. record local defined name (FIXME: should consider control flow) 2. store tensor in self.lvalue ''' self.lscope[name] = value self.local_defs[name] = value def is_triton_tensor(self, value): return isinstance(value, triton.language.tensor) # # AST visitor # def visit_compound_statement(self, stmts): for stmt in stmts: self.last_ret_type = self.visit(stmt) if isinstance(stmt, ast.Return): break return stmts and isinstance(stmt, ast.Return) def visit_Module(self, node): ast.NodeVisitor.generic_visit(self, node) def visit_List(self, node): ctx = self.visit(node.ctx) assert ctx is None elts = [self.visit(elt) for elt in node.elts] return elts # By design, only non-kernel functions can return def visit_Return(self, node): ret_value = self.visit(node.value) # ret_block = self.builder.create_block() # post_ret_block = self.builder.create_block() # self.builder.create_branch(ret_block) # self.builder.set_insertion_point_to_end(ret_block) if ret_value is None: self.builder.ret([]) ret_ty = None elif isinstance(ret_value, tuple): ret_values = [triton.language.core._to_tensor(v, self.builder) for v in ret_value] ret_types = [v.type for v in ret_values] self.builder.ret([v.handle for v in ret_values]) ret_ty = tuple(ret_types) else: ret = triton.language.core._to_tensor(ret_value, self.builder) self.builder.ret([ret.handle]) ret_ty = ret.type # self.builder.create_branch(post_ret_block) # self.builder.set_insertion_point_to_end(post_ret_block) return ret_ty def visit_FunctionDef(self, node): arg_names, kwarg_names = self.visit(node.args) # initialize defaults for i, default_value in enumerate(node.args.defaults): arg_node = node.args.args[-i - 1] annotation = arg_node.annotation name = arg_node.arg st_target = ast.Name(id=name, ctx=ast.Store()) if annotation is None: init_node = ast.Assign(targets=[st_target], value=default_value) else: init_node = ast.AnnAssign(target=st_target, value=default_value, annotation=annotation) self.visit(init_node) # initialize function visibility = "public" if self.is_kernel else "private" fn = self.builder.get_or_insert_function(self.module, self.function_name, self.prototype.to_ir(self.builder), visibility) self.module.push_back(fn) entry = fn.add_entry_block() arg_values = [] idx = 0 for i, arg_name in enumerate(arg_names): if i in self.constants: cst = self.constants[i] if not isinstance(cst, triton.language.constexpr): cst = 
triton.language.constexpr(self.constants[i]) arg_values.append(cst) continue else: if i in self.attributes: fn.set_arg_attr(idx, "tt.divisibility", self.attributes[i][1]) arg_values.append(triton.language.tensor(fn.args(idx), self.prototype.param_types[idx])) idx += 1 insert_pt = self.builder.get_insertion_block() for arg_name, arg_value in zip(arg_names, arg_values): self.set_value(arg_name, arg_value) self.builder.set_insertion_point_to_start(entry) # visit function body has_ret = self.visit_compound_statement(node.body) # finalize function if not has_ret: self.builder.ret([]) else: # update return type if isinstance(self.last_ret_type, tuple): self.prototype.ret_types = list(self.last_ret_type) fn.reset_type(self.prototype.to_ir(self.builder)) else: self.prototype.ret_types = [self.last_ret_type] fn.reset_type(self.prototype.to_ir(self.builder)) if insert_pt: self.builder.set_insertion_point_to_end(insert_pt) def visit_arguments(self, node): arg_names = [] for arg in node.args: arg_names += [self.visit(arg)] kwarg_names = self.visit(node.kwarg) return arg_names, kwarg_names def visit_arg(self, node): ast.NodeVisitor.generic_visit(self, node) return node.arg def visit_AnnAssign(self, node): # extract attributes annotation = self.visit(node.annotation) target = self.visit(node.target) value = self.visit(node.value) # constexpr if annotation == triton.language.constexpr: if target in self.lscope: raise ValueError(f'{target} is already defined.' f' constexpr cannot be reassigned.') if not isinstance(value, triton.language.constexpr): value = triton.language.constexpr(value) self.lscope[target] = value return self.lscope[target] # default: call visit_Assign return self.visit_Assign(node) def visit_Assign(self, node): _names = [] for target in node.targets: _names += [self.visit(target)] assert len(_names) == 1 names = _names[0] values = self.visit(node.value) if not isinstance(names, tuple): names = [names] if not isinstance(values, tuple): values = [values] for name, value in zip(names, values): # by default, constexpr are assigned into python variable if isinstance(value, triton.language.constexpr): value = value.value if not isinstance(value, triton.language.tensor): value = triton.language.core._to_tensor(value, self.builder) self.set_value(name, value) def visit_AugAssign(self, node): name = node.target.id lhs = ast.Name(id=name, ctx=ast.Load()) rhs = ast.BinOp(lhs, node.op, node.value) assign = ast.Assign(targets=[node.target], value=rhs) self.visit(assign) return self.get_value(name) def visit_Name(self, node): if type(node.ctx) == ast.Store: return node.id return self.get_value(node.id) def visit_Store(self, node): ast.NodeVisitor.generic_visit(self, node) def visit_Load(self, node): ast.NodeVisitor.generic_visit(self, node) def visit_Tuple(self, node): args = [self.visit(x) for x in node.elts] return tuple(args) def visit_BinOp(self, node): lhs = self.visit(node.left) rhs = self.visit(node.right) fn = { ast.Add: '__add__', ast.Sub: '__sub__', ast.Mult: '__mul__', ast.Div: '__truediv__', ast.FloorDiv: '__floordiv__', ast.Mod: '__mod__', ast.Pow: '__pow__', ast.LShift: '__lshift__', ast.RShift: '__rshift__', ast.BitAnd: '__and__', ast.BitOr: '__or__', ast.BitXor: '__xor__', }[type(node.op)] if self.is_triton_tensor(lhs): return getattr(lhs, fn)(rhs, _builder=self.builder) elif self.is_triton_tensor(rhs): fn = fn[:2] + 'r' + fn[2:] return getattr(rhs, fn)(lhs, _builder=self.builder) else: return getattr(lhs, fn)(rhs) def visit_then_else_blocks(self, node, liveins, then_block, 
else_block): # then block self.builder.set_insertion_point_to_start(then_block) self.visit_compound_statement(node.body) then_block = self.builder.get_insertion_block() then_defs = self.local_defs.copy() # else block else_defs = {} if node.orelse: self.builder.set_insertion_point_to_start(else_block) self.lscope = liveins.copy() self.local_defs = {} self.visit_compound_statement(node.orelse) else_defs = self.local_defs.copy() else_block = self.builder.get_insertion_block() # update block arguments names = [] ret_types = [] ir_ret_types = [] # variables in livein whose value is updated in `if` for name in liveins: # check type for defs, block_name in [(then_defs, 'then'), (else_defs, 'else')]: if name in defs: assert defs[name].type == liveins[name].type,\ f'initial value for `{name}` is of type {liveins[name].type}, '\ f'but the {block_name} block redefines it as {defs[name].type}' if name in then_defs or name in else_defs: names.append(name) ret_types.append(then_defs[name].type if name in then_defs else else_defs[name].type) ir_ret_types.append(then_defs[name].handle.get_type() if name in then_defs else else_defs[name].handle.get_type()) # variable defined in then but not in else if name in then_defs and name not in else_defs: else_defs[name] = liveins[name] # variable defined in else but not in then if name in else_defs and name not in then_defs: then_defs[name] = liveins[name] # variables that are both in then and else but not in liveins # TODO: could probably be cleaned up for name in then_defs.keys() & else_defs.keys(): if name in names: continue then_ty = then_defs[name].type else_ty = else_defs[name].type assert then_ty == else_ty,\ f'mismatched type for {name} between then block ({then_ty}) '\ f'and else block ({else_ty})' names.append(name) ret_types.append(then_ty) ir_ret_types.append(then_defs[name].handle.get_type()) return then_defs, else_defs, then_block, else_block, names, ret_types, ir_ret_types def visit_if_top_level(self, cond, node): with enter_sub_region(self) as sr: liveins, ip_block = sr then_block = self.builder.create_block() else_block = self.builder.create_block() # create basic-block after conditional endif_block = self.builder.create_block() # create branch self.builder.set_insertion_point_to_end(ip_block) self.builder.create_cond_branch(cond.handle, then_block, else_block) # visit then and else blocks then_defs, else_defs, then_block, else_block, names, ret_types, ir_ret_types = \ self.visit_then_else_blocks(node, liveins, then_block, else_block) # then terminator self.builder.set_insertion_point_to_end(then_block) if not then_block.has_terminator(): self.builder.create_branch(endif_block, [then_defs[n].handle for n in names]) # else terminator self.builder.set_insertion_point_to_end(else_block) if not else_block.has_terminator(): self.builder.create_branch(endif_block, [else_defs[n].handle for n in names]) for ty in ir_ret_types: endif_block.add_argument(ty) # change block self.builder.set_insertion_point_to_start(endif_block) # update value for i, name in enumerate(names): new_tensor = triton.language.core.tensor(endif_block.arg(i), ret_types[i]) self.set_value(name, new_tensor) # TODO: refactor def visit_if_scf(self, cond, node): with enter_sub_region(self) as sr: liveins, _ = sr ip = self.builder.get_insertion_point() then_block = self.builder.create_block() else_block = self.builder.create_block() if node.orelse else None then_defs, else_defs, then_block, else_block, names, ret_types, _ = \ self.visit_then_else_blocks(node, liveins, then_block, 
else_block) # create if op self.builder.restore_insertion_point(ip) if_op = self.builder.create_if_op([ty.to_ir(self.builder) for ty in ret_types], cond.handle, True) then_block.merge_block_before(if_op.get_then_block()) self.builder.set_insertion_point_to_end(if_op.get_then_block()) if len(names) > 0: self.builder.create_yield_op([then_defs[n].handle for n in names]) if not node.orelse: else_block = if_op.get_else_block() else: else_block.merge_block_before(if_op.get_else_block()) self.builder.set_insertion_point_to_end(if_op.get_else_block()) if len(names) > 0: self.builder.create_yield_op([else_defs[n].handle for n in names]) # update values for i, name in enumerate(names): new_tensor = triton.language.core.tensor(if_op.get_result(i), ret_types[i]) self.set_value(name, new_tensor) def visit_If(self, node): cond = self.visit(node.test) if isinstance(cond, triton.language.tensor): cond = cond.to(triton.language.int1, _builder=self.builder) if self.scf_stack: self.visit_if_scf(cond, node) else: self.visit_if_top_level(cond, node) else: if isinstance(cond, triton.language.constexpr): cond = cond.value if cond: self.visit_compound_statement(node.body) else: self.visit_compound_statement(node.orelse) def visit_IfExp(self, node): cond = self.visit(node.test) if cond.value: return self.visit(node.body) else: return self.visit(node.orelse) def visit_Pass(self, node): pass def visit_Compare(self, node): assert len(node.comparators) == 1 assert len(node.ops) == 1 lhs = self.visit(node.left) rhs = self.visit(node.comparators[0]) if isinstance(lhs, triton.language.constexpr): lhs = lhs.value if isinstance(rhs, triton.language.constexpr): rhs = rhs.value if type(node.ops[0]) == ast.Is: return triton.language.constexpr(lhs is rhs) if type(node.ops[0]) == ast.IsNot: return triton.language.constexpr(lhs is not rhs) fn = { ast.Eq: '__eq__', ast.NotEq: '__ne__', ast.Lt: '__lt__', ast.LtE: '__le__', ast.Gt: '__gt__', ast.GtE: '__ge__', }[type(node.ops[0])] if self.is_triton_tensor(lhs): return getattr(lhs, fn)(rhs, _builder=self.builder) elif self.is_triton_tensor(rhs): fn = fn[:2] + 'r' + fn[2:] return getattr(rhs, fn)(lhs, _builder=self.builder) else: return getattr(lhs, fn)(rhs) def visit_UnaryOp(self, node): op = self.visit(node.operand) fn = { ast.USub: '__neg__', ast.UAdd: '__pos__', ast.Not: '__not__', ast.Invert: '__invert__', }[type(node.op)] if self.is_triton_tensor(op): return getattr(op, fn)(_builder=self.builder) return getattr(op, fn)() def visit_While(self, node): with enter_sub_region(self) as sr: liveins, insert_block = sr # loop body (the after region) # loop_block = self.builder.create_block() dummy = self.builder.create_block() self.builder.set_insertion_point_to_start(dummy) self.scf_stack.append(node) self.visit_compound_statement(node.body) self.scf_stack.pop() loop_defs = self.local_defs # collect loop-carried values names = [] ret_types = [] init_args = [] for name in loop_defs: if name in liveins: # We should not def new constexpr assert self.is_triton_tensor(loop_defs[name]) assert self.is_triton_tensor(liveins[name]) assert loop_defs[name].type == liveins[name].type # these are loop-carried values names.append(name) ret_types.append(loop_defs[name].type) init_args.append(liveins[name]) self.builder.set_insertion_point_to_end(insert_block) while_op = self.builder.create_while_op([ty.to_ir(self.builder) for ty in ret_types], [arg.handle for arg in init_args]) # merge the condition region before_block = self.builder.create_block_with_parent(while_op.get_before(), 
[ty.to_ir(self.builder) for ty in ret_types]) self.builder.set_insertion_point_to_start(before_block) for i, name in enumerate(names): self.lscope[name] = triton.language.core.tensor(before_block.arg(i), ret_types[i]) self.local_defs[name] = self.lscope[name] cond = self.visit(node.test) self.builder.set_insertion_point_to_end(before_block) # create ConditionOp: e.g., scf.condition(%cond) %arg0, %arg1, ... self.builder.create_condition_op(cond.handle, [before_block.arg(i) for i in range(len(init_args))]) # merge the loop body after_block = self.builder.create_block_with_parent(while_op.get_after(), [ty.to_ir(self.builder) for ty in ret_types]) # generate loop body self.builder.set_insertion_point_to_start(after_block) for i, name in enumerate(names): self.lscope[name] = triton.language.core.tensor(after_block.arg(i), ret_types[i]) self.local_defs[name] = self.lscope[name] self.scf_stack.append(node) self.visit_compound_statement(node.body) self.scf_stack.pop() loop_defs = self.local_defs yields = [] for name in loop_defs: if name in liveins: yields.append(loop_defs[name]) self.builder.create_yield_op([y.handle for y in yields]) # update global uses in while_op for i, name in enumerate(names): after_block.replace_use_in_block_with(init_args[i].handle, after_block.arg(i)) # WhileOp defines new values, update the symbol table (lscope, local_defs) for i, name in enumerate(names): new_def = triton.language.core.tensor(while_op.get_result(i), ret_types[i]) self.lscope[name] = new_def self.local_defs[name] = new_def for stmt in node.orelse: assert False, "Not implemented" ast.NodeVisitor.generic_visit(self, stmt) def visit_Subscript(self, node): assert node.ctx.__class__.__name__ == "Load" lhs = self.visit(node.value) slices = self.visit(node.slice) if self.is_triton_tensor(lhs): return lhs.__getitem__(slices, _builder=self.builder) return lhs[slices] def visit_ExtSlice(self, node): return [self.visit(dim) for dim in node.dims] def visit_For(self, node): IteratorClass = self.visit(node.iter.func) iter_args = [self.visit(arg) for arg in node.iter.args] if IteratorClass == triton.language.static_range: iterator = IteratorClass(*iter_args) static_range = range(iterator.start.value, iterator.end.value, iterator.step.value) for i in static_range: self.lscope[node.target.id] = triton.language.constexpr(i) self.visit_compound_statement(node.body) for stmt in node.orelse: ast.NodeVisitor.generic_visit(self, stmt) return if IteratorClass != self.builtins['range']: raise RuntimeError('Only `range` and `static_range` iterators are currently supported') # visit iterator arguments # note: only `range` iterator is supported now # collect lower bound (lb), upper bound (ub), and step lb = iter_args[0] if len(iter_args) > 1 else self.visit(ast.Num(0)) ub = iter_args[1] if len(iter_args) > 1 else self.visit(node.iter.args[0]) step = iter_args[2] if len(iter_args) > 2 else self.visit(ast.Num(1)) # handle negative constant step (not supported by scf.for in MLIR) negative_step = False if isinstance(step, triton.language.constexpr) and step.value < 0: step = triton.language.constexpr(-step.value) negative_step = True lb, ub = ub, lb # lb/ub/step might be constexpr, we need to cast them to tensor lb = triton.language.core._to_tensor(lb, self.builder).handle ub = triton.language.core._to_tensor(ub, self.builder).handle step = triton.language.core._to_tensor(step, self.builder).handle # ForOp can only accept IndexType as lb/ub/step. 
Cast integer to Index lb = self.builder.create_to_index(lb) ub = self.builder.create_to_index(ub) step = self.builder.create_to_index(step) # Create placeholder for the loop induction variable iv = self.builder.create_undef(self.builder.get_int32_ty()) self.set_value(node.target.id, triton.language.core.tensor(iv, triton.language.core.int32)) with enter_sub_region(self) as sr: liveins, insert_block = sr ip = self.builder.get_insertion_point() # create loop body block block = self.builder.create_block() self.builder.set_insertion_point_to_start(block) # dry visit loop body self.scf_stack.append(node) self.visit_compound_statement(node.body) self.scf_stack.pop() block.erase() # If a variable (name) is defined in both its parent & itself, then it's # a loop-carried variable. (They must be of the same type) init_args = [] yields = [] names = [] for name in self.local_defs: if name in liveins: assert self.is_triton_tensor(self.local_defs[name]), f'{name} is not tensor' assert self.is_triton_tensor(liveins[name]) assert self.local_defs[name].type == liveins[name].type,\ f'Loop-carried variable {name} has initial type {liveins[name].type} '\ f'but is re-assigned to {self.local_defs[name].type} in loop! '\ f'Please make sure that the type stays consistent.' names.append(name) init_args.append(triton.language.core._to_tensor(liveins[name], self.builder)) yields.append(triton.language.core._to_tensor(self.local_defs[name], self.builder)) # create ForOp self.builder.restore_insertion_point(ip) for_op = self.builder.create_for_op(lb, ub, step, [arg.handle for arg in init_args]) self.scf_stack.append(node) self.builder.set_insertion_point_to_start(for_op.get_body(0)) for i, name in enumerate(names): self.set_value(name, triton.language.core.tensor(for_op.get_body(0).arg(i + 1), yields[i].type)) self.visit_compound_statement(node.body) self.scf_stack.pop() yields = [] for name in self.local_defs: if name in liveins: yields.append(triton.language.core._to_tensor(self.local_defs[name], self.builder)) # create YieldOp if len(yields) > 0: self.builder.create_yield_op([y.handle for y in yields]) for_op_region = for_op.get_body(0).get_parent() assert for_op_region.size() == 1, "We use SCF, so the loop body should only have one block" # update induction variable with actual value, and replace all uses self.builder.set_insertion_point_to_start(for_op.get_body(0)) iv = self.builder.create_index_to_si(for_op.get_induction_var()) if negative_step: ub_si = self.builder.create_index_to_si(ub) iv = self.builder.create_sub(ub_si, iv) self.lscope[node.target.id].handle.replace_all_uses_with(iv) self.set_value(node.target.id, triton.language.core.tensor(iv, triton.language.core.int32)) # update lscope & local_defs (ForOp defines new values) for i, name in enumerate(names): self.set_value(name, triton.language.core.tensor(for_op.get_result(i), yields[i].type)) for stmt in node.orelse: assert False, "Don't know what to do with else after for" ast.NodeVisitor.generic_visit(self, stmt) def visit_Slice(self, node): lower = self.visit(node.lower) upper = self.visit(node.upper) step = self.visit(node.step) return slice(lower, upper, step) def visit_Index(self, node): return self.visit(node.value) def visit_keyword(self, node): return {node.arg: self.visit(node.value)} def visit_Call(self, node): fn = self.visit(node.func) if isinstance(fn, triton.language.constexpr): fn = fn.value kws = dict() for keyword in node.keywords: kws.update(self.visit(keyword)) args = [self.visit(arg) for arg in node.args] if isinstance(fn, 
triton.runtime.JITFunction): from inspect import getcallargs args = getcallargs(fn.fn, *args, **kws) args = [args[name] for name in fn.arg_names] args = [arg if isinstance(arg, triton.language.tensor) else triton.language.constexpr(arg) for arg in args] # generate function def attributes = dict() constexprs = [i for i, arg in enumerate(args) if isinstance(arg, triton.language.constexpr)] constants = {i: args[i] for i in constexprs} # generate call args = [None if i in constexprs else arg for i, arg in enumerate(args)] arg_vals = [arg.handle for arg in args if arg is not None] arg_types = [arg.type for arg in args if arg is not None] fn_name = mangle_fn(fn.__name__, arg_types, constants) # generate function def if necessary if not self.module.has_function(fn_name): prototype = triton.language.function_type([], arg_types) gscope = sys.modules[fn.fn.__module__].__dict__ generator = CodeGenerator(self.builder.context, prototype, gscope, attributes, constants, module=self.module, function_name=fn_name, function_types=self.function_ret_types) generator.visit(fn.parse()) callee_ret_type = generator.last_ret_type self.function_ret_types[fn_name] = callee_ret_type else: callee_ret_type = self.function_ret_types[fn_name] symbol = self.module.get_function(fn_name) call_op = self.builder.call(symbol, arg_vals) if call_op.get_num_results() == 0 or callee_ret_type is None: return None elif call_op.get_num_results() == 1: return triton.language.tensor(call_op.get_result(0), callee_ret_type) else: # should return a tuple of tl.tensor results = [] for i in range(call_op.get_num_results()): results.append(triton.language.tensor(call_op.get_result(i), callee_ret_type[i])) return tuple(results) if (hasattr(fn, '__self__') and self.is_triton_tensor(fn.__self__)) \ or impl.is_builtin(fn): return fn(*args, _builder=self.builder, **kws) if fn in self.builtins.values(): args = [arg.value if isinstance(arg, triton.language.constexpr) else arg for arg in args] return fn(*args, **kws) def visit_Constant(self, node): return triton.language.constexpr(node.value) def visit_BoolOp(self, node: ast.BoolOp): assert len(node.values) == 2 lhs = self.visit(node.values[0]) rhs = self.visit(node.values[1]) fn = { ast.And: 'logical_and', ast.Or: 'logical_or', }[type(node.op)] if self.is_triton_tensor(lhs): return getattr(lhs, fn)(rhs, _builder=self.builder) elif self.is_triton_tensor(rhs): fn = fn[:2] + 'r' + fn[2:] return getattr(rhs, fn)(lhs, _builder=self.builder) else: return getattr(lhs, fn)(rhs) if sys.version_info < (3, 8): def visit_NameConstant(self, node): return triton.language.constexpr(node.value) def visit_Num(self, node): return triton.language.constexpr(node.n) def visit_Str(self, node): return triton.language.constexpr(ast.literal_eval(node)) def visit_Attribute(self, node): lhs = self.visit(node.value) if isinstance(lhs, triton.language.tensor): if node.attr == "T": return triton.language.semantic.trans(lhs, builder=self.builder) return getattr(lhs, node.attr) def visit_Expr(self, node): ast.NodeVisitor.generic_visit(self, node) def visit_NoneType(self, node): return None def visit(self, node): if node is not None: self.last_node = node with warnings.catch_warnings(): # The ast library added visit_Constant and deprecated some other # methods but we can't move to that without breaking Python 3.6 and 3.7. 
warnings.simplefilter("ignore", DeprecationWarning) # python 3.9 warnings.simplefilter("ignore", PendingDeprecationWarning) # python 3.8 return super().visit(node) def generic_visit(self, node): typename = type(node).__name__ raise NotImplementedError("Unsupported node: {}".format(typename)) class CompilationError(Exception): def __init__(self, src, node): self.message = f'at {node.lineno}:{node.col_offset}:\n' self.message += '\n'.join(src.split('\n')[:node.lineno]) self.message += '\n' + ' ' * node.col_offset + '^' self.src = src self.node = node super().__init__(self.message) def __reduce__(self): # this is necessary to make CompilationError picklable return (type(self), (self.src, self.node)) class OutOfResources(Exception): def __init__(self, required, limit, name): self.message = f'out of resource: {name}, '\ f'Required: {required}, '\ f'Hardware limit: {limit}' self.message += '. Reducing block sizes or `num_stages` may help.' self.required = required self.limit = limit self.name = name super().__init__(self.message) def __reduce__(self): # this is necessary to make CompilationError picklable return (type(self), (self.required, self.limit, self.name)) def kernel_suffix(signature, specialization): # suffix format: # <'c' if equal to 1><'d' if divisible by 16> suffix = '' for i, _ in enumerate(signature): suffix += str(i) if i in specialization.equal_to_1: suffix += 'c' if i in specialization.divisible_by_16: suffix += 'd' return suffix # ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------ def parse_mlir_module(path, context): module = _triton.ir.parse_mlir_module(path, context) # module takes ownership of the context module.context = context return module def build_triton_ir(fn, signature, specialization, constants): # canonicalize signature if isinstance(signature, str): signature = {k: v.strip() for k, v in enumerate(signature.split(","))} context = _triton.ir.context() context.load_triton() # create kernel prototype cst_key = lambda i: fn.arg_names.index(i) if isinstance(i, str) else i constants = {cst_key(key): value for key, value in constants.items()} # visit kernel AST gscope = fn.__globals__.copy() function_name = '_'.join([fn.__name__, kernel_suffix(signature.values(), specialization)]) tys = list(signature.values()) new_constants = {k: True if k in tys and tys[k] == "i1" else 1 for k in specialization.equal_to_1} new_attrs = {k: ("multiple_of", 16) for k in specialization.divisible_by_16} all_constants = constants.copy() all_constants.update(new_constants) arg_types = [str_to_ty(v) for k, v in signature.items() if k not in constants] prototype = triton.language.function_type([], arg_types) generator = CodeGenerator(context, prototype, gscope=gscope, constants=all_constants, function_name=function_name, attributes=new_attrs, is_kernel=True) try: generator.visit(fn.parse()) except Exception as e: node = generator.last_node if node is None or isinstance(e, (NotImplementedError, CompilationError)): raise e raise CompilationError(fn.src, node) from e ret = generator.module # module takes ownership of the context ret.context = context return ret, generator def optimize_triton_ir(mod): pm = _triton.ir.pass_manager(mod.context) pm.enable_debug() pm.add_inliner_pass() pm.add_triton_combine_pass() pm.add_canonicalizer_pass() pm.add_cse_pass() pm.add_licm_pass() pm.run(mod) return mod def ast_to_ttir(fn, signature, specialization, constants): mod, _ = build_triton_ir(fn, 
signature, specialization, constants) return optimize_triton_ir(mod) def ttir_to_ttgir(mod, num_warps, num_stages, compute_capability): pm = _triton.ir.pass_manager(mod.context) pm.add_convert_triton_to_tritongpu_pass(num_warps) pm.enable_debug() pm.add_coalesce_pass() # The combine pass converts blocked layout to mma layout # for dot ops so that pipeline can get shared memory swizzled correctly. pm.add_tritongpu_combine_pass(compute_capability) pm.add_tritongpu_pipeline_pass(num_stages) # Prefetch must be done after pipeline pass because pipeline pass # extracts slices from the original tensor. pm.add_tritongpu_prefetch_pass() pm.add_canonicalizer_pass() pm.add_cse_pass() pm.add_tritongpu_combine_pass(compute_capability) pm.add_licm_pass() pm.add_tritongpu_combine_pass(compute_capability) pm.add_cse_pass() pm.add_tritongpu_decompose_conversions_pass() if compute_capability // 10 == 7: # The update_mma_for_volta pass helps to compute some information for MMA encoding specifically for MMAv1 # NOTE this pass should be placed after all the passes those modifies mma layout pm.add_tritongpu_update_mma_for_volta_pass() pm.add_cse_pass() pm.add_symbol_dce_pass() pm.add_tritongpu_reorder_instructions_pass() pm.run(mod) return mod def add_external_libs(mod, libs): for name, path in libs.items(): if len(name) == 0 or len(path) == 0: return _triton.add_external_libs(mod, list(libs.keys()), list(libs.values())) def ttgir_to_llir(mod, extern_libs, compute_capability): if extern_libs: add_external_libs(mod, extern_libs) return _triton.translate_triton_gpu_to_llvmir(mod, compute_capability) def llir_to_ptx(mod: Any, compute_capability: int, ptx_version: int = None) -> Tuple[str, int]: ''' Translate TritonGPU module to PTX code. :param mod: a TritonGPU dialect module :return: - PTX code - shared memory allocation size ''' if ptx_version is None: _, cuda_version = path_to_ptxas() ptx_version = ptx_get_version(cuda_version) return _triton.translate_llvmir_to_ptx(mod, compute_capability, ptx_version) def ptx_to_cubin(ptx: str, compute_capability: int): ''' Compile TritonGPU module to cubin. :param ptx: ptx code :param compute_capability: compute capability :return: str ''' ptxas, _ = path_to_ptxas() return _triton.compile_ptx_to_cubin(ptx, ptxas, compute_capability) def ptx_get_kernel_name(ptx: str) -> str: ''' Get kernel name from PTX code. This Kernel name is required when launching the kernel. ''' # There is a name mangling in PTX codegen, so the original kernel names in Triton IR are not available in PTX/cubin. assert ptx for line in ptx.split('\n'): line = line.strip() if line.startswith('// .globl'): return line.split()[-1] @functools.lru_cache def ptx_get_version(cuda_version) -> int: ''' Get the highest PTX version supported by the current CUDA driver. 
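    (CUDA 11.x toolkits correspond to PTX ISA 7.x, CUDA 12.x to 8.x, and
    CUDA 10.x to 6.(3 + minor), matching the arithmetic below.)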
''' assert isinstance(cuda_version, str) major, minor = map(int, cuda_version.split('.')) if major == 12: return 80 + minor if major == 11: return 70 + minor if major == 10: return 63 + minor raise RuntimeError("Triton only support CUDA 10.0 or higher") def path_to_ptxas(): base_dir = os.path.dirname(__file__) paths = [ os.environ.get("TRITON_PTXAS_PATH", ""), os.path.join(base_dir, "third_party", "cuda", "bin", "ptxas") ] for ptxas in paths: if os.path.exists(ptxas) and os.path.isfile(ptxas): result = subprocess.check_output([ptxas, "--version"], stderr=subprocess.STDOUT) if result is not None: version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE) if version is not None: return ptxas, version.group(1) raise RuntimeError("Cannot find ptxas") instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"], defaults=[set(), set()]) # ------------------------------------------------------------------------------ # compiler # ------------------------------------------------------------------------------ def ty_to_cpp(ty): if ty[0] == '*': return "CUdeviceptr" return { "i1": "int32_t", "i8": "int8_t", "i16": "int16_t", "i32": "int32_t", "i64": "int64_t", "u32": "uint32_t", "u64": "uint64_t", "fp16": "float", "bf16": "float", "fp32": "float", "f32": "float", "fp64": "double", }[ty] def generate_name_initializer(signature): src = "int i = 0;\n" tys = signature.split(',') for i, ty in enumerate(tys): src def binary_name_to_header_name(name): if len(name) > 128: # avoid filename too long errors (filename limit is 255) name = "kernel_" + hashlib.sha256(name.encode("utf-8")).hexdigest() return f"{name}.h" def generate_launcher(constants, signature): arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items()) def _extracted_type(ty): if ty[0] == '*': return "PyObject*" return { 'i1': 'int32_t', 'i32': 'int32_t', 'i64': 'int64_t', 'u32': 'uint32_t', 'u64': 'uint64_t', 'fp16': 'float', 'bf16': 'float', 'fp32': 'float', 'f32': 'float', 'fp64': 'double', }[ty] def format_of(ty): return { "PyObject*": "O", "float": "f", "double": "d", "long": "l", "uint32_t": "I", "int32_t": "i", "uint64_t": "K", "int64_t": "L", }[ty] format = "iiiiiKKOOO" + ''.join([format_of(_extracted_type(ty)) for ty in signature.values()]) # generate glue code src = f""" #include \"cuda.h\" #include #include static inline void gpuAssert(CUresult code, const char *file, int line) {{ if (code != CUDA_SUCCESS) {{ const char* prefix = "Triton Error [CUDA]: "; const char* str; cuGetErrorString(code, &str); char err[1024] = {{0}}; strcat(err, prefix); strcat(err, str); PyErr_SetString(PyExc_RuntimeError, err); }} }} #define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, CUstream stream, CUfunction function, {arg_decls}) {{ void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }}; if(gridX*gridY*gridZ > 0){{ CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0)); }} }} typedef struct _DevicePtrInfo {{ CUdeviceptr dev_ptr; bool valid; }} DevicePtrInfo; static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ DevicePtrInfo ptr_info; ptr_info.dev_ptr = 0; ptr_info.valid = true; if (PyLong_Check(obj)) {{ ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj); return ptr_info; }} if (obj == Py_None) {{ // valid nullptr return ptr_info; }} PyObject *ptr = 
PyObject_GetAttrString(obj, "data_ptr"); if(ptr){{ PyObject *empty_tuple = PyTuple_New(0); PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); Py_DECREF(empty_tuple); Py_DECREF(ptr); if (!PyLong_Check(ret)) {{ PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); ptr_info.valid = false; return ptr_info; }} ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret); unsigned attr; CUresult status = cuPointerGetAttribute(&attr, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr_info.dev_ptr); if (ptr_info.dev_ptr && (!(attr == CU_MEMORYTYPE_DEVICE || attr == CU_MEMORYTYPE_UNIFIED) || !(status == CUDA_SUCCESS))) {{ PyErr_Format(PyExc_ValueError, "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); ptr_info.valid = false; }} return ptr_info; }} PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); return ptr_info; }} static PyObject* launch(PyObject* self, PyObject* args) {{ int gridX, gridY, gridZ; uint64_t _stream; uint64_t _function; int num_warps; int shared_memory; PyObject *launch_enter_hook = NULL; PyObject *launch_exit_hook = NULL; PyObject *compiled_kernel = NULL; PyObject *hook_ret = NULL; {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])} if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &num_warps, &shared_memory, &_stream, &_function, &launch_enter_hook, &launch_exit_hook, &compiled_kernel, {', '.join(f"&_arg{i}" for i, ty in signature.items())})) {{ return NULL; }} if (launch_enter_hook != Py_None) {{ PyObject *new_args = PyTuple_Pack(1, compiled_kernel); hook_ret = PyObject_CallObject(launch_enter_hook, new_args); Py_DECREF(new_args); }} // raise exception asap {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])}; _launch(gridX, gridY, gridZ, num_warps, shared_memory, (CUstream)_stream, (CUfunction)_function, {', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items())}); if (launch_exit_hook != Py_None) {{ PyObject *new_args = NULL; if (hook_ret) {{ new_args = PyTuple_Pack(2, compiled_kernel, hook_ret); }} else {{ new_args = PyTuple_Pack(1, compiled_kernel); }} hook_ret = PyObject_CallObject(launch_exit_hook, new_args); Py_DECREF(new_args); }} if (hook_ret) {{ Py_DECREF(hook_ret); }} if(PyErr_Occurred()) {{ return NULL; }} // return None Py_INCREF(Py_None); return Py_None; }} static PyMethodDef ModuleMethods[] = {{ {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, {{NULL, NULL, 0, NULL}} // sentinel }}; static struct PyModuleDef ModuleDef = {{ PyModuleDef_HEAD_INIT, \"__triton_launcher\", NULL, //documentation -1, //size ModuleMethods }}; PyMODINIT_FUNC PyInit___triton_launcher(void) {{ PyObject *m = PyModule_Create(&ModuleDef); if(m == NULL) {{ return NULL; }} PyModule_AddFunctions(m, ModuleMethods); return m; }} """ return src def default_cache_dir(): return os.path.join(os.environ["HOME"], ".triton", "cache") def default_cuda_dir(): default_dir = "/usr/local/cuda" return os.getenv("CUDA_HOME", default=default_dir) class CacheManager: def __init__(self, key): self.key = key self.lock_path = None # create cache directory if it doesn't exist self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) if self.cache_dir: self.cache_dir = os.path.join(self.cache_dir, self.key) self.lock_path = os.path.join(self.cache_dir, "lock") 
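            # Each cache key gets its own sub-directory. The compiler stores the
            # artifacts it produces there (e.g. <name>.ttir/.ttgir/.llir/.ptx/.cubin,
            # the <name>.json metadata, or a compiled launcher stub .so), and the
            # "lock" file is taken with FileLock in put() to serialize concurrent writers.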
os.makedirs(self.cache_dir, exist_ok=True) def _make_path(self, filename): return os.path.join(self.cache_dir, filename) def has_file(self, filename): if not self.cache_dir: return False return os.path.exists(self._make_path(filename)) def put(self, data, filename, binary=True): if not self.cache_dir: return binary = isinstance(data, bytes) if not binary: data = str(data) assert self.lock_path is not None filepath = self._make_path(filename) with FileLock(self.lock_path): # use tempfile to be robust against program interruptions mode = "wb" if binary else "w" with open(filepath + ".tmp", mode) as f: f.write(data) os.rename(filepath + ".tmp", filepath) # Utilities for generating and compiling C wrappers @functools.lru_cache() def libcuda_dirs(): locs = subprocess.check_output(["whereis", "libcuda.so"]).decode().strip().split()[1:] return [os.path.dirname(loc) for loc in locs] @contextlib.contextmanager def quiet(): old_stdout, old_stderr = sys.stdout, sys.stderr sys.stdout, sys.stderr = io.StringIO(), io.StringIO() try: yield finally: sys.stdout, sys.stderr = old_stdout, old_stderr def _build(name, src, srcdir): cuda_lib_dirs = libcuda_dirs() cuda_path = os.environ.get('CUDA_PATH', default_cuda_dir()) cu_include_dir = os.path.join(cuda_path, "include") base_dir = os.path.dirname(__file__) triton_include_dir = os.path.join(base_dir, "third_party/cuda/include") cuda_header = os.path.join(cu_include_dir, "cuda.h") triton_cuda_header = os.path.join(triton_include_dir, "cuda.h") if not os.path.exists(cuda_header) and os.path.exists(triton_cuda_header): cu_include_dir = triton_include_dir suffix = sysconfig.get_config_var('EXT_SUFFIX') so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix)) # try to avoid setuptools if possible cc = os.environ.get("CC") if cc is None: # TODO: support more things here. clang = shutil.which("clang") gcc = shutil.which("gcc") cc = gcc if gcc is not None else clang if cc is None: raise RuntimeError("Failed to find C compiler. 
Please specify via CC environment variable.") py_include_dir = get_paths()["include"] cc_cmd = [cc, src, "-O3", f"-I{cu_include_dir}", f"-I{py_include_dir}", f"-I{srcdir}", "-shared", "-fPIC", "-lcuda", "-o", so] cc_cmd += [f"-L{dir}" for dir in cuda_lib_dirs] ret = subprocess.check_call(cc_cmd) if ret == 0: return so # fallback on setuptools extra_compile_args = [] library_dirs = cuda_lib_dirs include_dirs = [srcdir, cu_include_dir] libraries = ['cuda'] # extra arguments extra_link_args = [] # create extension module ext = setuptools.Extension( name=name, language='c', sources=[src], include_dirs=include_dirs, extra_compile_args=extra_compile_args + ['-O3'], extra_link_args=extra_link_args, library_dirs=library_dirs, libraries=libraries, ) # build extension module args = ['build_ext'] args.append('--build-temp=' + srcdir) args.append('--build-lib=' + srcdir) args.append('-q') args = dict( name=name, ext_modules=[ext], script_args=args, ) with quiet(): setuptools.setup(**args) return so def make_so_cache_key(version_hash, signature, constants): # Get unique key for the compiled code signature = {k: 'ptr' if v[0] == '*' else v for k, v in signature.items()} key = f"{version_hash}-{''.join(signature.values())}{constants}" key = hashlib.md5(key.encode("utf-8")).hexdigest() return key def make_fn_cache_key(fn_hash, signature, configs, constants, num_warps, num_stages): # Get unique key for the compiled code get_conf_key = lambda conf: (sorted(conf.divisible_by_16), sorted(conf.equal_to_1)) configs_key = [get_conf_key(conf) for conf in configs] key = f"{fn_hash}-{''.join(signature.values())}-{configs_key}-{constants}-{num_warps}-{num_stages}" key = hashlib.md5(key.encode("utf-8")).hexdigest() return key def read_or_execute(cache_manager, force_compile, file_name, metadata, run_if_found: Callable[[str], bytes] = None, run_if_not_found: Callable = None): suffix = file_name.split(".")[1] if not force_compile and cache_manager.has_file(file_name): module = run_if_found(cache_manager._make_path(file_name)) data = module if isinstance(module, bytes) else str(module).encode("utf-8") md5 = hashlib.md5(data).hexdigest() has_changed = metadata and md5 != metadata["md5"][suffix] return module, md5, has_changed, True module = run_if_not_found() data = module if isinstance(module, bytes) else str(module).encode("utf-8") md5 = hashlib.md5(data).hexdigest() cache_manager.put(data, file_name, True if isinstance(data, bytes) else data) return module, md5, True, False # def make_stub(name, signature, constants): # name of files that are cached so_cache_key = make_so_cache_key(triton.runtime.jit.version_key(), signature, constants) so_cache_manager = CacheManager(so_cache_key) so_name = f"{name}.so" # retrieve stub from cache if it exists if not so_cache_manager.has_file(so_name): with tempfile.TemporaryDirectory() as tmpdir: src = generate_launcher(constants, signature) src_path = os.path.join(tmpdir, "main.c") with open(src_path, "w") as f: f.write(src) so = _build(name, src_path, tmpdir) with open(so, "rb") as f: so_cache_manager.put(f.read(), so_name, binary=True) return so_cache_manager._make_path(so_name) def convert_type_repr(x): match = re.search(r'!tt\.ptr<(.*)>', x) if match is not None: return '*' + convert_type_repr(match.group(1)) return x def make_hash(fn, **kwargs): if isinstance(fn, triton.runtime.JITFunction): configs = kwargs["configs"] signature = kwargs["signature"] constants = kwargs.get("constants", dict()) num_warps = kwargs.get("num_warps", 4) num_stages = kwargs.get("num_stages", 3) # Get 
unique key for the compiled code get_conf_key = lambda conf: (sorted(conf.divisible_by_16), sorted(conf.equal_to_1)) configs_key = [get_conf_key(conf) for conf in configs] key = f"{fn.cache_key}-{''.join(signature.values())}-{configs_key}-{constants}-{num_warps}-{num_stages}" return hashlib.md5(key.encode("utf-8")).hexdigest() assert isinstance(fn, str) return hashlib.md5((Path(fn).read_text() + triton.runtime.jit.version_key()).encode("utf-8")).hexdigest() # - ^\s*func\s+ : match the start of the string, any leading whitespace, the keyword func, # and any following whitespace # - (public\s+)? : optionally match the keyword public and any following whitespace # - (@\w+) : match an @ symbol followed by one or more word characters # (letters, digits, or underscores), and capture it as group 1 (the function name) # - (\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\)) : match a pair of parentheses enclosing # zero or more arguments separated by commas, and capture it as group 2 (the argument list) mlir_prototype_pattern = r'^\s*func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$' ptx_prototype_pattern = r"\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\)" prototype_pattern = { "ttir": mlir_prototype_pattern, "ttgir": mlir_prototype_pattern, "ptx": ptx_prototype_pattern, } mlir_arg_type_pattern = r'%\w+: ([^,^\)\s]+)(?: \{\S+ = \S+ : \S+\})?,?' ptx_arg_type_pattern = r"\.param\s+\.(\w+)" arg_type_pattern = { "ttir": mlir_arg_type_pattern, "ttgir": mlir_arg_type_pattern, "ptx": ptx_arg_type_pattern, } # def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: int = 4, num_stages: int = 3, extern_libs=None, configs=None): def compile(fn, **kwargs): capability = kwargs.get("cc", None) if capability is None: device = torch.cuda.current_device() capability = torch.cuda.get_device_capability(device) capability = capability[0] * 10 + capability[1] # we get the kernel, i.e. 
the first function generated in the module # if fn is not a JITFunction, then it # has to be a path to a file context = _triton.ir.context() asm = dict() constants = kwargs.get("constants", dict()) num_warps = kwargs.get("num_warps", 4) num_stages = kwargs.get("num_stages", 3 if capability >= 75 else 2) extern_libs = kwargs.get("extern_libs", dict()) # build compilation stages stages = { "ast": (lambda path: fn, None), "ttir": (lambda path: parse_mlir_module(path, context), lambda src: ast_to_ttir(src, signature, configs[0], constants)), "ttgir": (lambda path: parse_mlir_module(path, context), lambda src: ttir_to_ttgir(src, num_warps, num_stages, capability)), "llir": (lambda path: Path(path).read_text(), lambda src: ttgir_to_llir(src, extern_libs, capability)), "ptx": (lambda path: Path(path).read_text(), lambda src: llir_to_ptx(src, capability)), "cubin": (lambda path: Path(path).read_bytes(), lambda src: ptx_to_cubin(src, capability)) } # find out the signature of the function if isinstance(fn, triton.runtime.JITFunction): configs = kwargs.get("configs", None) signature = kwargs["signature"] if configs is None: configs = [instance_descriptor()] assert len(configs) == 1 kwargs["configs"] = configs name = fn.__name__ first_stage = 0 if isinstance(signature, str): signature = {k: v.strip() for k, v in enumerate(signature.split(","))} kwargs["signature"] = signature else: assert isinstance(fn, str) _, ir = os.path.basename(fn).split(".") src = Path(fn).read_text() import re match = re.search(prototype_pattern[ir], src, re.MULTILINE) name, signature = match.group(1), match.group(2) # print(name, signature) types = re.findall(arg_type_pattern[ir], signature) # print(types) param_tys = [convert_type_repr(ty) for ty in types] signature = {k: v for k, v in enumerate(param_tys)} first_stage = list(stages.keys()).index(ir) # cache manager so_path = make_stub(name, signature, constants) # create cache manager fn_cache_manager = CacheManager(make_hash(fn, **kwargs)) # determine name and extension type of provided function if isinstance(fn, triton.runtime.JITFunction): name, ext = fn.__name__, "ast" else: name, ext = os.path.basename(fn).split(".") # load metadata if any metadata = None if fn_cache_manager.has_file(f'{name}.json'): with open(fn_cache_manager._make_path(f"{name}.json")) as f: metadata = json.load(f) else: metadata = {"num_warps": num_warps, "num_stages": num_stages, "ctime": dict()} if ext == "ptx": assert "shared" in kwargs, "ptx compilation must provide shared memory size" metadata["shared"] = kwargs["shared"] first_stage = list(stages.keys()).index(ext) asm = dict() module = fn # run compilation pipeline and populate metadata for ir, (parse, compile) in list(stages.items())[first_stage:]: path = fn_cache_manager._make_path(f"{name}.{ir}") if ir == ext: next_module = parse(fn) elif os.path.exists(path) and\ ir in metadata["ctime"] and\ os.path.getctime(path) == metadata["ctime"][ir]: next_module = parse(path) else: next_module = compile(module) fn_cache_manager.put(next_module, f"{name}.{ir}") if os.path.exists(path): metadata["ctime"][ir] = os.path.getctime(path) asm[ir] = next_module if ir == "cubin" else str(next_module) if ir == "llir" and "shared" not in metadata: metadata["shared"] = _triton.get_shared_memory_size(module) if ir == "ptx": metadata["name"] = ptx_get_kernel_name(next_module) module = next_module # write-back metadata fn_cache_manager.put(json.dumps(metadata), f"{name}.json", binary=False) # return handle to compiled kernel return CompiledKernel(so_path, metadata, 
asm) class CompiledKernel: # Hooks for external tools to monitor the execution of triton kernels launch_enter_hook = None launch_exit_hook = None def __init__(self, so_path, metadata, asm): # initialize launcher import importlib.util spec = importlib.util.spec_from_file_location("__triton_launcher", so_path) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) self.c_wrapper = getattr(mod, "launch") # initialize metadata self.shared = metadata["shared"] self.num_warps = metadata["num_warps"] self.num_stages = metadata["num_stages"] # initialize asm dict self.asm = asm # binaries are lazily initialized # because it involves doing runtime things # (e.g., checking amount of shared memory on current device) self.metadata = metadata self.cu_module = None self.cu_function = None def _init_handles(self): if self.cu_module is not None: return device = torch.cuda.current_device() global cuda_utils init_cuda_utils() max_shared = cuda_utils.get_device_properties(device)["max_shared_mem"] if self.shared > max_shared: raise OutOfResources(self.shared, max_shared, "shared memory") mod, func, n_regs, n_spills = cuda_utils.load_binary(self.metadata["name"], self.asm["cubin"], self.shared, device) # print(self.shared, n_regs, n_spills) self.cu_module = mod self.cu_function = func def __getattribute__(self, name): if name == 'c_wrapper': self._init_handles() return super().__getattribute__(name) def __getitem__(self, grid): self._init_handles() def runner(*args, stream=None): if stream is None: stream = torch.cuda.current_stream().cuda_stream self.c_wrapper(grid[0], grid[1], grid[2], self.num_warps, self.shared, stream, self.cu_function, CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, self, *args) return runner def get_sass(self, fun=None): if 'sass' in self.asm: return self.asm['sass'] fd, path = tempfile.mkstemp() try: with open(fd, 'wb') as cubin: cubin.write(self.asm['cubin']) self.sass = extract(path, fun) finally: os.remove(path) self.asm['sass'] = self.sass return self.sass class CudaUtils(object): def __new__(cls): if not hasattr(cls, 'instance'): cls.instance = super(CudaUtils, cls).__new__(cls) return cls.instance @staticmethod def _generate_src(): return """ #include #include \"cuda.h\" #define PY_SSIZE_T_CLEAN #include static inline void gpuAssert(CUresult code, const char *file, int line) { if (code != CUDA_SUCCESS) { const char* prefix = "Triton Error [CUDA]: "; const char* str; cuGetErrorString(code, &str); char err[1024] = {0}; strcat(err, prefix); strcat(err, str); PyErr_SetString(PyExc_RuntimeError, err); } } #define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); if(PyErr_Occurred()) return NULL; } static PyObject* getDeviceProperties(PyObject* self, PyObject* args){ int device_id; if(!PyArg_ParseTuple(args, "i", &device_id)) return NULL; // Get device handle CUdevice device; cuDeviceGet(&device, device_id); // create a struct to hold device properties int max_shared_mem; int multiprocessor_count; int sm_clock_rate; int mem_clock_rate; int mem_bus_width; CUDA_CHECK(cuDeviceGetAttribute(&max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); CUDA_CHECK(cuDeviceGetAttribute(&multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); CUDA_CHECK(cuDeviceGetAttribute(&sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); CUDA_CHECK(cuDeviceGetAttribute(&mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); CUDA_CHECK(cuDeviceGetAttribute(&mem_bus_width, 
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", max_shared_mem, "multiprocessor_count", multiprocessor_count, "sm_clock_rate", sm_clock_rate, "mem_clock_rate", mem_clock_rate, "mem_bus_width", mem_bus_width); } static PyObject* loadBinary(PyObject* self, PyObject* args) { const char* name; const char* data; Py_ssize_t data_size; int shared; int device; if(!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, &device)) { return NULL; } CUfunction fun; CUmodule mod; int32_t n_regs = 0; int32_t n_spills = 0; // create driver handles CUDA_CHECK(cuModuleLoadData(&mod, data)); CUDA_CHECK(cuModuleGetFunction(&fun, mod, name)); // get allocated registers and spilled registers from the function CUDA_CHECK(cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun)); CUDA_CHECK(cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun)); n_spills /= 4; // set dynamic shared memory if necessary int shared_optin; CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); if (shared > 49152 && shared_optin > 49152) { CUDA_CHECK(cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED)); int shared_total, shared_static; CUDA_CHECK(cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device)); CUDA_CHECK(cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun)); CUDA_CHECK(cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static)); } if(PyErr_Occurred()) { return NULL; } return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills); } static PyMethodDef ModuleMethods[] = { {"load_binary", loadBinary, METH_VARARGS, "Load provided cubin into CUDA driver"}, {"get_device_properties", getDeviceProperties, METH_VARARGS, "Get the properties for a given device"}, {NULL, NULL, 0, NULL} // sentinel }; static struct PyModuleDef ModuleDef = { PyModuleDef_HEAD_INIT, \"cuda_utils\", NULL, //documentation -1, //size ModuleMethods }; PyMODINIT_FUNC PyInit_cuda_utils(void) { PyObject *m = PyModule_Create(&ModuleDef); if(m == NULL) { return NULL; } PyModule_AddFunctions(m, ModuleMethods); return m; } """ def __init__(self): src = self._generate_src() key = hashlib.md5(src.encode("utf-8")).hexdigest() cache = CacheManager(key) fname = "cuda_utils.so" if not cache.has_file(fname): with tempfile.TemporaryDirectory() as tmpdir: src_path = os.path.join(tmpdir, "main.c") with open(src_path, "w") as f: f.write(src) so = _build("cuda_utils", src_path, tmpdir) with open(so, "rb") as f: cache.put(f.read(), fname, binary=True) import importlib.util spec = importlib.util.spec_from_file_location("cuda_utils", cache._make_path(fname)) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) self.load_binary = mod.load_binary self.get_device_properties = mod.get_device_properties def init_cuda_utils(): global cuda_utils if cuda_utils is None: cuda_utils = CudaUtils() cuda_utils = None triton-2.0.0/python/triton/impl/000077500000000000000000000000001440023377100166325ustar00rootroot00000000000000triton-2.0.0/python/triton/impl/__init__.py000066400000000000000000000007011440023377100207410ustar00rootroot00000000000000"""Triton internal implementation details. Client libraries should not import interfaces from the `triton.impl` module; as the details are subject to change. 
APIs defined in the `triton.impl` module which are public will be re-exported in other relevant `triton` module namespaces. """ from .base import builtin, extern, is_builtin from triton._C.libtriton.triton import ir __all__ = [ "builtin", "extern", "ir", "is_builtin", ] triton-2.0.0/python/triton/impl/base.py000066400000000000000000000015401440023377100201160ustar00rootroot00000000000000from __future__ import annotations from functools import wraps from typing import TypeVar T = TypeVar("T") TRITON_BUILTIN = "__triton_builtin__" def builtin(fn: T) -> T: """Mark a function as a builtin.""" assert callable(fn) @wraps(fn) def wrapper(*args, **kwargs): if "_builder" not in kwargs or kwargs["_builder"] is None: raise ValueError( "Did you forget to add @triton.jit ? " "(`_builder` argument must be provided outside of JIT functions.)" ) return fn(*args, **kwargs) setattr(wrapper, TRITON_BUILTIN, True) return wrapper def is_builtin(fn) -> bool: """Is this a registered triton builtin function?""" return getattr(fn, TRITON_BUILTIN, False) def extern(fn: T) -> T: """A decorator for external functions.""" return builtin(fn) triton-2.0.0/python/triton/language/000077500000000000000000000000001440023377100174545ustar00rootroot00000000000000triton-2.0.0/python/triton/language/__init__.py000066400000000000000000000050271440023377100215710ustar00rootroot00000000000000"""isort:skip_file""" # Import order is significant here. from ..impl import ( ir, builtin, ) from . import libdevice from .core import ( abs, arange, argmin, argmax, atomic_add, atomic_and, atomic_cas, atomic_max, atomic_min, atomic_or, atomic_xchg, atomic_xor, bfloat16, block_type, broadcast, broadcast_to, cat, cdiv, constexpr, cos, debug_barrier, dot, dtype, exp, full, fdiv, float16, float32, float64, float8, function_type, int1, int16, int32, int64, int8, load, log, max, max_contiguous, maximum, min, minimum, multiple_of, num_programs, pi32_t, pointer_type, printf, program_id, ravel, reshape, sigmoid, sin, softmax, sqrt, store, sum, swizzle2d, static_range, tensor, trans, triton, uint16, uint32, uint64, uint8, umulhi, view, void, where, xor_sum, zeros, zeros_like, ) from .random import ( pair_uniform_to_normal, philox, philox_impl, rand, rand4x, randint, randint4x, randn, randn4x, uint32_to_uniform_float, ) __all__ = [ "abs", "arange", "argmin", "argmax", "atomic_add", "atomic_and", "atomic_cas", "atomic_max", "atomic_min", "atomic_or", "atomic_xchg", "atomic_xor", "bfloat16", "block_type", "broadcast", "broadcast_to", "builtin", "cat", "cdiv", "constexpr", "cos", "debug_barrier", "dot", "dtype", "exp", "fdiv", "float16", "float32", "float64", "float8", "full", "function_type", "int1", "int16", "int32", "int64", "int8", "ir", "libdevice", "load", "log", "max", "max_contiguous", "maximum", "min", "minimum", "multiple_of", "num_programs", "pair_uniform_to_normal", "philox", "philox_impl", "pi32_t", "pointer_type", "printf", "program_id", "rand", "rand4x", "randint", "randint4x", "randn", "randn4x", "ravel", "reshape", "sigmoid", "sin", "softmax", "sqrt", "static_range", "store", "sum", "swizzle2d", "tensor", "trans", "triton", "uint16", "uint32", "uint32_to_uniform_float", "uint64", "uint8", "umulhi", "view", "void", "where", "xor_sum", "zeros", "zeros_like", ] triton-2.0.0/python/triton/language/core.py000066400000000000000000001135201440023377100207600ustar00rootroot00000000000000from __future__ import annotations from enum import Enum from typing import Callable, List, TypeVar import triton from . 
import builtin, semantic from triton._C.libtriton.triton import ir T = TypeVar('T') def _to_tensor(x, builder): if isinstance(x, bool): return tensor(builder.get_int1(x), int1) # Note: compile-time const integers are represented by unsigned values elif isinstance(x, int): if -2**31 <= x < 2**31: return tensor(builder.get_int32(x), int32) elif 2**31 <= x < 2**32: return tensor(builder.get_int32(x), uint32) elif -2**63 <= x < 2**63: return tensor(builder.get_int64(x), int64) elif 2**63 <= x < 2**64: return tensor(builder.get_int64(x), uint64) else: raise RuntimeError(f'Nonrepresentable integer {x}.') elif isinstance(x, float): return tensor(builder.get_fp32(x), float32) elif isinstance(x, constexpr): return _to_tensor(x.value, builder) elif isinstance(x, tensor): return x assert False, f"cannot convert {x} of type {type(x)} to tensor" class dtype: SINT_TYPES = ['int1', 'int8', 'int16', 'int32', 'int64'] UINT_TYPES = ['uint8', 'uint16', 'uint32', 'uint64'] FP_TYPES = ['fp8', 'fp16', 'bf16', 'fp32', 'fp64'] CUSTOMIZED_FP_TYPES = ['fp8'] STANDARD_FP_TYPES = ['fp16', 'bf16', 'fp32', 'fp64'] OTHER_TYPES = ['void'] class SIGNEDNESS(Enum): SIGNED = 0 UNSIGNED = 1 def __init__(self, name): self.name = name assert name in dtype.SINT_TYPES + dtype.UINT_TYPES + dtype.FP_TYPES + dtype.OTHER_TYPES, name if name in dtype.SINT_TYPES: self.int_signedness = dtype.SIGNEDNESS.SIGNED self.int_bitwidth = int(name.split('int')[-1]) self.primitive_bitwidth = self.int_bitwidth elif name in dtype.UINT_TYPES: self.int_signedness = dtype.SIGNEDNESS.UNSIGNED self.int_bitwidth = int(name.split('int')[-1]) self.primitive_bitwidth = self.int_bitwidth elif name in dtype.FP_TYPES: if name == 'fp8': self.fp_mantissa_width = 3 self.primitive_bitwidth = 8 elif name == 'fp16': self.fp_mantissa_width = 10 self.primitive_bitwidth = 16 elif name == 'bf16': self.fp_mantissa_width = 7 self.primitive_bitwidth = 16 elif name == 'fp32': self.fp_mantissa_width = 23 self.primitive_bitwidth = 32 elif name == 'fp64': self.fp_mantissa_width = 53 self.primitive_bitwidth = 64 elif name == 'void': self.primitive_bitwidth = 0 def is_fp8(self): return self.name == 'fp8' def is_fp16(self): return self.name == 'fp16' def is_bf16(self): return self.name == 'bf16' def is_fp32(self): return self.name == 'fp32' def is_fp64(self): return self.name == 'fp64' def is_int1(self): return self.name == 'int1' def is_int8(self): return self.name == 'int8' def is_int16(self): return self.name == 'int16' def is_int32(self): return self.name == 'int32' def is_int64(self): return self.name == 'int64' def is_uint8(self): return self.name == 'uint8' def is_uint16(self): return self.name == 'uint16' def is_uint32(self): return self.name == 'uint32' def is_uint64(self): return self.name == 'uint64' def is_floating(self): return self.name in dtype.FP_TYPES def is_customized_floating(self): return self.name in dtype.CUSTOMIZED_FP_TYPES def is_standard_floating(self): return self.name in dtype.STANDARD_FP_TYPES def is_int_signed(self): return self.name in dtype.SINT_TYPES def is_int_unsigned(self): return self.name in dtype.UINT_TYPES def is_int(self): return self.name in dtype.SINT_TYPES + dtype.UINT_TYPES def is_bool(self): return self.is_int1() @staticmethod def is_void(): raise RuntimeError("Not implemented") @staticmethod def is_block(): return False @staticmethod def is_ptr(): return False def __eq__(self, other: dtype): if not isinstance(other, dtype): return False return self.name == other.name def __ne__(self, other: dtype): return not self.__eq__(other) def 
__hash__(self): return hash((self.name,)) @property def scalar(self): return self def to_ir(self, builder: ir.builder) -> ir.type: if self.name == 'void': return builder.get_void_ty() elif self.name == 'int1': return builder.get_int1_ty() elif self.name in ('int8', 'uint8'): return builder.get_int8_ty() elif self.name in ('int16', 'uint16'): return builder.get_int16_ty() elif self.name in ('int32', 'uint32'): return builder.get_int32_ty() elif self.name in ('int64', 'uint64'): return builder.get_int64_ty() elif self.name == 'fp8': return builder.get_fp8_ty() elif self.name == 'fp16': return builder.get_half_ty() elif self.name == 'bf16': return builder.get_bf16_ty() elif self.name == 'fp32': return builder.get_float_ty() elif self.name == 'fp64': return builder.get_double_ty() raise ValueError(f'fail to convert {self} to ir type') def __str__(self): return self.name @property def cache_key_part(self) -> str: """See cache_key_part() in triton.cc.""" return self.name def __repr__(self): return f'triton.language.{self.name}' class pointer_type(dtype): def __init__(self, element_ty: dtype, address_space: int = 1): if not isinstance(element_ty, dtype): raise TypeError('element_ty is a {type(element_ty).__name__}.') self.element_ty = element_ty self.address_space = address_space self.name = self.__str__() def to_ir(self, builder: ir.builder) -> ir.pointer_type: return builder.get_ptr_ty(self.element_ty.to_ir(builder), 1) def __str__(self): return f'pointer<{self.element_ty}>' def __repr__(self): return self.__str__() def is_ptr(self): return True def __eq__(self, other: pointer_type) -> bool: if not isinstance(other, pointer_type): return False return self.element_ty == other.element_ty and self.address_space == other.address_space def __ne__(self, other: pointer_type) -> bool: return not self.__eq__(other) @property def scalar(self): return self class block_type(dtype): def __init__(self, element_ty: dtype, shape: List): self.element_ty = element_ty # Note that block_type's shape is a list of int # while tensor's shape is a list of constexpr. # shape can be empty ([]) when an input is a 0D tensor. 
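        # (A genuinely empty shape is nevertheless rejected below: a block_type
        # always carries at least one dimension.)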
if not shape: raise TypeError('0d block_type is forbidden') if isinstance(shape[0], constexpr): shape = [s.value for s in shape] self.shape = shape self.numel = 1 for s in self.shape: self.numel *= s self.name = self.__str__() def to_ir(self, builder: ir.builder) -> ir.block_type: return builder.get_block_ty(self.element_ty.to_ir(builder), self.shape) def __str__(self): return f'<{self.shape}, {self.element_ty}>' def __repr__(self): return self.__str__() def is_block(self): return True def get_block_shapes(self) -> List[int]: return self.shape def __eq__(self, other: block_type) -> bool: if not isinstance(other, block_type): return False return self.element_ty == other.element_ty and self.shape == other.shape def __ne__(self, other: block_type) -> bool: return not self.__eq__(other) @property def scalar(self): return self.element_ty class function_type(dtype): def __init__(self, ret_types: List[dtype], param_types: List[dtype]) -> None: self.ret_types = ret_types self.param_types = param_types def __str__(self): return f'fn ({self.param_types}) -> {self.ret_types}' def to_ir(self, builder: ir.builder): ir_param_types = [ty.to_ir(builder) for ty in self.param_types] ret_types = [ret_type.to_ir(builder) for ret_type in self.ret_types] return builder.get_function_ty(ir_param_types, ret_types) # scalar types void = dtype('void') int1 = dtype('int1') int8 = dtype('int8') int16 = dtype('int16') int32 = dtype('int32') int64 = dtype('int64') uint8 = dtype('uint8') uint16 = dtype('uint16') uint32 = dtype('uint32') uint64 = dtype('uint64') float8 = dtype('fp8') float16 = dtype('fp16') bfloat16 = dtype('bf16') float32 = dtype('fp32') float64 = dtype('fp64') # pointer types pi32_t = pointer_type(int32) # ----------------------- # constexpr # ----------------------- class constexpr: """ This class is used to store a value that is known at compile-time. 
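    For example, a kernel parameter annotated as :code:`BLOCK: tl.constexpr` is
    wrapped in this class by the JIT frontend, so expressions such as
    :code:`BLOCK * 4` or :code:`BLOCK >= 128` are evaluated by the Python-level
    operators defined below at compile time rather than being lowered to IR.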
""" def __init__(self, value): if isinstance(value, constexpr): self.value = value.value else: self.value = value def __repr__(self) -> str: return f"constexpr[{self.value}]" def __add__(self, other): return constexpr(self.value + other.value) def __radd__(self, other): return constexpr(other.value + self.value) def __sub__(self, other): return constexpr(self.value - other.value) def __rsub__(self, other): return constexpr(other.value - self.value) def __mul__(self, other): return constexpr(self.value * other.value) def __mod__(self, other): return constexpr(self.value % other.value) def __rmul__(self, other): return constexpr(other.value * self.value) def __truediv__(self, other): return constexpr(self.value / other.value) def __rtruediv__(self, other): return constexpr(other.value / self.value) def __floordiv__(self, other): return constexpr(self.value // other.value) def __rfloordiv__(self, other): return constexpr(other.value // self.value) def __gt__(self, other): return constexpr(self.value > other.value) def __rgt__(self, other): return constexpr(other.value > self.value) def __ge__(self, other): return constexpr(self.value >= other.value) def __rge__(self, other): return constexpr(other.value >= self.value) def __lt__(self, other): return constexpr(self.value < other.value) def __rlt__(self, other): return constexpr(other.value < self.value) def __le__(self, other): return constexpr(self.value <= other.value) def __rle__(self, other): return constexpr(other.value <= self.value) def __eq__(self, other): return constexpr(self.value == other.value) def __ne__(self, other): return constexpr(self.value != other.value) def __bool__(self): return bool(self.value) def __neg__(self): return constexpr(-self.value) def __and__(self, other): return constexpr(self.value & other.value) def logical_and(self, other): return constexpr(self.value and other.value) def __or__(self, other): return constexpr(self.value | other.value) def logical_or(self, other): return constexpr(self.value or other.value) def __pos__(self): return constexpr(+self.value) def __invert__(self): return constexpr(~self.value) def __pow__(self, other): return constexpr(self.value ** other.value) def __rshift__(self, other): return constexpr(self.value >> other.value) def __lshift__(self, other): return constexpr(self.value << other.value) def __not__(self): return constexpr(not self.value) def __call__(self, *args, **kwds): return self.value(*args, **kwds) class tensor: def __init__(self, handle, type: dtype): # IR handle self.handle = handle # Block shape self.shape = (1, ) if type.is_block(): self.shape = type.shape self.numel = 1 for s in self.shape: self.numel *= s self.numel = constexpr(self.numel) self.type = type # Tensor type (can be block_type) # Following the practice in pytorch, dtype is scalar type self.dtype = type.scalar self.shape = [constexpr(s) for s in self.shape] def __str__(self) -> str: # ex. 
"float32[3,4]" return str(self.dtype) + '[' + ','.join(str(s) for s in self.shape) + ']' @builtin def __add__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.add(self, other, _builder) def __radd__(self, other, _builder=None): return self.__add__(other, _builder=_builder) @builtin def __sub__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.sub(self, other, _builder) def __rsub__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.sub(other, self, _builder) @builtin def __mul__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.mul(self, other, _builder) def __rmul__(self, other, _builder=None): return self.__mul__(other, _builder=_builder) @builtin def __truediv__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.truediv(self, other, _builder) def __rtruediv__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.truediv(other, self, _builder) @builtin def __floordiv__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.floordiv(self, other, _builder) @builtin def __rfloordiv__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.floordiv(other, self, _builder) @builtin def __mod__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.mod(self, other, _builder) @builtin def __rmod__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.mod(other, self, _builder) # unary operators @builtin def __neg__(self, _builder=None): return semantic.minus(self, _builder) @builtin def __invert__(self, _builder=None): return semantic.invert(self, _builder) # bitwise operators @builtin def __and__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.and_(self, other, _builder) @builtin def __or__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.or_(self, other, _builder) @builtin def __xor__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.xor_(self, other, _builder) @builtin def __lshift__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.shl(self, other, _builder) @builtin def __rshift__(self, other, _builder=None): other = _to_tensor(other, _builder) if self.dtype.is_int_signed(): return semantic.ashr(self, other, _builder) else: return semantic.lshr(self, other, _builder) # comparison operators # > @builtin def __gt__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.greater_than(self, other, _builder) @builtin def __rgt__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.greater_than(other, self, _builder) # >= @builtin def __ge__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.greater_equal(self, other, _builder) @builtin def __rge__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.greater_equal(other, self, _builder) # < @builtin def __lt__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.less_than(self, other, _builder) @builtin def __rlt__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.less_than(other, self, _builder) # <= @builtin def __le__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.less_equal(self, other, _builder) @builtin def __rle__(self, other, 
_builder=None): other = _to_tensor(other, _builder) return semantic.less_equal(other, self, _builder) # == @builtin def __eq__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.equal(self, other, _builder) @builtin def __ne__(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.not_equal(self, other, _builder) @builtin def logical_and(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.logical_and(self, other, _builder) @builtin def logical_or(self, other, _builder=None): other = _to_tensor(other, _builder) return semantic.logical_or(self, other, _builder) # note: __not__ isn't actually a magic method in python # but it's ok because our ASTVisitor handles it @builtin def __not__(self, _builder=None): return semantic.not_(self, _builder) @builtin def __getitem__(self, slices, _builder=None): if isinstance(slices, slice): slices = [slices] ret = self for dim, sl in enumerate(slices): if isinstance(sl, constexpr) and sl.value is None: ret = semantic.expand_dims(ret, dim, _builder) elif sl == slice(None, None, None): pass else: assert False, "unsupported" return ret @property def T(self): assert False, "Transposition must be created by the AST Visitor" @builtin def to(self, dtype, bitcast=False, _builder=None): if isinstance(bitcast, constexpr): bitcast = bitcast.value if bitcast: return semantic.bitcast(self, dtype, _builder) return semantic.cast(self, dtype, _builder) # ----------------------- # SPMD Programming Model # ----------------------- def _constexpr_to_value(v): if isinstance(v, constexpr): return v.value return v @builtin def program_id(axis, _builder=None): """ Returns the id of the current program instance along the given :code:`axis`. :param axis: The axis of the 3D launch grid. Has to be either 0, 1 or 2. :type axis: int """ # if axis == -1: # pid0 = program_id(0, _builder) # pid1 = program_id(1, _builder) # pid2 = program_id(2, _builder) # npg0 = num_programs(0, _builder) # npg1 = num_programs(0, _builder) # return pid0 + pid1*npg0 + pid2*npg0*npg1 axis = _constexpr_to_value(axis) return semantic.program_id(axis, _builder) @builtin def num_programs(axis, _builder=None): """ Returns the number of program instances launched along the given :code:`axis`. :param axis: The axis of the 3D launch grid. Has to be either 0, 1 or 2. :type axis: int """ axis = _constexpr_to_value(axis) return semantic.num_programs(axis, _builder) # ----------------------- # Block Initialization # ----------------------- @builtin def arange(start, end, _builder=None): """ Returns contiguous values within the open interval [:code:`start`, :code:`end`). :param start: Start of the interval. Must be a power of two. :type start: int :param stop: End of the interval. Must be a power of two >= start. :type stop: int """ start = _constexpr_to_value(start) end = _constexpr_to_value(end) return semantic.arange(start, end, _builder) def _shape_check_impl(shape): shape = _constexpr_to_value(shape) for i, d in enumerate(shape): if not isinstance(d, constexpr): raise TypeError(f"Shape element {i} must have type `constexpr`") if not isinstance(d.value, int): raise TypeError(f"Shape element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") return [_constexpr_to_value(x) for x in shape] @builtin def full(shape, value, dtype, _builder=None): """ Returns a tensor filled with the scalar value for the given :code:`shape` and :code:`dtype`. 
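    For example, :code:`tl.full((8, 16), 7, tl.int32)` (inside a
    :code:`@triton.jit` function) returns an 8x16 block of :code:`int32`
    filled with the value 7.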
:param shape: Shape of the new array, e.g., (8, 16) or (8, ) :value value: A scalar value to fill the array with :type shape: tuple of ints :param dtype: Data-type of the new array, e.g., :code:`tl.float16` :type dtype: DType """ shape = _shape_check_impl(shape) value = _constexpr_to_value(value) dtype = _constexpr_to_value(dtype) return semantic.full(shape, value, dtype, _builder) # ----------------------- # Shape Manipulation # ----------------------- @builtin def broadcast(input, other, _builder=None): """ Tries to broadcast the two given blocks to a common compatible shape. :param input: The first input tensor. :type input: Block :param other: The second input tensor. :type other: Block """ return semantic.broadcast_impl_value(input, other, _builder) @builtin def broadcast_to(input, shape, _builder=None): """ Tries to broadcast the given tensor to a new :code:`shape`. :param input: The input tensor. :type input: Block :param shape: The desired shape. :type shape: Tuple[int] """ shape = _shape_check_impl(shape) return semantic.broadcast_impl_shape(input, shape, _builder) @builtin def trans(input, _builder=None): return semantic.trans(input, _builder) @builtin def cat(input, other, can_reorder=False, _builder=None): """ Concatenate the given blocks :param input: The first input tensor. :type input: :param other: The second input tensor. :type other: :param reorder: Compiler hint. If true, the compiler is allowed to reorder elements while concatenating inputs. Only use if the order does not matter (e.g., result is only used in reduction ops) """ return semantic.cat(input, other, can_reorder, _builder) @builtin def view(input, shape, _builder=None): """ Returns a tensor with the same elements as `input` but a different shape. The order of the elements may not be preserved. :param input: The input tensor. :type input: :param shape: The desired shape. :type shape: Tuple[int] """ shape = _shape_check_impl(shape) return semantic.view(input, shape, _builder) @builtin def reshape(input, shape, _builder=None): # TODO: should be more than just a view shape = _shape_check_impl(shape) return semantic.view(input, shape, _builder) # ----------------------- # Linear Algebra # ----------------------- @builtin def dot(input, other, allow_tf32=True, _builder=None): """ Returns the matrix product of two blocks. The two blocks must be two-dimensional and have compatible inner dimensions. :param input: The first tensor to be multiplied. :type input: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`} :param other: The second tensor to be multiplied. :type other: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`} """ allow_tf32 = _constexpr_to_value(allow_tf32) return semantic.dot(input, other, allow_tf32, _builder) # ----------------------- # Non-Atomic Memory Operations # ----------------------- @builtin def load(pointer, mask=None, other=None, cache_modifier="", eviction_policy="", volatile=False, _builder=None): """ Return a tensor of data whose values are, elementwise, loaded from memory at location defined by :code:`pointer`. :code:`mask` and :code:`other` are implicitly broadcast to :code:`pointer.shape`. :code:`other` is implicitly typecast to :code:`pointer.dtype.element_ty`. :param pointer: Pointers to the data to be loaded. :type pointer: Block of dtype=triton.PointerDType :param mask: if mask[idx] is false, do not load the data at address :code:`pointer[idx]`. 
:type mask: Block of triton.int1, optional :param other: if mask[idx] is false, return other[idx] :type other: Block, optional :param cache_modifier: changes cache option in nvidia ptx 'type cache_modifier: str, optional """ # mask, other can be constexpr if _constexpr_to_value(mask) is not None: mask = _to_tensor(mask, _builder) if _constexpr_to_value(other) is not None: other = _to_tensor(other, _builder) cache_modifier = _constexpr_to_value(cache_modifier) eviction_policy = _constexpr_to_value(eviction_policy) volatile = _constexpr_to_value(volatile) return semantic.load(pointer, mask, other, cache_modifier, eviction_policy, volatile, _builder) @builtin def store(pointer, value, mask=None, _builder=None): """ Stores :code:`value` tensor of elements in memory, element-wise, at the memory locations specified by :code:`pointer`. :code:`value` is implicitly broadcast to :code:`pointer.shape` and typecast to :code:`pointer.dtype.element_ty`. :param pointer: The memory locations where the elements of :code:`value` are stored. :type pointer: Block of dtype=triton.PointerDType :param value: The tensor of elements to be stored. :type value: Block :param mask: If mask[idx] is false, do not store :code:`value[idx]` at :code:`pointer[idx]`. :type mask: Block of triton.int1, optional """ # value can be constexpr value = _to_tensor(value, _builder) if _constexpr_to_value(mask) is not None: mask = _to_tensor(mask, _builder) return semantic.store(pointer, value, mask, _builder) # ----------------------- # Atomic Memory Operations # ----------------------- def _add_atomic_docstr(name: str) -> Callable[[T], T]: def _decorator(func: T) -> T: docstr = """ Performs an atomic {name} at the memory location specified by :code:`pointer`. Return the data stored at :code:`pointer` before the atomic operation. :param pointer: The memory locations to compare-and-swap. :type pointer: Block of dtype=triton.PointerDType :param cmp: The values expected to be found in the atomic object :type cmp: Block of dtype=`pointer.dtype.element_ty` :param val: The values to copy in case the expected value matches the contained value. 
# -----------------------
# Atomic Memory Operations
# -----------------------

def _add_atomic_docstr(name: str) -> Callable[[T], T]:

    def _decorator(func: T) -> T:
        docstr = """
    Performs an atomic {name} at the memory location specified by :code:`pointer`.

    Returns the data stored at :code:`pointer` before the atomic operation.

    :param pointer: The memory locations to operate on.
    :type pointer: Block of dtype=triton.PointerDType
    :param cmp: The values expected to be found in the atomic object (used by :code:`atomic_cas` only).
    :type cmp: Block of dtype=`pointer.dtype.element_ty`
    :param val: The values with which to perform the atomic operation.
    :type val: Block of dtype=`pointer.dtype.element_ty`
    """
        func.__doc__ = docstr.format(name=name)
        return func

    return _decorator


@builtin
@_add_atomic_docstr("compare-and-swap")
def atomic_cas(pointer, cmp, val, _builder=None):
    cmp = _to_tensor(cmp, _builder)
    val = _to_tensor(val, _builder)
    return semantic.atomic_cas(pointer, cmp, val, _builder)


@builtin
@_add_atomic_docstr("exchange")
def atomic_xchg(pointer, val, mask=None, _builder=None):
    val = _to_tensor(val, _builder)
    return semantic.atomic_xchg(pointer, val, mask, _builder)


@builtin
@_add_atomic_docstr("add")
def atomic_add(pointer, val, mask=None, _builder=None):
    val = _to_tensor(val, _builder)
    return semantic.atomic_add(pointer, val, mask, _builder)


@builtin
@_add_atomic_docstr("max")
def atomic_max(pointer, val, mask=None, _builder=None):
    val = _to_tensor(val, _builder)
    return semantic.atomic_max(pointer, val, mask, _builder)


@builtin
@_add_atomic_docstr("min")
def atomic_min(pointer, val, mask=None, _builder=None):
    val = _to_tensor(val, _builder)
    return semantic.atomic_min(pointer, val, mask, _builder)


@builtin
@_add_atomic_docstr("logical and")
def atomic_and(pointer, val, mask=None, _builder=None):
    val = _to_tensor(val, _builder)
    return semantic.atomic_and(pointer, val, mask, _builder)


@builtin
@_add_atomic_docstr("logical or")
def atomic_or(pointer, val, mask=None, _builder=None):
    val = _to_tensor(val, _builder)
    return semantic.atomic_or(pointer, val, mask, _builder)


@builtin
@_add_atomic_docstr("logical xor")
def atomic_xor(pointer, val, mask=None, _builder=None):
    val = _to_tensor(val, _builder)
    return semantic.atomic_xor(pointer, val, mask, _builder)
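# Illustrative usage (a sketch, not part of this module): atomics let programs
# running in parallel accumulate into a shared buffer, e.g. a histogram. This
# is hypothetical user code, assuming `import triton` and
# `import triton.language as tl`, with `hist_ptr` pointing to a
# zero-initialized int32 buffer.
#
#     @triton.jit
#     def histogram_kernel(x_ptr, hist_ptr, n_elements, BLOCK: tl.constexpr):
#         offsets = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
#         mask = offsets < n_elements
#         bins = tl.load(x_ptr + offsets, mask=mask, other=0)
#         tl.atomic_add(hist_ptr + bins, 1, mask=mask)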
""" condition = _to_tensor(condition, _builder) x = _to_tensor(x, _builder) y = _to_tensor(y, _builder) return semantic.where(condition, x, y, _builder) # ----------------------- # Math # ----------------------- @builtin def umulhi(x, y, _builder=None): x = _to_tensor(x, _builder) y = _to_tensor(y, _builder) return semantic.umulhi(x, y, _builder) @builtin def fdiv(x, y, ieee_rounding=False, _builder=None): ieee_rounding = _constexpr_to_value(ieee_rounding) return semantic.fdiv(x, y, ieee_rounding, _builder) def _add_math_1arg_docstr(name: str) -> Callable[[T], T]: def _decorator(func: T) -> T: docstr = """ Computes the element-wise {name} of :code:`x` :param x: the input values :type x: Block """ func.__doc__ = docstr.format(name=name) return func return _decorator @builtin @_add_math_1arg_docstr("exponential") def exp(x, _builder=None): return semantic.exp(x, _builder) @builtin @_add_math_1arg_docstr("natural logarithm") def log(x, _builder=None): return semantic.log(x, _builder) @builtin @_add_math_1arg_docstr("cosine") def cos(x, _builder=None): return semantic.cos(x, _builder) @builtin @_add_math_1arg_docstr("sine") def sin(x, _builder=None): return semantic.sin(x, _builder) @builtin @_add_math_1arg_docstr("square root") def sqrt(x, _builder=None): return semantic.sqrt(x, _builder) # ----------------------- # Reductions # ----------------------- def _add_reduction_docstr(name: str) -> Callable[[T], T]: def _decorator(func: T) -> T: docstr = """ Returns the {name} of all elements in the :code:`input` tensor along the provided :code:`axis` :param input: the input values :param axis: the dimension along which the reduction should be done """ func.__doc__ = docstr.format(name=name) return func return _decorator @builtin @_add_reduction_docstr("maximum") def max(input, axis, _builder=None): axis = _constexpr_to_value(axis) return semantic.max(input, axis, _builder) @builtin @_add_reduction_docstr("maximum index") def argmax(input, axis, _builder=None): axis = _constexpr_to_value(axis) return semantic.argmax(input, axis, _builder) @builtin @_add_reduction_docstr("minimum") def min(input, axis, _builder=None): axis = _constexpr_to_value(axis) return semantic.min(input, axis, _builder) @builtin @_add_reduction_docstr("minimum index") def argmin(input, axis, _builder=None): axis = _constexpr_to_value(axis) return semantic.argmin(input, axis, _builder) @builtin @_add_reduction_docstr("sum") def sum(input, axis, _builder=None): axis = _constexpr_to_value(axis) return semantic.sum(input, axis, _builder) @builtin @_add_reduction_docstr("xor sum") def xor_sum(input, axis, _builder=None): axis = _constexpr_to_value(axis) return semantic.xor_sum(input, axis, _builder) # ----------------------- # Internal for debugging # ----------------------- @builtin def debug_barrier(_builder=None): return semantic.debug_barrier(_builder) @builtin def multiple_of(input, values, _builder=None): """ Let the compiler knows that the values in :code:`input` are all multiples of :code:`value`. 
""" if isinstance(values, constexpr): values = [values] for i, d in enumerate(values): if not isinstance(d, constexpr): raise TypeError(f"values element {i} must have type `constexpr`") if not isinstance(d.value, int): raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") values = [x.value for x in values] return semantic.multiple_of(input, values) @builtin def max_contiguous(input, values, _builder=None): """ Let the compiler knows that the `value` first values in :code:`input` are contiguous. """ if isinstance(values, constexpr): values = [values] for i, d in enumerate(values): if not isinstance(d, constexpr): raise TypeError(f"values element {i} must have type `constexpr`") if not isinstance(d.value, int): raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]") values = [x.value for x in values] return semantic.max_contiguous(input, values) # ----------------------- # Standard library # ----------------------- @triton.jit def abs(x): return where(x >= 0, x, -x) @triton.jit def cdiv(x, div): """ Computes the ceiling division of :code:`x` by :code:`div` :param x: the input number :type input: Block :param div: the divisor :param div: Block """ return (x + div - 1) // div @triton.jit def minimum(x, y): """ Computes the element-wise minimum of :code:`x` and :code:`y`. :param input: the first input tensor :type input: Block :param other: the second input tensor :type other: Block """ return triton.language.where(x < y, x, y) @triton.jit def maximum(x, y): """ Computes the element-wise maximum of :code:`x` and :code:`y`. :param input: the first input tensor :type input: Block :param other: the second input tensor :type other: Block """ return triton.language.where(x > y, x, y) @triton.jit @_add_math_1arg_docstr("sigmoid") def sigmoid(x): return 1 / (1 + triton.language.exp(-x)) @triton.jit @_add_math_1arg_docstr("softmax") def softmax(x, ieee_rounding=False): z = x - triton.language.max(x, 0) num = triton.language.exp(z) den = triton.language.sum(num, 0) return fdiv(num, den, ieee_rounding) @triton.jit def ravel(x): """ Returns a contiguous flattened view of :code:`x` :param x: the input tensor :type x: Block """ return triton.language.view(x, [x.numel]) @triton.jit def swizzle2d(i, j, size_i, size_j, size_g): """ Transforms indices of a row-major size_i*size_j matrix into those of one where indices are row major for each group of size_j rows. For example, for size_i = size_j = 4 and size_g = 2, it will transform [[0 , 1 , 2 , 3 ], [4 , 5 , 6 , 7 ], [8 , 9 , 10, 11], [12, 13, 14, 15]] into [[0, 2, 4 , 6 ], [1, 3, 5 , 7 ], [8, 10, 12, 14], [9, 11, 13, 15]] """ # "unrolled index in array" ij = i * size_j + j # number of elements in `size_g` groups # of `size_j` columns size_gj = size_g * size_j # index of the group in which (i,j) is group_id = ij // size_gj # row-index of the first element of this group off_i = group_id * size_g # last group may have fewer rows size_g = minimum(size_i - off_i, size_g) # new row and column indices new_i = off_i + (ij % size_g) new_j = (ij % size_gj) // size_g return new_i, new_j @triton.jit def zeros(shape, dtype): """ Returns a tensor filled with the scalar value 0 for the given :code:`shape` and :code:`dtype`. 
@triton.jit
def zeros(shape, dtype):
    """
    Returns a tensor filled with the scalar value 0 for the given :code:`shape` and :code:`dtype`.

    :param shape: Shape of the new array, e.g., (8, 16) or (8, )
    :type shape: tuple of ints
    :param dtype: Data-type of the new array, e.g., :code:`tl.float16`
    :type dtype: DType
    """
    return full(shape, 0, dtype)


@triton.jit
def zeros_like(input):
    return zeros(input.shape, input.dtype)


@builtin
def printf(prefix, *args, _builder=None):
    import string
    new_prefix = prefix
    if isinstance(prefix, constexpr):
        new_prefix = prefix.value
    assert isinstance(new_prefix, str), f"{new_prefix} is not a string"
    b_ascii = True
    for ch in new_prefix:
        if ch not in string.printable:
            b_ascii = False
            break
    assert b_ascii, f"{new_prefix} is not an ascii string"
    new_args = []
    for arg in args:
        new_args.append(_to_tensor(arg, _builder))
    return semantic.printf(new_prefix, new_args, _builder)


# -----------------------
# Iterators
# -----------------------

class static_range:
    """
    Range iterator whose bounds must be compile-time constants (:code:`constexpr`).
    Loops written over a :code:`static_range` are unrolled by the Triton compiler,
    so it can only be used inside :code:`@triton.jit`'d functions.
    """

    def __init__(self, arg1, arg2=None, step=None):
        assert isinstance(arg1, constexpr)
        if step is None:
            self.step = constexpr(1)
        else:
            assert isinstance(step, constexpr)
            self.step = step
        if arg2 is None:
            self.start = constexpr(0)
            self.end = arg1
        else:
            assert isinstance(arg2, constexpr)
            self.start = arg1
            self.end = arg2

    def __iter__(self):
        raise RuntimeError("static_range can only be used in @triton.jit'd functions")

    def __next__(self):
        raise RuntimeError("static_range can only be used in @triton.jit'd functions")
triton-2.0.0/python/triton/language/extern.py000066400000000000000000000063021440023377100213340ustar00rootroot00000000000000from __future__ import annotations  # remove after python 3.11

from . import core, semantic


def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, ret_shape: tuple, _builder=None):
    '''
    Dispatch a function to a library

    :param func: the function to dispatch
    :param lib_name: the name of the library
    :param lib_path: the path of the library
    :param args: the arguments of the function
    :param arg_type_symbol_dict: the type of the arguments
    :param ret_shape: the shape of the return value
    :param _builder: the builder

    :return: the return value of the function
    '''
    if len(arg_type_symbol_dict) == 0:
        raise ValueError("arg_type_symbol_dict is empty")

    num_args = len(list(arg_type_symbol_dict.keys())[0])
    if len(args) != num_args:
        raise ValueError(f"length of input args does not match. "
                         f"Expect {num_args}, got {len(args)}")

    arg_types = []
    arg_list = []
    for arg in args:
        if isinstance(arg, core.tensor):
            arg_types.append(arg.dtype)
            arg_list.append(arg.handle)
        else:
            arg_types.append(type(arg))
            arg_list.append(arg)
    arg_types = tuple(arg_types)

    if arg_types not in arg_type_symbol_dict:
        raise ValueError(f"input arg type does not match. "
f"Expect one of {arg_type_symbol_dict.keys()}, got {arg_types}") else: symbol = arg_type_symbol_dict[arg_types][0] ret_type = arg_type_symbol_dict[arg_types][1] if ret_shape: ret_type = core.block_type(ret_type, ret_shape) return core.tensor(func(lib_name, lib_path, symbol, arg_list, ret_type.to_ir(_builder)), ret_type) def elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, _builder=None): ''' Dispatch an elementwise function to a library :param lib_name: the name of the library :param lib_path: the path of the library :param args: the arguments of the function :param arg_type_symbol_dict: the type of the arguments :param _builder: the builder :return: the return value of the function ''' dispatch_args = args.copy() all_scalar = True ret_shape = None for i in range(len(dispatch_args)): dispatch_args[i] = core._to_tensor(dispatch_args[i], _builder) if dispatch_args[i].type.is_block(): all_scalar = False if not all_scalar: broadcast_arg = dispatch_args[0] # Get the broadcast shape over all the arguments for i, item in enumerate(dispatch_args): _, broadcast_arg = semantic.binary_op_type_checking_impl( item, broadcast_arg, _builder) # Change the shape of each argument based on the broadcast shape for i in range(len(dispatch_args)): dispatch_args[i], _ = semantic.binary_op_type_checking_impl( dispatch_args[i], broadcast_arg, _builder) ret_shape = broadcast_arg.shape func = getattr(_builder, "create_external_elementwise") return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, _builder) triton-2.0.0/python/triton/language/libdevice.py000066400000000000000000002014101440023377100217520ustar00rootroot00000000000000import os from .. import impl from . import core, extern LIBDEVICE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "third_party", "cuda", "lib", "libdevice.10.bc") @impl.extern def clz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_clz", core.dtype("int32")), (core.dtype("int64"),): ("__nv_clzll", core.dtype("int32")), }, _builder) @impl.extern def popc(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_popc", core.dtype("int32")), (core.dtype("int64"),): ("__nv_popcll", core.dtype("int32")), }, _builder) @impl.extern def byte_perm(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("int32"), core.dtype("int32"), core.dtype("int32"),): ("__nv_byte_perm", core.dtype("int32")), }, _builder) @impl.extern def min(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("int32"),): ("__nv_min", core.dtype("int32")), (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umin", core.dtype("uint32")), (core.dtype("int64"), core.dtype("int64"),): ("__nv_llmin", core.dtype("int64")), (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_ullmin", core.dtype("uint64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fminf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmin", core.dtype("fp64")), }, _builder) @impl.extern def max(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("int32"),): ("__nv_max", core.dtype("int32")), (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umax", core.dtype("uint32")), 
(core.dtype("int64"), core.dtype("int64"),): ("__nv_llmax", core.dtype("int64")), (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_ullmax", core.dtype("uint64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaxf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmax", core.dtype("fp64")), }, _builder) @impl.extern def mulhi(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("int32"),): ("__nv_mulhi", core.dtype("int32")), (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umulhi", core.dtype("uint32")), (core.dtype("int64"), core.dtype("int64"),): ("__nv_mul64hi", core.dtype("int64")), (core.dtype("uint64"), core.dtype("uint64"),): ("__nv_umul64hi", core.dtype("uint64")), }, _builder) @impl.extern def mul24(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("int32"),): ("__nv_mul24", core.dtype("int32")), (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_umul24", core.dtype("uint32")), }, _builder) @impl.extern def brev(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_brev", core.dtype("int32")), (core.dtype("int64"),): ("__nv_brevll", core.dtype("int64")), }, _builder) @impl.extern def sad(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("int32"), core.dtype("int32"), core.dtype("uint32"),): ("__nv_sad", core.dtype("int32")), (core.dtype("uint32"), core.dtype("uint32"), core.dtype("uint32"),): ("__nv_usad", core.dtype("uint32")), }, _builder) @impl.extern def abs(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_abs", core.dtype("int32")), (core.dtype("int64"),): ("__nv_llabs", core.dtype("int64")), (core.dtype("fp32"),): ("__nv_fabsf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_fabs", core.dtype("fp64")), }, _builder) @impl.extern def floor(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_floorf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_floor", core.dtype("fp64")), }, _builder) @impl.extern def rcp64h(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_rcp64h", core.dtype("fp64")), }, _builder) @impl.extern def rsqrt(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_rsqrtf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_rsqrt", core.dtype("fp64")), }, _builder) @impl.extern def ceil(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_ceil", core.dtype("fp64")), (core.dtype("fp32"),): ("__nv_ceilf", core.dtype("fp32")), }, _builder) @impl.extern def trunc(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_trunc", core.dtype("fp64")), (core.dtype("fp32"),): ("__nv_truncf", core.dtype("fp32")), }, _builder) @impl.extern def exp2(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_exp2f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_exp2", core.dtype("fp64")), }, _builder) @impl.extern def saturatef(arg0, _builder=None): return 
extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_saturatef", core.dtype("fp32")), }, _builder) @impl.extern def fma_rn(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rn", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rn", core.dtype("fp64")), }, _builder) @impl.extern def fma_rz(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rz", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rz", core.dtype("fp64")), }, _builder) @impl.extern def fma_rd(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_rd", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_rd", core.dtype("fp64")), }, _builder) @impl.extern def fma_ru(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf_ru", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma_ru", core.dtype("fp64")), }, _builder) @impl.extern def fast_dividef(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fast_fdividef", core.dtype("fp32")), }, _builder) @impl.extern def div_rn(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rn", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rn", core.dtype("fp64")), }, _builder) @impl.extern def div_rz(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rz", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rz", core.dtype("fp64")), }, _builder) @impl.extern def div_rd(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_rd", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_rd", core.dtype("fp64")), }, _builder) @impl.extern def div_ru(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdiv_ru", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_ddiv_ru", core.dtype("fp64")), }, _builder) @impl.extern def rcp_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_frcp_rn", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_drcp_rn", core.dtype("fp64")), }, _builder) @impl.extern def rcp_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_frcp_rz", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_drcp_rz", core.dtype("fp64")), }, _builder) @impl.extern def rcp_rd(arg0, _builder=None): return extern.elementwise("libdevice", 
LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_frcp_rd", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_drcp_rd", core.dtype("fp64")), }, _builder) @impl.extern def rcp_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_frcp_ru", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_drcp_ru", core.dtype("fp64")), }, _builder) @impl.extern def sqrt_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fsqrt_rn", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_dsqrt_rn", core.dtype("fp64")), }, _builder) @impl.extern def sqrt_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fsqrt_rz", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_dsqrt_rz", core.dtype("fp64")), }, _builder) @impl.extern def sqrt_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fsqrt_rd", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_dsqrt_rd", core.dtype("fp64")), }, _builder) @impl.extern def sqrt_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fsqrt_ru", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_dsqrt_ru", core.dtype("fp64")), }, _builder) @impl.extern def sqrt(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_sqrtf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_sqrt", core.dtype("fp64")), }, _builder) @impl.extern def add_rn(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rn", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rn", core.dtype("fp32")), }, _builder) @impl.extern def add_rz(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rz", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rz", core.dtype("fp32")), }, _builder) @impl.extern def add_rd(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_rd", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_rd", core.dtype("fp32")), }, _builder) @impl.extern def add_ru(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dadd_ru", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fadd_ru", core.dtype("fp32")), }, _builder) @impl.extern def mul_rn(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rn", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rn", core.dtype("fp32")), }, _builder) @impl.extern def mul_rz(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rz", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rz", core.dtype("fp32")), }, _builder) @impl.extern def mul_rd(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, 
[arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_rd", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_rd", core.dtype("fp32")), }, _builder) @impl.extern def mul_ru(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dmul_ru", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmul_ru", core.dtype("fp32")), }, _builder) @impl.extern def double2float_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2float_rn", core.dtype("fp32")), }, _builder) @impl.extern def double2float_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2float_rz", core.dtype("fp32")), }, _builder) @impl.extern def double2float_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2float_rd", core.dtype("fp32")), }, _builder) @impl.extern def double2float_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2float_ru", core.dtype("fp32")), }, _builder) @impl.extern def double2int_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2int_rn", core.dtype("int32")), }, _builder) @impl.extern def double2int_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2int_rz", core.dtype("int32")), }, _builder) @impl.extern def double2int_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2int_rd", core.dtype("int32")), }, _builder) @impl.extern def double2int_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2int_ru", core.dtype("int32")), }, _builder) @impl.extern def double2uint_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2uint_rn", core.dtype("int32")), }, _builder) @impl.extern def double2uint_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2uint_rz", core.dtype("int32")), }, _builder) @impl.extern def double2uint_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2uint_rd", core.dtype("int32")), }, _builder) @impl.extern def double2uint_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2uint_ru", core.dtype("int32")), }, _builder) @impl.extern def int2double_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_int2double_rn", core.dtype("fp64")), }, _builder) @impl.extern def uint2double_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint32"),): ("__nv_uint2double_rn", core.dtype("fp64")), }, _builder) @impl.extern def float2int_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2int_rn", core.dtype("int32")), }, _builder) @impl.extern def float2int_rz(arg0, 
_builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2int_rz", core.dtype("int32")), }, _builder) @impl.extern def float2int_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2int_rd", core.dtype("int32")), }, _builder) @impl.extern def float2int_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2int_ru", core.dtype("int32")), }, _builder) @impl.extern def float2uint_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2uint_rn", core.dtype("int32")), }, _builder) @impl.extern def float2uint_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2uint_rz", core.dtype("int32")), }, _builder) @impl.extern def float2uint_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2uint_rd", core.dtype("int32")), }, _builder) @impl.extern def float2uint_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2uint_ru", core.dtype("int32")), }, _builder) @impl.extern def int2float_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_int2float_rn", core.dtype("fp32")), }, _builder) @impl.extern def int2float_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_int2float_rz", core.dtype("fp32")), }, _builder) @impl.extern def int2float_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_int2float_rd", core.dtype("fp32")), }, _builder) @impl.extern def int2float_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_int2float_ru", core.dtype("fp32")), }, _builder) @impl.extern def uint2float_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint32"),): ("__nv_uint2float_rn", core.dtype("fp32")), }, _builder) @impl.extern def uint2float_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint32"),): ("__nv_uint2float_rz", core.dtype("fp32")), }, _builder) @impl.extern def uint2float_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint32"),): ("__nv_uint2float_rd", core.dtype("fp32")), }, _builder) @impl.extern def uint2float_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint32"),): ("__nv_uint2float_ru", core.dtype("fp32")), }, _builder) @impl.extern def hiloint2double(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("int32"),): ("__nv_hiloint2double", core.dtype("fp64")), }, _builder) @impl.extern def double2loint(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2loint", core.dtype("int32")), }, _builder) @impl.extern def double2hiint(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2hiint", 
core.dtype("int32")), }, _builder) @impl.extern def float2ll_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ll_rn", core.dtype("int64")), }, _builder) @impl.extern def float2ll_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ll_rz", core.dtype("int64")), }, _builder) @impl.extern def float2ll_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ll_rd", core.dtype("int64")), }, _builder) @impl.extern def float2ll_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ll_ru", core.dtype("int64")), }, _builder) @impl.extern def float2ull_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ull_rn", core.dtype("int64")), }, _builder) @impl.extern def float2ull_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ull_rz", core.dtype("int64")), }, _builder) @impl.extern def float2ull_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ull_rd", core.dtype("int64")), }, _builder) @impl.extern def float2ull_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float2ull_ru", core.dtype("int64")), }, _builder) @impl.extern def double2ll_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ll_rn", core.dtype("int64")), }, _builder) @impl.extern def double2ll_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ll_rz", core.dtype("int64")), }, _builder) @impl.extern def double2ll_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ll_rd", core.dtype("int64")), }, _builder) @impl.extern def double2ll_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ll_ru", core.dtype("int64")), }, _builder) @impl.extern def double2ull_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ull_rn", core.dtype("int64")), }, _builder) @impl.extern def double2ull_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ull_rz", core.dtype("int64")), }, _builder) @impl.extern def double2ull_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ull_rd", core.dtype("int64")), }, _builder) @impl.extern def double2ull_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double2ull_ru", core.dtype("int64")), }, _builder) @impl.extern def ll2float_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_ll2float_rn", core.dtype("fp32")), }, _builder) @impl.extern def ll2float_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): 
("__nv_ll2float_rz", core.dtype("fp32")), }, _builder) @impl.extern def ll2float_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_ll2float_rd", core.dtype("fp32")), }, _builder) @impl.extern def ll2float_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_ll2float_ru", core.dtype("fp32")), }, _builder) @impl.extern def ull2float_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2float_rn", core.dtype("fp32")), }, _builder) @impl.extern def ull2float_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2float_rz", core.dtype("fp32")), }, _builder) @impl.extern def ull2float_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2float_rd", core.dtype("fp32")), }, _builder) @impl.extern def ull2float_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2float_ru", core.dtype("fp32")), }, _builder) @impl.extern def ll2double_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_ll2double_rn", core.dtype("fp64")), }, _builder) @impl.extern def ll2double_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_ll2double_rz", core.dtype("fp64")), }, _builder) @impl.extern def ll2double_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_ll2double_rd", core.dtype("fp64")), }, _builder) @impl.extern def ll2double_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_ll2double_ru", core.dtype("fp64")), }, _builder) @impl.extern def ull2double_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2double_rn", core.dtype("fp64")), }, _builder) @impl.extern def ull2double_rz(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2double_rz", core.dtype("fp64")), }, _builder) @impl.extern def ull2double_rd(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2double_rd", core.dtype("fp64")), }, _builder) @impl.extern def ull2double_ru(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint64"),): ("__nv_ull2double_ru", core.dtype("fp64")), }, _builder) @impl.extern def int_as_float(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_int_as_float", core.dtype("fp32")), }, _builder) @impl.extern def float_as_int(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_float_as_int", core.dtype("int32")), }, _builder) @impl.extern def uint_as_float(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("uint32"),): ("__nv_uint_as_float", core.dtype("fp32")), }, _builder) @impl.extern def float_as_uint(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], 
{(core.dtype("fp32"),): ("__nv_float_as_uint", core.dtype("int32")), }, _builder) @impl.extern def longlong_as_double(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int64"),): ("__nv_longlong_as_double", core.dtype("fp64")), }, _builder) @impl.extern def double_as_longlong(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_double_as_longlong", core.dtype("int64")), }, _builder) @impl.extern def fast_sinf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_sinf", core.dtype("fp32")), }, _builder) @impl.extern def fast_cosf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_cosf", core.dtype("fp32")), }, _builder) @impl.extern def fast_log2f(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_log2f", core.dtype("fp32")), }, _builder) @impl.extern def fast_logf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_logf", core.dtype("fp32")), }, _builder) @impl.extern def fast_expf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_expf", core.dtype("fp32")), }, _builder) @impl.extern def fast_tanf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_tanf", core.dtype("fp32")), }, _builder) @impl.extern def fast_exp10f(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_exp10f", core.dtype("fp32")), }, _builder) @impl.extern def fast_log10f(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_fast_log10f", core.dtype("fp32")), }, _builder) @impl.extern def fast_powf(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fast_powf", core.dtype("fp32")), }, _builder) @impl.extern def hadd(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("int32"),): ("__nv_hadd", core.dtype("int32")), (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_uhadd", core.dtype("uint32")), }, _builder) @impl.extern def rhadd(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("int32"),): ("__nv_rhadd", core.dtype("int32")), (core.dtype("uint32"), core.dtype("uint32"),): ("__nv_urhadd", core.dtype("uint32")), }, _builder) @impl.extern def sub_rn(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rn", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rn", core.dtype("fp64")), }, _builder) @impl.extern def sub_rz(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rz", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rz", core.dtype("fp64")), }, _builder) @impl.extern def sub_rd(arg0, arg1, _builder=None): return 
extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_rd", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_rd", core.dtype("fp64")), }, _builder) @impl.extern def sub_ru(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fsub_ru", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_dsub_ru", core.dtype("fp64")), }, _builder) @impl.extern def rsqrt_rn(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_frsqrt_rn", core.dtype("fp32")), }, _builder) @impl.extern def ffs(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("int32"),): ("__nv_ffs", core.dtype("int32")), (core.dtype("int64"),): ("__nv_ffsll", core.dtype("int32")), }, _builder) @impl.extern def rint(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_rintf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_rint", core.dtype("fp64")), }, _builder) @impl.extern def llrint(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_llrintf", core.dtype("int64")), (core.dtype("fp64"),): ("__nv_llrint", core.dtype("int64")), }, _builder) @impl.extern def nearbyint(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_nearbyintf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_nearbyint", core.dtype("fp64")), }, _builder) @impl.extern def isnan(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_isnanf", core.dtype("int32")), (core.dtype("fp64"),): ("__nv_isnand", core.dtype("int32")), }, _builder) @impl.extern def signbit(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_signbitf", core.dtype("int32")), (core.dtype("fp64"),): ("__nv_signbitd", core.dtype("int32")), }, _builder) @impl.extern def copysign(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_copysignf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_copysign", core.dtype("fp64")), }, _builder) @impl.extern def finitef(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_finitef", core.dtype("int32")), }, _builder) @impl.extern def isinf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_isinff", core.dtype("int32")), (core.dtype("fp64"),): ("__nv_isinfd", core.dtype("int32")), }, _builder) @impl.extern def nextafter(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_nextafterf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_nextafter", core.dtype("fp64")), }, _builder) @impl.extern def sin(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_sinf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_sin", core.dtype("fp64")), }, _builder) @impl.extern def cos(arg0, _builder=None): return 
extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_cosf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_cos", core.dtype("fp64")), }, _builder) @impl.extern def sinpi(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_sinpif", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_sinpi", core.dtype("fp64")), }, _builder) @impl.extern def cospi(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_cospif", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_cospi", core.dtype("fp64")), }, _builder) @impl.extern def tan(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_tanf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_tan", core.dtype("fp64")), }, _builder) @impl.extern def log2(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_log2f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_log2", core.dtype("fp64")), }, _builder) @impl.extern def exp(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_expf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_exp", core.dtype("fp64")), }, _builder) @impl.extern def exp10(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_exp10f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_exp10", core.dtype("fp64")), }, _builder) @impl.extern def cosh(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_coshf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_cosh", core.dtype("fp64")), }, _builder) @impl.extern def sinh(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_sinhf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_sinh", core.dtype("fp64")), }, _builder) @impl.extern def tanh(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_tanhf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_tanh", core.dtype("fp64")), }, _builder) @impl.extern def atan2(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_atan2f", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_atan2", core.dtype("fp64")), }, _builder) @impl.extern def atan(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_atanf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_atan", core.dtype("fp64")), }, _builder) @impl.extern def asin(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_asinf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_asin", core.dtype("fp64")), }, _builder) @impl.extern def acos(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_acosf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_acos", core.dtype("fp64")), }, _builder) @impl.extern def log(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_logf", core.dtype("fp32")), (core.dtype("fp64"),): 
("__nv_log", core.dtype("fp64")), }, _builder) @impl.extern def log10(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_log10f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_log10", core.dtype("fp64")), }, _builder) @impl.extern def log1p(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_log1pf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_log1p", core.dtype("fp64")), }, _builder) @impl.extern def acosh(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_acoshf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_acosh", core.dtype("fp64")), }, _builder) @impl.extern def asinh(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_asinhf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_asinh", core.dtype("fp64")), }, _builder) @impl.extern def atanh(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_atanhf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_atanh", core.dtype("fp64")), }, _builder) @impl.extern def expm1(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_expm1f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_expm1", core.dtype("fp64")), }, _builder) @impl.extern def hypot(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_hypotf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_hypot", core.dtype("fp64")), }, _builder) @impl.extern def rhypot(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rhypotf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rhypot", core.dtype("fp64")), }, _builder) @impl.extern def norm3d(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_norm3df", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_norm3d", core.dtype("fp64")), }, _builder) @impl.extern def rnorm3d(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rnorm3df", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_rnorm3d", core.dtype("fp64")), }, _builder) @impl.extern def norm4d(arg0, arg1, arg2, arg3, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, arg3, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_norm4df", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_norm4d", core.dtype("fp64")), }, _builder) @impl.extern def rnorm4d(arg0, arg1, arg2, arg3, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, arg3, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_rnorm4df", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), 
core.dtype("fp64"),): ("__nv_rnorm4d", core.dtype("fp64")), }, _builder) @impl.extern def cbrt(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_cbrtf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_cbrt", core.dtype("fp64")), }, _builder) @impl.extern def rcbrt(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_rcbrtf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_rcbrt", core.dtype("fp64")), }, _builder) @impl.extern def j0(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_j0f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_j0", core.dtype("fp64")), }, _builder) @impl.extern def j1(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_j1f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_j1", core.dtype("fp64")), }, _builder) @impl.extern def y0(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_y0f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_y0", core.dtype("fp64")), }, _builder) @impl.extern def y1(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_y1f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_y1", core.dtype("fp64")), }, _builder) @impl.extern def yn(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("fp32"),): ("__nv_ynf", core.dtype("fp32")), (core.dtype("int32"), core.dtype("fp64"),): ("__nv_yn", core.dtype("fp64")), }, _builder) @impl.extern def jn(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("int32"), core.dtype("fp32"),): ("__nv_jnf", core.dtype("fp32")), (core.dtype("int32"), core.dtype("fp64"),): ("__nv_jn", core.dtype("fp64")), }, _builder) @impl.extern def cyl_bessel_i0(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_cyl_bessel_i0f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_cyl_bessel_i0", core.dtype("fp64")), }, _builder) @impl.extern def cyl_bessel_i1(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_cyl_bessel_i1f", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_cyl_bessel_i1", core.dtype("fp64")), }, _builder) @impl.extern def erf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_erff", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_erf", core.dtype("fp64")), }, _builder) @impl.extern def erfinv(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_erfinvf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_erfinv", core.dtype("fp64")), }, _builder) @impl.extern def erfc(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_erfcf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_erfc", core.dtype("fp64")), }, _builder) @impl.extern def erfcx(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_erfcxf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_erfcx", core.dtype("fp64")), 
}, _builder) @impl.extern def erfcinv(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_erfcinvf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_erfcinv", core.dtype("fp64")), }, _builder) @impl.extern def normcdfinv(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_normcdfinvf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_normcdfinv", core.dtype("fp64")), }, _builder) @impl.extern def normcdf(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_normcdff", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_normcdf", core.dtype("fp64")), }, _builder) @impl.extern def lgamma(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_lgammaf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_lgamma", core.dtype("fp64")), }, _builder) @impl.extern def ldexp(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_ldexpf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("int32"),): ("__nv_ldexp", core.dtype("fp64")), }, _builder) @impl.extern def scalbn(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_scalbnf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("int32"),): ("__nv_scalbn", core.dtype("fp64")), }, _builder) @impl.extern def fmod(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmodf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fmod", core.dtype("fp64")), }, _builder) @impl.extern def remainder(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_remainderf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_remainder", core.dtype("fp64")), }, _builder) @impl.extern def fma(arg0, arg1, arg2, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, arg2, ], {(core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fmaf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fma", core.dtype("fp64")), }, _builder) @impl.extern def pow(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("int32"),): ("__nv_powif", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("int32"),): ("__nv_powi", core.dtype("fp64")), (core.dtype("fp32"), core.dtype("fp32"),): ("__nv_powf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_pow", core.dtype("fp64")), }, _builder) @impl.extern def tgamma(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_tgammaf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_tgamma", core.dtype("fp64")), }, _builder) @impl.extern def round(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_roundf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_round", core.dtype("fp64")), }, _builder) @impl.extern def llround(arg0, _builder=None): return 
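# A minimal usage sketch for the wrappers above, assuming they are re-exported as
# `tl.libdevice` (as in the Triton 2.0 extern-function tutorial); each call dispatches
# to the `__nv_*` symbol whose signature matches the argument dtypes (fp32 -> __nv_powf,
# fp64 -> __nv_pow, ...). Kernel name and block size here are illustrative.
import triton
import triton.language as tl

@triton.jit
def erf_pow_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    y = tl.load(y_ptr + offs, mask=mask)
    out = tl.libdevice.erf(x) + tl.libdevice.pow(x, y)
    tl.store(out_ptr + offs, out, mask=mask)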
extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_llroundf", core.dtype("int64")), (core.dtype("fp64"),): ("__nv_llround", core.dtype("int64")), }, _builder) @impl.extern def fdim(arg0, arg1, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, arg1, ], {(core.dtype("fp32"), core.dtype("fp32"),): ("__nv_fdimf", core.dtype("fp32")), (core.dtype("fp64"), core.dtype("fp64"),): ("__nv_fdim", core.dtype("fp64")), }, _builder) @impl.extern def ilogb(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_ilogbf", core.dtype("int32")), (core.dtype("fp64"),): ("__nv_ilogb", core.dtype("int32")), }, _builder) @impl.extern def logb(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp32"),): ("__nv_logbf", core.dtype("fp32")), (core.dtype("fp64"),): ("__nv_logb", core.dtype("fp64")), }, _builder) @impl.extern def isfinited(arg0, _builder=None): return extern.elementwise("libdevice", LIBDEVICE_PATH, [arg0, ], {(core.dtype("fp64"),): ("__nv_isfinited", core.dtype("int32")), }, _builder) triton-2.0.0/python/triton/language/random.py000066400000000000000000000127261440023377100213160ustar00rootroot00000000000000import triton from . import core as tl PHILOX_KEY_A: tl.constexpr = 0x9E3779B9 PHILOX_KEY_B: tl.constexpr = 0xBB67AE85 PHILOX_ROUND_A: tl.constexpr = 0xD2511F53 PHILOX_ROUND_B: tl.constexpr = 0xCD9E8D57 N_ROUNDS_DEFAULT = 10 # Default number of rounds for philox # ------------------- # randint # ------------------- @triton.jit def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): """ Run `n_rounds` rounds of Philox for state (c0, c1, c2, c3) and key (k0, k1). """ for _ in tl.static_range(n_rounds): # update random state A = PHILOX_ROUND_A B = PHILOX_ROUND_B _c0, _c2 = c0, c2 c0 = tl.umulhi(B, _c2) ^ c1 ^ k0 c2 = tl.umulhi(A, _c0) ^ c3 ^ k1 c1 = B * _c2 c3 = A * _c0 # raise key k0 = k0 + PHILOX_KEY_A k1 = k1 + PHILOX_KEY_B return c0, c1, c2, c3 @triton.jit def philox(seed, c0, c1, c2, c3, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): seed = seed.to(tl.uint64) seed_hi = ((seed >> 32) & 0xffffffff).to(tl.uint32) seed_lo = (seed & 0xffffffff).to(tl.uint32) c0 = c0.to(tl.uint32, bitcast=True) c1 = c1.to(tl.uint32, bitcast=True) c2 = c2.to(tl.uint32, bitcast=True) c3 = c3.to(tl.uint32, bitcast=True) return philox_impl(c0, c1, c2, c3, seed_lo, seed_hi, n_rounds) @triton.jit def randint(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): """ Given a :code:`seed` scalar and an :code:`offset` block, returns a single block of random :code:`int32`. If you need multiple streams of random numbers, using `randint4x` is likely to be faster than calling `randint` 4 times. :param seed: The seed for generating random numbers. :param offsets: The offsets to generate random numbers for. """ ret, _, _, _ = randint4x(seed, offset, n_rounds) return ret @triton.jit def randint4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): """ Given a :code:`seed` scalar and an :code:`offset` block, returns four blocks of random :code:`int32`. This is the maximally efficient entry point to Triton's Philox pseudo-random number generator. :param seed: The seed for generating random numbers. :param offsets: The offsets to generate random numbers for. 
""" # _0 = tl.zeros(offset.shape, offset.dtype) _0 = offset * 0 return philox(seed, offset, _0, _0, _0, n_rounds) # ------------------- # rand # ------------------- # @triton.jit # def uint32_to_uniform_float(x): # """ # Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1). # """ # two_to_the_minus_32: tl.constexpr = 2.328306e-10 # return x * two_to_the_minus_32 @triton.jit def uint32_to_uniform_float(x): """ Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1). """ x = x.to(tl.int32, bitcast=True) # maximum value such that `MAX_INT * scale < 1.0` (with float rounding) scale = 4.6566127342e-10 x = tl.where(x < 0, -x - 1, x) return x * scale @triton.jit def rand(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): """ Given a :code:`seed` scalar and an :code:`offset` block, returns a block of random :code:`float32` in :math:`U(0, 1)` :param seed: The seed for generating random numbers. :param offsets: The offsets to generate random numbers for. """ offset = offset.to(tl.uint32, bitcast=True) source = randint(seed, offset, n_rounds) return uint32_to_uniform_float(source) @triton.jit def rand4x(seed, offsets, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): """ Given a :code:`seed` scalar and an :code:`offsets` block, returns a 4 blocks of random :code:`float32` in :math:`U(0, 1)` :param seed: The seed for generating random numbers. :param offsets: The offsets to generate random numbers for. """ offsets = offsets.to(tl.uint32, bitcast=True) i1, i2, i3, i4 = randint4x(seed, offsets, n_rounds) u1 = uint32_to_uniform_float(i1) u2 = uint32_to_uniform_float(i2) u3 = uint32_to_uniform_float(i3) u4 = uint32_to_uniform_float(i4) return u1, u2, u3, u4 # ------------------- # randn # ------------------- @triton.jit def pair_uniform_to_normal(u1, u2): """Box-Muller transform""" u1 = tl.maximum(1.0e-7, u1) th = 6.283185307179586 * u2 r = tl.sqrt(-2.0 * tl.log(u1)) return r * tl.cos(th), r * tl.sin(th) @triton.jit def randn(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): """ Given a :code:`seed` scalar and an :code:`offset` block, returns a block of random :code:`float32` in :math:`\\mathcal{N}(0, 1)` :param seed: The seed for generating random numbers. :param offsets: The offsets to generate random numbers for. """ i1, i2, _, _ = randint4x(seed, offset, n_rounds) u1 = uint32_to_uniform_float(i1) u2 = uint32_to_uniform_float(i2) n1, _ = pair_uniform_to_normal(u1, u2) return n1 @triton.jit def randn4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT): """ Given a :code:`seed` scalar and an :code:`offset` block, returns a 4 blocks of random :code:`float32` in :math:`\\mathcal{N}(0, 1)` :param seed: The seed for generating random numbers. :param offsets: The offsets to generate random numbers for. """ u1, u2, u3, u4 = rand4x(seed, offset, n_rounds) n1, n2 = pair_uniform_to_normal(u1, u2) n3, n4 = pair_uniform_to_normal(u3, u4) return n1, n2, n3, n4 triton-2.0.0/python/triton/language/semantic.py000066400000000000000000001402131440023377100216320ustar00rootroot00000000000000from __future__ import annotations # remove after python 3.11 from typing import List, Optional, Tuple from . 
import core as tl from triton._C.libtriton.triton import ir # Create custom exception that prints message "hello" class IncompatibleTypeErrorImpl(Exception): def __init__(self, type_a, type_b): self.type_a = type_a self.type_b = type_b self.message = "invalid operands of type " + self.type_a.__repr__() + " and " + self.type_b.__repr__() super(IncompatibleTypeErrorImpl, self).__init__(self.message) # ===----------------------------------------------------------------------===## # Programming Model # ===----------------------------------------------------------------------===## def program_id(axis: int, builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_get_program_id(axis), tl.int32) def num_programs(axis: int, builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_get_num_programs(axis), tl.int32) # ===----------------------------------------------------------------------===// # Implicit Casting Utilities # ===----------------------------------------------------------------------===// def integer_promote_impl(a_ty: tl.dtype, b_ty: tl.dtype) -> tl.dtype: a_rank = a_ty.int_bitwidth b_rank = b_ty.int_bitwidth a_sn = a_ty.int_signedness b_sn = b_ty.int_signedness # Rules for signedness taken from "Usual arithmetic conversions" on # https://en.cppreference.com/w/c/language/conversion. if a_sn == b_sn: return a_ty if a_rank > b_rank else b_ty elif a_sn == tl.dtype.SIGNEDNESS.UNSIGNED: return a_ty if a_rank >= b_rank else b_ty elif b_sn == tl.dtype.SIGNEDNESS.UNSIGNED: return b_ty if b_rank >= a_rank else a_ty assert False def computation_type_impl(a_ty: tl.dtype, b_ty: tl.dtype, div_or_mod: bool) -> tl.dtype: # 1) if one operand is double, the other is implicitly # converted to double if a_ty.is_fp64() or b_ty.is_fp64(): return tl.float64 # 2) if one operand is float, the other is implicitly # converted to float if a_ty.is_fp32() or b_ty.is_fp32(): return tl.float32 # 3 ) if one operand is half, the other is implicitly converted to half # unless we're doing / or %, which do not exist natively in PTX for fp16. # Supported PTX op: add, sub, mul, fma, neg, abs, min, max, tanh, ex2, setp if a_ty.is_fp16() or b_ty.is_fp16(): if div_or_mod: return tl.float32 else: return tl.float16 # 4) return bf16 only if both operands are of bf16 if a_ty.is_bf16() or b_ty.is_bf16(): if div_or_mod: return tl.float32 if a_ty.is_bf16() and b_ty.is_bf16(): return tl.bfloat16 return tl.float32 if not a_ty.is_int() or not b_ty.is_int(): assert False # 5 ) both operands are integer and undergo # integer promotion if div_or_mod and a_ty.int_signedness != b_ty.int_signedness: raise ValueError("Cannot use /, #, or % with " + a_ty.__repr__() + " and " + b_ty.__repr__() + " because they have different signedness;" "this is unlikely to result in a useful answer. 
Cast them to the same signedness.") return integer_promote_impl(a_ty, b_ty) # ===----------------------------------------------------------------------===// # Binary Operators # ===----------------------------------------------------------------------===// def check_ptr_type_impl(type_a: tl.dtype, type_b: tl.dtype, allow_ptr_a: bool) -> None: if type_a.is_ptr(): if not allow_ptr_a: raise IncompatibleTypeErrorImpl(type_a, type_b) # T* + U* with T != U if type_b.is_ptr() and (type_a != type_b): raise IncompatibleTypeErrorImpl(type_a, type_b) # T* + float if type_b.is_floating(): raise IncompatibleTypeErrorImpl(type_a, type_b) def binary_op_type_checking_impl(lhs: tl.tensor, rhs: tl.tensor, builder: ir.builder, allow_lhs_ptr=False, allow_rhs_ptr=False, arithmetic_check=True, div_or_mod=False ) -> Tuple[tl.tensor, tl.tensor]: # implicit broadcasting lhs, rhs = broadcast_impl_value(lhs, rhs, builder) # implicit typecasting lhs_sca_ty = lhs.type.scalar rhs_sca_ty = rhs.type.scalar check_ptr_type_impl(lhs_sca_ty, rhs_sca_ty, allow_lhs_ptr) check_ptr_type_impl(rhs_sca_ty, lhs_sca_ty, allow_rhs_ptr) if arithmetic_check and not lhs_sca_ty.is_ptr() and not rhs_sca_ty.is_ptr(): ret_sca_ty = computation_type_impl(lhs_sca_ty, rhs_sca_ty, div_or_mod) lhs = cast(lhs, ret_sca_ty, builder) rhs = cast(rhs, ret_sca_ty, builder) return lhs, rhs def add(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder, True, True) input_scalar_ty = input.type.scalar other_scalar_ty = other.type.scalar # offset + ptr # ptr + offset if other_scalar_ty.is_ptr() and not input_scalar_ty.is_ptr(): input, other = other, input if input_scalar_ty.is_ptr(): return tl.tensor(builder.create_addptr(input.handle, other.handle), input.type) # float + float elif input_scalar_ty.is_floating(): return tl.tensor(builder.create_fadd(input.handle, other.handle), input.type) # int + int elif input_scalar_ty.is_int(): return tl.tensor(builder.create_add(input.handle, other.handle), input.type) assert False def sub(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder, True, False) scalar_ty = input.type.scalar # ptr - offset if scalar_ty.is_ptr(): return tl.tensor(builder.create_addptr(input.handle, minus(other, builder).handle), input.type) # float - float if scalar_ty.is_floating(): return tl.tensor(builder.create_fsub(input.handle, other.handle), input.type) # int - int elif scalar_ty.is_int(): return tl.tensor(builder.create_sub(input.handle, other.handle), input.type) assert False def mul(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder) scalar_ty = input.type.scalar # float * float if scalar_ty.is_floating(): return tl.tensor(builder.create_fmul(input.handle, other.handle), input.type) # * int elif scalar_ty.is_int(): return tl.tensor(builder.create_mul(input.handle, other.handle), input.type) assert False def truediv(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) input_scalar_ty = input.type.scalar other_scalar_ty = other.type.scalar # float / int if input_scalar_ty.is_floating() and other_scalar_ty.is_int(): other = cast(other, input_scalar_ty, builder) # int / float elif input_scalar_ty.is_int() and other_scalar_ty.is_floating(): input = cast(input, other_scalar_ty, 
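# A few concrete outcomes of the promotion rules above (a standalone sketch; these helpers
# are module-level, so they can be exercised directly with the dtype singletons from
# triton.language):
assert integer_promote_impl(tl.int32, tl.uint32) == tl.uint32             # unsigned wins at equal rank
assert integer_promote_impl(tl.int16, tl.int64) == tl.int64               # higher rank wins
assert computation_type_impl(tl.float16, tl.float16, True) == tl.float32  # no native fp16 / or %
assert computation_type_impl(tl.bfloat16, tl.bfloat16, False) == tl.bfloat16
assert computation_type_impl(tl.int8, tl.float64, False) == tl.float64    # any fp64 operand wins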
builder) # int / int (cast to tl.float32) elif input_scalar_ty.is_int() and other_scalar_ty.is_int(): input = cast(input, tl.float32, builder) other = cast(other, tl.float32, builder) # float / float (cast to highest exponent type) elif input_scalar_ty.is_floating() and other_scalar_ty.is_floating(): if input_scalar_ty.fp_mantissa_width > other_scalar_ty.fp_mantissa_width: other = cast(other, input_scalar_ty, builder) else: input = cast(input, other_scalar_ty, builder) # unreachable else: assert False return tl.tensor(builder.create_fdiv(input.handle, other.handle), input.type) def floordiv(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) input_scalar_ty = input.type.scalar other_scalar_ty = other.type.scalar if input_scalar_ty.is_int() and other_scalar_ty.is_int(): ret_ty = integer_promote_impl(input_scalar_ty, other_scalar_ty) input = cast(input, ret_ty, builder) other = cast(other, ret_ty, builder) if ret_ty.is_int_signed(): return tl.tensor(builder.create_sdiv(input.handle, other.handle), input.type) else: return tl.tensor(builder.create_udiv(input.handle, other.handle), input.type) assert False def fdiv(input: tl.tensor, other: tl.tensor, ieee_rounding: bool, builder: ir.builder) -> tl.tensor: input_scalar_ty = input.type.scalar other_scalar_ty = other.type.scalar if not input_scalar_ty.is_floating() or not other_scalar_ty.is_floating(): raise ValueError("both operands of fdiv must have floating scalar type") input, other = binary_op_type_checking_impl(input, other, builder, False, False, False, True) ret = builder.create_fdiv(input.handle, other.handle) return tl.tensor(ret, input.type) def mod(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) scalar_ty = input.type.scalar other_scalar_ty = other.type.scalar # float % float if scalar_ty.is_floating(): # input - input.div(other, rounding_mode="floor") * other ret = sub(input, mul(floor(fdiv(input, other, False, builder), builder), other, builder), builder) return ret # % int elif scalar_ty.is_int(): if scalar_ty.int_signedness != other_scalar_ty.int_signedness: raise ValueError("Cannot mod " + scalar_ty.__repr__() + " by " + other_scalar_ty.__repr__() + " " "because they have different signedness;" "this is unlikely to result in a useful answer. 
Cast them to the same signedness.") if scalar_ty.is_int_signed(): return tl.tensor(builder.create_srem(input.handle, other.handle), input.type) else: return tl.tensor(builder.create_urem(input.handle, other.handle), input.type) assert False ############## # bitwise ops ############## def bitwise_op_type_checking_impl(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> Tuple[tl.tensor, tl.tensor]: input, other = binary_op_type_checking_impl(input, other, builder, False, False, False) input_sca_ty = input.type.scalar other_sca_ty = other.type.scalar if not input_sca_ty.is_int() or not other_sca_ty.is_int(): raise IncompatibleTypeErrorImpl(input_sca_ty, other_sca_ty) ret_sca_ty = integer_promote_impl(input_sca_ty, other_sca_ty) if ret_sca_ty != input_sca_ty: input = cast(input, ret_sca_ty, builder) if ret_sca_ty != other_sca_ty: other = cast(other, ret_sca_ty, builder) return input, other def and_(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = bitwise_op_type_checking_impl(input, other, builder) return tl.tensor(builder.create_and(input.handle, other.handle), input.type) def or_(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = bitwise_op_type_checking_impl(input, other, builder) return tl.tensor(builder.create_or(input.handle, other.handle), input.type) def xor_(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = bitwise_op_type_checking_impl(input, other, builder) return tl.tensor(builder.create_xor(input.handle, other.handle), input.type) def logical_and(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: if not input.type.is_int1(): input = bitcast(input, tl.dtype("int1"), builder) if not other.type.is_int1(): other = bitcast(other, tl.dtype("int1"), builder) return and_(input, other, builder) def logical_or(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: if not input.type.is_int1(): input = bitcast(input, tl.dtype("int1"), builder) if not other.type.is_int1(): other = bitcast(other, tl.dtype("int1"), builder) return or_(input, other, builder) def not_(input: tl.tensor, builder: ir.builder): if not input.type.is_int1(): input = bitcast(input, tl.dtype("int1"), builder) return invert(input, builder) def lshr(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = bitwise_op_type_checking_impl(input, other, builder) return tl.tensor(builder.create_lshr(input.handle, other.handle), input.type) def ashr(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = bitwise_op_type_checking_impl(input, other, builder) return tl.tensor(builder.create_ashr(input.handle, other.handle), input.type) def shl(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = bitwise_op_type_checking_impl(input, other, builder) return tl.tensor(builder.create_shl(input.handle, other.handle), input.type) # ===----------------------------------------------------------------------===// # Unary Operators # ===----------------------------------------------------------------------===// def plus(input: tl.tensor) -> tl.tensor: return input def minus(input: tl.tensor, builder: ir.builder) -> tl.tensor: input_sca_ty = input.type.scalar if input_sca_ty.is_ptr(): raise ValueError("wrong type argument to unary minus (" + input_sca_ty.__repr__() + ")") _0 = tl.tensor(builder.get_null_value(input_sca_ty.to_ir(builder)), input_sca_ty) return sub(_0, input, builder) def invert(input: 
tl.tensor, builder: tl.tensor) -> tl.tensor: input_sca_ty = input.type.scalar if input_sca_ty.is_ptr() or input_sca_ty.is_floating(): raise ValueError("wrong type argument to unary invert (" + input_sca_ty.__repr__() + ")") _1 = tl.tensor(builder.get_all_ones_value(input_sca_ty.to_ir(builder)), input_sca_ty) return xor_(input, _1, builder) # ===----------------------------------------------------------------------===// # Comparison Operators # ===----------------------------------------------------------------------===// def _bool_like(v: tl.tensor) -> tl.block_type: if not v.type.is_block(): return tl.int1 shape = v.type.shape return tl.block_type(tl.int1, shape) def greater_than(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder) scalar_ty = input.type.scalar # float > float if scalar_ty.is_floating(): return tl.tensor(builder.create_fcmpOGT(input.handle, other.handle), _bool_like(input)) # > int elif scalar_ty.is_int(): if scalar_ty.is_int_signed(): return tl.tensor(builder.create_icmpSGT(input.handle, other.handle), _bool_like(input)) else: return tl.tensor(builder.create_icmpUGT(input.handle, other.handle), _bool_like(input)) assert False def greater_equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder) scalar_ty = input.type.scalar # float >= float if scalar_ty.is_floating(): return tl.tensor(builder.create_fcmpOGE(input.handle, other.handle), _bool_like(input)) # >= int elif scalar_ty.is_int(): if scalar_ty.is_int_signed(): return tl.tensor(builder.create_icmpSGE(input.handle, other.handle), _bool_like(input)) else: return tl.tensor(builder.create_icmpUGE(input.handle, other.handle), _bool_like(input)) assert False def less_than(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder) scalar_ty = input.type.scalar # float < float if scalar_ty.is_floating(): return tl.tensor(builder.create_fcmpOLT(input.handle, other.handle), _bool_like(input)) # < int elif scalar_ty.is_int(): if scalar_ty.is_int_signed(): return tl.tensor(builder.create_icmpSLT(input.handle, other.handle), _bool_like(input)) else: return tl.tensor(builder.create_icmpULT(input.handle, other.handle), _bool_like(input)) assert False def less_equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder) scalar_ty = input.type.scalar # float < float if scalar_ty.is_floating(): return tl.tensor(builder.create_fcmpOLE(input.handle, other.handle), _bool_like(input)) # < int elif scalar_ty.is_int(): if scalar_ty.is_int_signed(): return tl.tensor(builder.create_icmpSLE(input.handle, other.handle), _bool_like(input)) else: return tl.tensor(builder.create_icmpULE(input.handle, other.handle), _bool_like(input)) assert False def equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, other, builder) scalar_ty = input.type.scalar # float == float if scalar_ty.is_floating(): return tl.tensor(builder.create_fcmpOEQ(input.handle, other.handle), _bool_like(input)) # == int elif scalar_ty.is_int(): return tl.tensor(builder.create_icmpEQ(input.handle, other.handle), _bool_like(input)) assert False def not_equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: input, other = binary_op_type_checking_impl(input, 
other, builder) scalar_ty = input.type.scalar # float == float if scalar_ty.is_floating(): return tl.tensor(builder.create_fcmpUNE(input.handle, other.handle), _bool_like(input)) # == int elif scalar_ty.is_int(): return tl.tensor(builder.create_icmpNE(input.handle, other.handle), _bool_like(input)) assert False # ===----------------------------------------------------------------------===// # Block Creation # ===----------------------------------------------------------------------===// def arange(start: int, end: int, builder: ir.builder) -> tl.tensor: if not isinstance(start, int) or not isinstance(end, int): raise ValueError("arange's arguments must be of type tl.constexpr") shape = [end - start] ret_ty = tl.block_type(tl.int32, shape) return tl.tensor(builder.create_make_range(start, end), ret_ty) def full(shape: List[int], value, dtype: tl.dtype, builder: ir.builder) -> tl.tensor: if value == 0: _value = builder.get_null_value(dtype.to_ir(builder)) else: get_value_fn = getattr(builder, f"get_{dtype.name}") _value = get_value_fn(value) ret_ty = tl.block_type(dtype, shape) return tl.tensor(builder.create_splat(_value, shape), ret_ty) # ===----------------------------------------------------------------------===// # Shape Manipulation # ===----------------------------------------------------------------------===// def view(input: tl.tensor, dst_shape: List[int], builder: ir.builder) -> tl.tensor: # TODO: disable when TritonToTritonGPU handles views properly # assert len(input.shape) == len(dst_shape) numel = 1 for s in dst_shape: numel *= s if input.type.numel != numel: raise ValueError("cannot view block of different shape") ret_ty = tl.block_type(input.type.scalar, dst_shape) return tl.tensor(builder.create_view(input.handle, dst_shape), ret_ty) def expand_dims(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: dst_shape = list(input.type.shape) dst_shape.insert(axis, 1) ret_ty = tl.block_type(input.type.scalar, dst_shape) return tl.tensor(builder.create_expand_dims(input.handle, axis), ret_ty) def cat(lhs: tl.tensor, rhs: tl.tensor, can_reorder: bool, builder: ir.builder) -> tl.tensor: assert can_reorder, "current implementation of `cat` always may reorder elements" assert len(lhs.shape) == 1 ret_type = tl.block_type(lhs.type.scalar, [lhs.shape[0] + rhs.shape[0]]) return tl.tensor(builder.create_cat(lhs.handle, rhs.handle), ret_type) def trans(input: tl.tensor, builder: ir.builder) -> tl.tensor: if len(input.shape) != 2: raise ValueError("Only 2D tensors can be transposed") ret_type = tl.block_type(input.type.scalar, [input.shape[1], input.shape[0]]) return tl.tensor(builder.create_trans(input.handle), ret_type) def broadcast_impl_shape(input: tl.tensor, shape: List[int], builder: ir.builder) -> tl.tensor: if not input.type.is_block(): ret_ty = tl.block_type(input.type, shape) return tl.tensor(builder.create_splat(input.handle, shape), ret_ty) src_shape = input.type.get_block_shapes() if len(src_shape) != len(shape): raise ValueError(f"Cannot broadcast, rank mismatch: {src_shape}, {shape}") if shape == src_shape: return input for i, item in enumerate(src_shape): if shape[i] != item and item != 1: raise ValueError(f"Cannot broadcast, the expanded size of the tensor ({shape[i]})" f" must match the existing size ({item}) at non-singleton dimension" f" {i}: {src_shape}, {shape}") ret_ty = tl.block_type(input.type.scalar, shape) return tl.tensor(builder.create_broadcast(input.handle, shape), ret_ty) def broadcast_impl_value(lhs: tl.tensor, rhs: tl.tensor, builder: ir.builder) 
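# Sketch of how these shape helpers surface in kernels: `offs_m[:, None]` lowers to
# expand_dims and the mixed-rank arithmetic goes through broadcast_impl_value, turning
# two 1D ranges into a (BLOCK_M, BLOCK_N) block of pointers. Names are illustrative.
import triton
import triton.language as tl

@triton.jit
def tile_copy_kernel(src_ptr, dst_ptr, M, N, stride_m, stride_n,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    offs_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)     # 1D int32 block (arange)
    offs_n = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
    ptrs = offs_m[:, None] * stride_m + offs_n[None, :] * stride_n  # broadcast to 2D
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tile = tl.load(src_ptr + ptrs, mask=mask)
    tl.store(dst_ptr + ptrs, tile, mask=mask)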
-> tl.tensor: lhs_ty = lhs.type rhs_ty = rhs.type # make_shape_compatible(block, scalar) if lhs_ty.is_block() and not rhs_ty.is_block(): rhs_ty = tl.block_type(rhs_ty.scalar, lhs_ty.shape) rhs = tl.tensor(builder.create_splat(rhs.handle, lhs_ty.get_block_shapes()), rhs_ty) # make_shape_compatible(scalar, block) elif not lhs_ty.is_block() and rhs_ty.is_block(): lhs_ty = tl.block_type(lhs_ty.scalar, rhs_ty.shape) lhs = tl.tensor(builder.create_splat(lhs.handle, rhs_ty.get_block_shapes()), lhs_ty) # make_shape_compatible(block, block) elif lhs_ty.is_block() and rhs_ty.is_block(): lhs_shape = lhs_ty.get_block_shapes() rhs_shape = rhs_ty.get_block_shapes() if len(lhs_shape) < len(rhs_shape): # Add new axes to lhs for dim in range(len(lhs_shape), len(rhs_shape)): lhs = tl.tensor(builder.create_expand_dims(lhs.handle, dim), tl.block_type(lhs_ty.scalar, lhs_shape + [1])) lhs_ty = lhs.type lhs_shape = lhs_ty.get_block_shapes() elif len(rhs_shape) < len(lhs_shape): # Add new axes to rhs for dim in range(len(rhs_shape), len(lhs_shape)): rhs = tl.tensor(builder.create_expand_dims(rhs.handle, dim), tl.block_type(rhs_ty.scalar, rhs_shape + [1])) rhs_ty = rhs.type rhs_shape = rhs_ty.get_block_shapes() assert len(rhs_shape) == len(lhs_shape) ret_shape = [] for i, left in enumerate(lhs_shape): right = rhs_shape[i] if left == 1: ret_shape.append(right) elif right == 1: ret_shape.append(left) elif left == right: ret_shape.append(left) else: raise ValueError("Cannot make_shape_compatible: incompatible dimensions " "at index " + str(i) + ": " + str(left) + " and " + str(right)) if lhs_shape != ret_shape: ret_ty = tl.block_type(lhs_ty.scalar, ret_shape) lhs = tl.tensor(builder.create_broadcast(lhs.handle, ret_shape), ret_ty) if rhs_shape != ret_shape: ret_ty = tl.block_type(rhs_ty.scalar, ret_shape) rhs = tl.tensor(builder.create_broadcast(rhs.handle, ret_shape), ret_ty) # (scalar, scalar) => returns original blocks return lhs, rhs ####### # cast ####### def bitcast(input: tl.tensor, dst_ty: tl.dtype, builder: ir.builder) -> tl.tensor: src_ty = input.type if src_ty.is_block(): dst_ty = tl.block_type(dst_ty, input.type.get_block_shapes()) if src_ty == dst_ty: return input src_sca_ty = src_ty.scalar dst_sca_ty = dst_ty.scalar if src_sca_ty.is_ptr() or dst_sca_ty.is_ptr(): return cast(input, dst_ty, builder) # Bitcast src_bits = src_sca_ty.primitive_bitwidth dst_bits = dst_sca_ty.primitive_bitwidth if src_bits != dst_bits: raise ValueError("Cannot bitcast data-type of size " + str(src_bits) + "to " "data-type of size " + str(dst_bits)) return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), dst_ty) def cast(input: tl.tensor, dst_ty: tl.dtype, builder: ir.builder) -> tl.tensor: src_ty = input.type if src_ty.is_block(): dst_ty = tl.block_type(dst_ty, input.type.get_block_shapes()) if src_ty == dst_ty: return input src_sca_ty = src_ty.scalar dst_sca_ty = dst_ty.scalar # Casting with customized floating types involved: fp8 <=> bf16, fp16, fp32, fp64 if (src_sca_ty.is_customized_floating() and dst_sca_ty.is_floating()) or \ (src_sca_ty.is_floating() and dst_sca_ty.is_customized_floating()): return tl.tensor(builder.create_fp_to_fp(input.handle, dst_ty.to_ir(builder)), dst_ty) # bf16 <=> (not fp32) if (src_sca_ty.is_fp16() and not dst_sca_ty.is_fp32()) or \ (src_sca_ty.is_bf16() and not dst_sca_ty.is_fp32()): return cast(cast(input, tl.float32, builder), dst_sca_ty, builder) # Standard floating types' casting: truncation # fp64 => fp32, fp16, bf16 # fp32 => fp16, bf16 truncate_fp = 
src_sca_ty.is_floating() and \ dst_sca_ty.is_floating() and \ src_sca_ty.primitive_bitwidth > dst_sca_ty.primitive_bitwidth if truncate_fp: return tl.tensor(builder.create_fp_trunc(input.handle, dst_ty.to_ir(builder)), dst_ty) # Standard floating types' casting: extension # fp32 => fp64 # fp16 => fp32, fp64 # bf16 => fp32, fp64 ext_fp = src_sca_ty.is_floating() and \ dst_sca_ty.is_floating() and \ src_sca_ty.primitive_bitwidth < dst_sca_ty.primitive_bitwidth if ext_fp: return tl.tensor(builder.create_fp_ext(input.handle, dst_ty.to_ir(builder)), dst_ty) # Casting between integer types if src_sca_ty.is_int() and dst_sca_ty.is_int() and \ (src_sca_ty.int_bitwidth != dst_sca_ty.int_bitwidth or src_sca_ty.int_signedness != dst_sca_ty.int_signedness): sign_extend = src_sca_ty.is_int_signed() and not src_sca_ty.is_bool() if dst_sca_ty.is_bool(): ty = input.dtype.to_ir(builder) _0 = tl.tensor(builder.get_null_value(ty), input.dtype) return not_equal(input, _0, builder) else: return tl.tensor(builder.create_int_cast(input.handle, dst_ty.to_ir(builder), sign_extend), dst_ty) # Casting standard floating types to integer types if src_sca_ty.is_standard_floating() and dst_sca_ty.is_int(): if dst_sca_ty.is_bool(): ty = input.dtype.to_ir(builder) _0 = tl.tensor(builder.get_null_value(ty), input.dtype) return not_equal(input, _0, builder) elif dst_sca_ty.is_int_signed(): return tl.tensor(builder.create_fp_to_si(input.handle, dst_ty.to_ir(builder)), dst_ty) else: return tl.tensor(builder.create_fp_to_ui(input.handle, dst_ty.to_ir(builder)), dst_ty) # Casting integer types to standard floating types if src_sca_ty.is_int() and dst_sca_ty.is_standard_floating(): if src_sca_ty.is_bool() or not src_sca_ty.is_int_signed(): return tl.tensor(builder.create_ui_to_fp(input.handle, dst_ty.to_ir(builder)), dst_ty) else: return tl.tensor(builder.create_si_to_fp(input.handle, dst_ty.to_ir(builder)), dst_ty) # Casting pointer types to integer types if src_sca_ty.is_ptr() and dst_sca_ty.is_int(): bitwidth = dst_sca_ty.int_bitwidth if bitwidth == 64: return tl.tensor(builder.create_ptr_to_int(input.handle, dst_ty.to_ir(builder)), dst_ty) if bitwidth == 1: return not_equal(cast(input, tl.int64, builder), tl.tensor(builder.get_int64(0), tl.int64), builder) # Casting integer types to pointer types if src_sca_ty.is_int() and dst_sca_ty.is_ptr(): return tl.tensor(builder.create_int_to_ptr(input.handle, dst_ty.to_ir(builder)), dst_ty) # Casting pointer types to pointer types if src_sca_ty.is_ptr() and dst_sca_ty.is_ptr(): return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), dst_ty) assert False, f'cannot cast {input} to {dst_ty}' # ===----------------------------------------------------------------------===// # Memory Operators # ===----------------------------------------------------------------------===// def load(ptr: tl.tensor, mask: Optional[tl.tensor], other: Optional[tl.tensor], cache_modifier: str, eviction_policy: str, is_volatile: bool, builder: ir.builder) -> tl.tensor: if not ptr.type.scalar.is_ptr(): raise ValueError("Pointer argument of load instruction is " + ptr.type.__repr__()) if ptr.type.is_block(): if mask: mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) if other: other = broadcast_impl_shape(other, ptr.type.get_block_shapes(), builder) ptr_ty = ptr.type.scalar elt_ty = ptr_ty.element_ty # treat bool* as tl.int8* if elt_ty == tl.int1: elt_ty = tl.int8 ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) ptr = cast(ptr, ptr_ty, builder) if other: other = 
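# How the cast rules above look at the language level (a sketch; names are illustrative):
# `x.to(dtype)` takes the value-preserving paths (fp_trunc / fp_ext / int_cast / fp_to_si ...),
# while `x.to(dtype, bitcast=True)` reinterprets the bits and requires equal bit width.
import triton
import triton.language as tl

@triton.jit
def cast_demo_kernel(x_ptr, out_f16_ptr, out_bits_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)                       # float32 input assumed
    tl.store(out_f16_ptr + offs, x.to(tl.float16), mask=mask)  # fp_trunc path
    bits = x.to(tl.int32, bitcast=True)                        # same 32 bits, reinterpreted
    tl.store(out_bits_ptr + offs, bits, mask=mask)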
cast(other, elt_ty, builder) # cache modifier cache = ir.CACHE_MODIFIER.NONE # default if cache_modifier: if cache_modifier == ".ca": cache = ir.CACHE_MODIFIER.CA elif cache_modifier == ".cg": cache = ir.CACHE_MODIFIER.CG else: raise ValueError(f"Cache modifier {cache_modifier} not supported") # eviction policy eviction = ir.EVICTION_POLICY.NORMAL # default if eviction_policy: if eviction_policy == "evict_last": eviction = ir.EVICTION_POLICY.EVICT_LAST elif eviction_policy == "evict_first": eviction = ir.EVICTION_POLICY.EVICT_FIRST else: raise ValueError(f"Eviction policy {eviction_policy} not supported") if ptr.type.is_block(): shape = ptr.type.get_block_shapes() dst_ty = tl.block_type(elt_ty, shape) else: dst_ty = elt_ty if not mask: if other: raise ValueError("`other` cannot be provided without `mask`") return tl.tensor(builder.create_load(ptr.handle, cache, eviction, is_volatile), dst_ty) else: return tl.tensor(builder.create_masked_load(ptr.handle, mask.handle, other.handle if other else None, cache, eviction, is_volatile), dst_ty) def store(ptr: tl.tensor, val: tl.tensor, mask: Optional[tl.tensor], builder: ir.builder) -> tl.tensor: if not ptr.type.scalar.is_ptr(): raise ValueError("Pointer argument of store instruction is " + ptr.type.__repr__()) if ptr.type.is_block(): val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) if mask and ptr.type.is_block(): mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) ptr_ty = ptr.type.scalar elt_ty = ptr_ty.element_ty # treat bool* as tl.int8* if elt_ty == tl.int1: elt_ty = tl.int8 ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) ptr = cast(ptr, ptr_ty, builder) # cast to target data-type val = cast(val, elt_ty, builder) if not mask: return tl.tensor(builder.create_store(ptr.handle, val.handle), tl.void) if not mask.type.scalar.is_bool(): raise ValueError("Mask must have boolean scalar type") return tl.tensor(builder.create_masked_store(ptr.handle, val.handle, mask.handle), tl.void) ######### # atomic ######### def atomic_cas(ptr: tl.tensor, cmp: tl.tensor, val: tl.tensor, builder: ir.builder) -> tl.tensor: element_ty = ptr.type.scalar.element_ty if element_ty.primitive_bitwidth not in [16, 32, 64]: raise ValueError("atomic_cas only supports elements with width {16, 32, 64}") return tl.tensor(builder.create_atomic_cas(ptr.handle, cmp.handle, val.handle), val.type) def atom_red_typechecking_impl(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, op: str, builder: ir.builder) -> Tuple[tl.tensor, tl.tensor, tl.tensor]: if not ptr.type.scalar.is_ptr(): raise ValueError("Pointer argument of store instruction is " + ptr.type.__repr__()) element_ty = ptr.type.scalar.element_ty if element_ty is tl.float16 and op != 'add': raise ValueError("atomic_" + op + " does not support fp16") if element_ty in [tl.int1, tl.int8, tl.int16, tl.bfloat16]: raise ValueError("atomic_" + op + " does not support " + str(element_ty)) if ptr.type.is_block(): if mask: mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) if val: val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) val = cast(val, ptr.type.scalar.element_ty, builder) if not mask: mask_ir = builder.get_int1(True) mask_ty = tl.int1 if ptr.type.is_block(): mask_ir = builder.create_splat(mask_ir, ptr.type.get_block_shapes()) mask_ty = tl.block_type(tl.int1, ptr.type.get_block_shapes()) mask = tl.tensor(mask_ir, mask_ty) return ptr, val, mask def atomic_max(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, builder: ir.builder) -> tl.tensor: 
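# Sketch of the mask/other contract implemented by load() above: `other` supplies the value
# returned for lanes where `mask` is False and is only legal together with `mask`
# (load() raises otherwise), so out-of-bounds lanes read as 0.0 instead of faulting.
import triton
import triton.language as tl

@triton.jit
def padded_sum_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    x = tl.load(x_ptr + offs, mask=offs < n, other=0.0)   # masked load with a default value
    tl.store(out_ptr, tl.sum(x, axis=0))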
ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'max', builder) sca_ty = val.type.scalar # direct call to atomic_max for integers if sca_ty.is_int(): if sca_ty.is_int_signed(): return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, ptr.handle, val.handle, mask.handle), val.type) else: return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, ptr.handle, val.handle, mask.handle), val.type) # for float # return atomic_smax(i_ptr, i_val) if val >= 0 # return atomic_umin(i_ptr, i_val) if val < 0 i_val = bitcast(val, tl.int32, builder) i_ptr = bitcast(ptr, tl.pointer_type(tl.int32, 1), builder) pos = greater_equal(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) neg = less_than(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) pos_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, i_ptr.handle, i_val.handle, and_(mask, pos, builder).handle), i_val.type) neg_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, i_ptr.handle, i_val.handle, and_(mask, neg, builder).handle), i_val.type) return where(pos, pos_ret, neg_ret, builder) def atomic_min(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, builder: ir.builder) -> tl.tensor: ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'min', builder) sca_ty = val.type.scalar # direct call to atomic_min for integers if sca_ty.is_int(): if sca_ty.is_int_signed(): return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, ptr.handle, val.handle, mask.handle), val.type) else: return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, ptr.handle, val.handle, mask.handle), val.type) # for float # return atomic_smin(i_ptr, i_val) if val >= 0 # return atomic_umax(i_ptr, i_val) if val < 0 i_val = bitcast(val, tl.int32, builder) i_ptr = bitcast(ptr, tl.pointer_type(tl.int32, 1), builder) pos = greater_equal(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) neg = less_than(val, tl.tensor(builder.get_fp32(0), sca_ty), builder) pos_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, i_ptr.handle, i_val.handle, and_(mask, pos, builder).handle), i_val.type) neg_ret = tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, i_ptr.handle, i_val.handle, and_(mask, neg, builder).handle), i_val.type) return where(pos, pos_ret, neg_ret, builder) def atomic_add(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, builder: ir.builder) -> tl.tensor: ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'add', builder) sca_ty = val.type.scalar op = ir.ATOMIC_OP.FADD if sca_ty.is_floating() else ir.ATOMIC_OP.ADD return tl.tensor(builder.create_atomic_rmw(op, ptr.handle, val.handle, mask.handle), val.type) def atomic_and(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, builder: ir.builder) -> tl.tensor: ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'and', builder) return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.AND, ptr.handle, val.handle, mask.handle), val.type) def atomic_or(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, builder: ir.builder) -> tl.tensor: ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'or', builder) return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.OR, ptr.handle, val.handle, mask.handle), val.type) def atomic_xor(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, builder: ir.builder) -> tl.tensor: ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xor', builder) return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XOR, ptr.handle, val.handle, mask.handle), val.type) def atomic_xchg(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, 
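# Why the bit trick used by the float paths above works (a standalone host-side sketch):
# for IEEE-754 floats >= 0 the raw bits ordered as *signed* ints match the float ordering,
# so a signed atomic max on the bits is a max on the values; for floats < 0 the bit
# patterns are ordered the opposite way, which the *unsigned* min/max path compensates for.
import torch

pos = torch.tensor([2.0, 0.0, 3.25, 1.5])
assert torch.equal(torch.argsort(pos.view(torch.int32)), torch.argsort(pos))

neg = torch.tensor([-2.0, -0.5, -3.25, -1.5])
neg_bits = neg.view(torch.int32).to(torch.int64) & 0xFFFFFFFF   # bits read as unsigned
assert torch.equal(torch.argsort(neg_bits), torch.argsort(neg, descending=True))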
builder: ir.builder) -> tl.tensor: ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xchg', builder) return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XCHG, ptr.handle, val.handle, mask.handle), val.type) # ===----------------------------------------------------------------------===// # Linear Algebra # ===----------------------------------------------------------------------===// def dot(lhs: tl.tensor, rhs: tl.tensor, allow_tf32: bool, builder: ir.builder) -> tl.tensor: assert lhs.type.is_block() and rhs.type.is_block() assert lhs.dtype == rhs.dtype, "lhs and rhs must have the same dtype!" assert len(lhs.shape) == 2 and len(rhs.shape) == 2 assert lhs.shape[1].value == rhs.shape[0].value assert lhs.shape[0].value >= 16 and lhs.shape[1].value >= 16 \ and rhs.shape[1].value >= 16,\ "small blocks not supported!" if lhs.type.scalar.is_int(): _0 = builder.get_int32(0) ret_scalar_ty = tl.int32 else: _0 = builder.get_fp32(0) ret_scalar_ty = tl.float32 M = lhs.type.shape[0] N = rhs.type.shape[1] _0 = builder.create_splat(_0, [M, N]) ret_ty = tl.block_type(ret_scalar_ty, [M, N]) return tl.tensor(builder.create_dot(lhs.handle, rhs.handle, _0, allow_tf32), ret_ty) # ===----------------------------------------------------------------------===// # Indexing # ===----------------------------------------------------------------------===// def where(condition: tl.tensor, x: tl.tensor, y: tl.tensor, builder: ir.builder) -> tl.tensor: condition = cast(condition, tl.int1, builder) if condition.type.is_block(): condition, x = broadcast_impl_value(condition, x, builder) x, y = broadcast_impl_value(x, y, builder) condition, x = broadcast_impl_value(condition, x, builder) x, y = binary_op_type_checking_impl(x, y, builder, True, True) if not condition.type.is_block(): condition, _ = broadcast_impl_value(condition, x, builder) ret_ty = x.type return tl.tensor(builder.create_select(condition.handle, x.handle, y.handle), ret_ty) # ===----------------------------------------------------------------------===// # Reductions # ===----------------------------------------------------------------------=== def reduce_impl(input: tl.tensor, axis: int, builder: ir.builder, name: str, FLOAT_OP: ir.REDUCE_OP, INT_OP: ir.REDUCE_OP) -> tl.tensor: scalar_ty = input.type.scalar out_scalar_ty = scalar_ty # input is extended to 32-bits if necessary # this increases numerical accuracy and can be done pretty much for free # on GPUs if scalar_ty.is_int() and scalar_ty.int_bitwidth <= 32: input = cast(input, tl.int32, builder) out_scalar_ty = tl.int32 # hardware doesn't support FMAX, FMIN, CMP for bfloat16 if scalar_ty is tl.bfloat16: input = cast(input, tl.float32, builder) out_scalar_ty = tl.float32 # choose the right unsigned operation if scalar_ty.is_int_unsigned(): int_op_to_unit = { ir.REDUCE_OP.MIN: ir.REDUCE_OP.UMIN, ir.REDUCE_OP.MAX: ir.REDUCE_OP.UMAX, ir.REDUCE_OP.ARGMIN: ir.REDUCE_OP.ARGUMIN, ir.REDUCE_OP.ARGMAX: ir.REDUCE_OP.ARGUMAX, } if INT_OP in int_op_to_unit: INT_OP = int_op_to_unit[INT_OP] # If we are doing an argmin or argmax we want to use an int32 output type if FLOAT_OP is ir.REDUCE_OP.ARGFMAX or INT_OP is ir.REDUCE_OP.ARGMAX: out_scalar_ty = tl.int32 elif FLOAT_OP is ir.REDUCE_OP.ARGFMIN or INT_OP is ir.REDUCE_OP.ARGMIN: out_scalar_ty = tl.int32 # get result type shape = input.type.shape ret_shape = [] for i, s in enumerate(shape): if i != axis: ret_shape.append(s) if ret_shape: res_ty = tl.block_type(out_scalar_ty, ret_shape) else: # 0d-tensor -> scalar res_ty = out_scalar_ty if 
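# Sketch of the constraints dot() enforces above: operands are 2D blocks with a matching
# inner dimension, every dimension is at least 16, and the accumulator is float32
# (int32 for integer inputs) regardless of input precision. Names are illustrative.
import triton
import triton.language as tl

@triton.jit
def single_tile_matmul(a_ptr, b_ptr, c_ptr,
                       stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                       BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # one program computes one (BLOCK_M, BLOCK_N) tile of C = A @ B for a single K tile;
    # all three BLOCK_* values must be >= 16 for tl.dot
    offs_m = tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a = tl.load(a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b = tl.load(b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
    c = tl.dot(a, b)                                   # float32 (or int32) accumulator
    tl.store(c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn, c)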
scalar_ty.is_floating(): return tl.tensor(builder.create_reduce(input.handle, FLOAT_OP, axis), res_ty) elif scalar_ty.is_int(): return tl.tensor(builder.create_reduce(input.handle, INT_OP, axis), res_ty) assert False def min(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: return reduce_impl(input, axis, builder, "min", ir.REDUCE_OP.FMIN, ir.REDUCE_OP.MIN) def argmin(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: return reduce_impl(input, axis, builder, "argmin", ir.REDUCE_OP.ARGFMIN, ir.REDUCE_OP.ARGMIN) def max(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: return reduce_impl(input, axis, builder, "max", ir.REDUCE_OP.FMAX, ir.REDUCE_OP.MAX) def argmax(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: return reduce_impl(input, axis, builder, "argmax", ir.REDUCE_OP.ARGFMAX, ir.REDUCE_OP.ARGMAX) def sum(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: return reduce_impl(input, axis, builder, "sum", ir.REDUCE_OP.FADD, ir.REDUCE_OP.ADD) def xor_sum(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: scalar_ty = input.type.scalar if not scalar_ty.is_int(): raise ValueError("xor_sum only supported for integers") return reduce_impl(input, axis, builder, "sum", ir.REDUCE_OP.XOR, ir.REDUCE_OP.XOR) # ===----------------------------------------------------------------------=== # Math # ===----------------------------------------------------------------------=== def umulhi(x: tl.tensor, y: tl.tensor, builder: ir.builder) -> tl.tensor: x, y = binary_op_type_checking_impl(x, y, builder) # FIXME(Keren): not portable, should be fixed from . import libdevice return libdevice.mulhi(x, y, _builder=builder) def floor(x: tl.tensor, builder: ir.builder) -> tl.tensor: # FIXME(Keren): not portable, should be fixed from . 
import libdevice return libdevice.floor(x, _builder=builder) def exp(x: tl.tensor, builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_exp(x.handle), x.type) def log(x: tl.tensor, builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_log(x.handle), x.type) def cos(x: tl.tensor, builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_cos(x.handle), x.type) def sin(x: tl.tensor, builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_sin(x.handle), x.type) def sqrt(x: tl.tensor, builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_sqrt(x.handle), x.type) ## def multiple_of(x: tl.tensor, values: List[int]) -> tl.tensor: if len(x.shape) != len(values): raise ValueError("Shape of input to multiple_of does not match the length of values") x.handle.set_attr("tt.divisibility", ir.make_attr(values, x.handle.get_context())) return x def max_contiguous(x: tl.tensor, values: List[int]) -> tl.tensor: if len(x.shape) != len(values): raise ValueError("Shape of input to max_contiguous does not match the length of values") x.handle.set_attr("tt.contiguity", ir.make_attr(values, x.handle.get_context())) return x def debug_barrier(builder: ir.builder) -> tl.tensor: return tl.tensor(builder.create_barrier(), tl.void) def printf(prefix: str, args: List[tl.tensor], builder: ir.builder) -> tl.tensor: new_args = [] for arg in args: new_args.append(arg.handle) return tl.tensor(builder.create_printf(prefix, new_args), tl.void) triton-2.0.0/python/triton/ops/000077500000000000000000000000001440023377100164725ustar00rootroot00000000000000triton-2.0.0/python/triton/ops/__init__.py000066400000000000000000000004711440023377100206050ustar00rootroot00000000000000# from .conv import _conv, conv from . import blocksparse from .cross_entropy import _cross_entropy, cross_entropy from .flash_attention import attention from .matmul import _matmul, matmul __all__ = [ "blocksparse", "_cross_entropy", "cross_entropy", "_matmul", "matmul", "attention", ] triton-2.0.0/python/triton/ops/blocksparse/000077500000000000000000000000001440023377100210025ustar00rootroot00000000000000triton-2.0.0/python/triton/ops/blocksparse/__init__.py000066400000000000000000000001441440023377100231120ustar00rootroot00000000000000from .matmul import matmul from .softmax import softmax __all__ = [ "matmul", "softmax", ] triton-2.0.0/python/triton/ops/blocksparse/matmul.py000066400000000000000000000363751440023377100226710ustar00rootroot00000000000000import torch import triton import triton.language as tl # ******************************************************** # -------------------------------------------------------- # Sparse = Dense x Dense (SDD) # This operation uses super-blocking to make sure that # it's done efficiently when small blocks can be grouped # together # -------------------------------------------------------- # ******************************************************** @triton.heuristics({ 'EVEN_K': lambda nargs: nargs['K'] % nargs['TILE_K'] == 0, }) @triton.jit def _sdd_kernel( A, B, C, stride_za, stride_ha, stride_ma, stride_ak, stride_zb, stride_hb, stride_bk, stride_nb, stride_zc, stride_hc, stride_mc, stride_nc, K, grid_offset, lut, TILE_M: tl.constexpr, TILE_N: tl.constexpr, TILE_K: tl.constexpr, BLOCK: tl.constexpr, EVEN_K: tl.constexpr ): # ------------ # # - Prologue - # # ------------ # block_id = tl.program_id(1) + grid_offset lut += block_id * 3 # offsets off_z = tl.program_id(2) # batch off_h = tl.load(lut + 0) # head # initialize pointers to A start_am = 
tl.load(lut + 1) offs_am = start_am * BLOCK + (tl.arange(0, TILE_M) % BLOCK) offs_ak = tl.arange(0, TILE_K) a_ptrs = A \ + off_z * stride_za \ + off_h * stride_ha \ + offs_am[:, None] * stride_ma \ + offs_ak[None, :] * stride_ak # initialize pointers to B start_bn = tl.load(lut + 2) offs_bn = start_bn * BLOCK + (tl.arange(0, TILE_N) % BLOCK) offs_bk = tl.arange(0, TILE_K) b_ptrs = B \ + off_z * stride_zb \ + off_h * stride_hb \ + offs_bn[None, :] * stride_nb \ + offs_bk[:, None] * stride_bk # ---------------- # # Inner Loop # # ---------------- # acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32) for k in range(K, 0, -TILE_K): if EVEN_K: a = tl.load(a_ptrs) b = tl.load(b_ptrs) else: a = tl.load(a_ptrs, mask=offs_ak[None, :] < k, other=0.) b = tl.load(b_ptrs, mask=offs_bk[:, None] < k, other=0.) acc += tl.dot(a, b) a_ptrs += TILE_K * stride_ak b_ptrs += TILE_K * stride_bk c = acc.to(C.dtype.element_ty) # ---------------- # # Epilogue # # ---------------- # offs_cm = tl.arange(0, TILE_M) % BLOCK offs_cn = tl.arange(0, TILE_N) % BLOCK pc = C \ + off_z * stride_zc \ + block_id * stride_hc \ + offs_cm[:, None] * stride_mc \ + offs_cn[None, :] * stride_nc tl.store(pc, c, mask=True) def sdd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, widths, out=None): if a.stride(2) != 1 and a.stride(3) != 1: a = a.contiguous() if b.stride(2) != 1 and b.stride(3) != 1: b = b.contiguous() # (A * B)^T = B^T * A^T if trans_c: a, b = b, a trans_a, trans_b = not trans_b, not trans_a # shape constraints a_dim = -2 if trans_a else -1 b_dim = -1 if trans_b else -2 Ka, Kb = a.shape[a_dim], b.shape[b_dim] if Ka != Kb: raise ValueError(f"Inner dimension mismatch (A: {Ka} vs B: {Kb})") # allocate output if out is None: c = torch.empty((a.shape[0], lut.shape[0], block, block), dtype=a.dtype, device=a.device) else: assert out.shape == (a.shape[0], lut.shape[0], block, block) c = out grid = [1, c.shape[1], c.shape[0]] _sdd_kernel[grid]( a, b, c, a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3), b.stride(0), b.stride(1), b.stride(3 if trans_b else 2), b.stride(2 if trans_b else 3), c.stride(0), c.stride(1), c.stride(2), c.stride(3), Ka, 0, lut, TILE_M=block, TILE_N=block, TILE_K=32, BLOCK=block, num_stages=4, num_warps=4, ) return c def sdd_lut(layout, block, device): lut = layout.nonzero(as_tuple=False).to(device).int() lut = lut.contiguous() return lut, None # ----------------------------- # Dense = Sparse x Dense (DSD) # This operation uses a look-up table that contains pre-computed pointer increments # in order to minimize computations in the inner loop of the matmul kernel. 
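# A small sketch of what sdd_lut() above computes: the layout is an (H, M//block, N//block)
# 0/1 tensor and the LUT is just the (head, block_row, block_col) coordinates of its
# non-zeros, one row per output block written by the SDD kernel. Values are illustrative.
import torch

layout = torch.tensor([[[1, 0, 1],
                        [0, 1, 0]]])                 # 1 head, a 2x3 grid of blocks
lut = layout.nonzero(as_tuple=False).int()           # sdd_lut(...)[0], minus the device move
# lut == [[0, 0, 0], [0, 0, 2], [0, 1, 1]]  -> the SDD product materializes 3 blocks of C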
# ----------------------------- @triton.jit def _dsd_kernel( A, B, C, stride_az, stride_ha, stride_am, stride_ak, stride_zb, stride_hb, stride_bk, stride_bn, stride_zc, stride_hc, stride_cm, stride_cn, DS0, DS1, lut, TILE_M: tl.constexpr, TILE_N: tl.constexpr, TILE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, BLOCK: tl.constexpr ): # ------------ # # - Prologue - # # ------------ # pid_m = tl.program_id(0) pid_n = tl.program_id(1) num_pid_m = tl.num_programs(0) num_pid_n = tl.num_programs(1) pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_SIZE_M) pidz = tl.program_id(2) header = lut + pid_n * 4 offset = tl.load(header + 0) K = tl.load(header + 1) column = tl.load(header + 2) off_h = tl.load(header + 3) pinc = lut + offset # initialize pointers to A (sparse) block_id = tl.load(pinc + 1) block_id = tl.multiple_of(block_id, 8) # compiler hint offs_am = tl.arange(0, TILE_M) offs_ak = tl.arange(0, TILE_K) pa = A + pidz * stride_az \ + block_id * stride_ha \ + offs_am[:, None] * stride_am \ + offs_ak[None, :] * stride_ak # initialize pointers to B (dense) offs_bn = pid_m * TILE_N + tl.arange(0, TILE_N) offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn % DS0, TILE_N), TILE_N) start_bk = tl.load(pinc) start_bk = tl.multiple_of(start_bk, 8) # compiler hint offs_bk = start_bk + tl.arange(0, TILE_K) pb = B + pidz * stride_zb \ + off_h * stride_hb \ + offs_bn[None, :] * stride_bn \ + offs_bk[:, None] * stride_bk # ---------------- # # Inner Loop # # ---------------- # acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32) pinc += 2 inc_a = tl.load(pinc + 1) inc_a = tl.multiple_of(inc_a, 8) inc_b = tl.load(pinc) inc_b = tl.multiple_of(inc_b, 8) for k in range(K, 0, -TILE_K): a = tl.load(pa, mask=True) b = tl.load(pb, mask=offs_bn[None, :] < DS0) acc += tl.dot(a, b) pa += inc_a pb += inc_b * stride_bk pinc += 2 inc_a = tl.load(pinc + 1) inc_a = tl.multiple_of(inc_a, 8) inc_b = tl.load(pinc) inc_b = tl.multiple_of(inc_b, 8) c = acc.to(C.dtype.element_ty) # initialize pointers to C offs_cm = column * TILE_M + tl.arange(0, TILE_M) offs_cn = pid_m * TILE_N + tl.arange(0, TILE_N) pc = C \ + off_h * stride_hc \ + pidz * stride_zc \ + offs_cm[:, None] * stride_cm \ + offs_cn[None, :] * stride_cn tl.store(pc, c, mask=offs_cn[None, :] < DS0) def dsd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=None): if a.stride(2) != 1 and a.stride(3) != 1: a = a.contiguous() if b.stride(2) != 1 and b.stride(3) != 1: b = b.contiguous() # shapes / dtypes AS1 = block * spdims[2 if trans_a else 1] BS0 = b.size(0) BS1 = b.size(1) BS3 = b.size(2 if trans_b else 3) dtype = a.dtype # allocate output CS0 = BS0 CS1 = BS1 CS2 = BS3 if trans_c else AS1 CS3 = AS1 if trans_c else BS3 if out is None: c = torch.empty((CS0, CS1, CS2, CS3), dtype=dtype, device=a.device) else: assert out.shape == (CS0, CS1, CS2, CS3) c = out # meta-parameter heuristics TILE_N = 128 # compute output grid = lambda meta: [triton.cdiv(BS3, meta['TILE_N']), width, BS0] _dsd_kernel[grid]( a, b, c, a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3), b.stride(0), b.stride(1), b.stride(3 if trans_b else 2), b.stride(2 if trans_b else 3), c.stride(0), c.stride(1), c.stride(3 if trans_c else 2), c.stride(2 if trans_c else 3), BS3, AS1, lut, TILE_M=block, TILE_N=TILE_N, TILE_K=min(block, 32), BLOCK=block, num_stages=4, num_warps=4, GROUP_SIZE_M=4, ) # exit() return c def dsd_lut(layout, block, step, trans, device): """ Generates the look-up table for incrementing pointers in the DSD/DDS matmul. 
Example (BLOCK=32, STEP=16) [[1, 0, 0, 1, 0], [0, 1, 1, 0, 1], [1, 0, 1, 0, 0]] Then the offsets for A are [0 , 16, 32, 48] <- row 0 \\----/ \\----/ col=0 col=3 [64, 80, 96, 112, 128, 144] <- row 1 \\----/ \\----/ \\------/ col=1 col=2 col=3 [160, 176, 192, 208] which leads to increments table [0, 16, 16, 16, || 64, 16, 16, 16, 16, 16, || 160, 16, 16, 16] Because B is dense, the offsets are [0, 16, 96, 112] <- row 0 [32, 48, 64, 80] <- row 1 [0, 16, 64, 80] <- row 2 """ sizes = torch.sum(layout, 2 if trans else 1) head_id, col_id = torch.ones_like(sizes).nonzero(as_tuple=True) sizes = sizes.flatten() segments = sizes * step # pointer increments if trans: nnz = layout.nonzero(as_tuple=False) else: nnz = layout.transpose(1, 2).nonzero(as_tuple=False) num_blocks = nnz.size(0) offsets = torch.zeros_like(sizes) offsets[1:] = torch.cumsum(sizes[:-1], dim=0) offsets = torch.min(offsets, (num_blocks - 1) * torch.ones_like(offsets)) # ------------------------------- # dense input pointer increments # ------------------------------- # Note that the inner loop matmul kernel may have a fixed step size (e.g., TILE_K) # that is smaller than the block size, so we need to do a bit of extra work # to handle this case B_idx = nnz[:, 2] * block B_incs = B_idx.clone() B_incs[1:] -= B_idx[:-1] div = block // step B_incs = B_incs.view(-1, 1).repeat(1, div) B_incs[:, 1:] = step B_incs[:, 0] -= (div - 1) * step # first increment for each reduction is actually the offset B_incs[offsets[segments > 0], 0] = B_idx[offsets[segments > 0]] B_incs = B_incs.view(-1) # ------------------------------- # sparse input pointer increments # ------------------------------- # same as above, except that the increments are in the sparse memory layout if trans: A_idx = torch.arange(num_blocks, device=layout.device) else: A_idx = torch.tensor([], dtype=torch.int64, device=layout.device) current_offset = 0 for z in range(layout.size(0)): layoutw = layout[z, :, :].clone().long() msum = layoutw.sum() layoutw[layoutw > 0] = 1 + torch.arange(msum, device=layout.device) A_idx = torch.cat((A_idx, current_offset + layoutw.T[layoutw.T > 0] - 1)) current_offset += msum A_incs = A_idx * block * block A_incs[1:] -= A_idx[:-1] * block * block A_incs = A_incs.view(-1, 1).repeat(1, div) if trans: A_incs[:, 1:] = step A_incs[:, 0] -= (div - 1) * step else: A_incs[:, 1:] = step * block A_incs[:, 0] -= (div - 1) * step * block A_incs[offsets[segments > 0], 0] = A_idx[offsets[segments > 0]] A_incs = A_incs.view(-1) # create header width = col_id.size(0) offsets = offsets * 2 * div + 4 * width segments = segments * div header = torch.stack((offsets, segments, col_id, head_id), dim=1).view(-1).contiguous() # create increments incs = torch.stack((B_incs, A_incs), dim=1).view(-1).contiguous() # pad by a factor 2*MAX_NUM_STAGES # to accommodate pre-fetching inside the kernel pad = torch.zeros(20, device=incs.device, dtype=incs.dtype) incs = torch.cat((incs, pad)) # create lut lut = torch.cat((header, incs)) lut = lut.type(torch.int32).to(device) # create locks return lut, width # ----------------------------- # Dense = Dense x Sparse (DDS) # ----------------------------- # AB = (B^T A^T)^T def dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=None): return dsd_matmul(b, a, not trans_b, not trans_a, not trans_c, spdims, block, lut, width, out=out) ############## # MAIN API # ############## class _matmul(torch.autograd.Function): fn = {'sdd': sdd_matmul, 'dsd': dsd_matmul, 'dds': dds_matmul} @staticmethod def forward( ctx, a, b, 
trans_a, trans_b, trans_c, mode, spdims, block, c_lut, c_width, da_lut, da_width, db_lut, db_width, out ): c = _matmul.fn[mode](a, b, trans_a, trans_b, trans_c, spdims, block, c_lut, c_width, out=out) # save for backward ctx.save_for_backward(a, b) ctx.da_lut = da_lut ctx.da_width = da_width ctx.db_lut = db_lut ctx.db_width = db_width ctx.mode = mode ctx.spdims = spdims ctx.block = block ctx.trans_a = trans_a ctx.trans_b = trans_b ctx.trans_c = trans_c ctx.has_out = out is not None return c @staticmethod def backward(ctx, dc): # saved for backward a, b = ctx.saved_tensors da, db = None, None mode = ctx.mode # gradients w.r.t. a if ctx.needs_input_grad[0]: mode_da = mode[1] + mode[0] + mode[2] da = _matmul.fn[mode_da]( dc, b, ctx.trans_c, not ctx.trans_b, ctx.trans_a, ctx.spdims, ctx.block, ctx.da_lut, ctx.da_width, ) # gradients w.r.t. b if ctx.needs_input_grad[1]: mode_db = mode[2] + mode[1] + mode[0] db = _matmul.fn[mode_db]( a, dc, not ctx.trans_a, ctx.trans_c, ctx.trans_b, ctx.spdims, ctx.block, ctx.db_lut, ctx.db_width, ) dout = dc if ctx.has_out else None return da, db, None, None, None,\ None, None, None, None,\ None, None, None, None, None, dout class matmul: def __init__(self, layout, block, mode, device, trans_a=False, trans_b=False, trans_c=False): if mode not in ['sdd', 'dsd', 'dds']: raise NotImplementedError('Supported modes are: sdd, dsd, dds') self.block = block self.mode = mode self.trans_a = trans_a self.trans_b = trans_b self.trans_c = trans_c self.layout = layout self.spdims = layout.shape step = min(block, 32) if self.mode == 'sdd': self.c_lut, self.c_width = sdd_lut(layout, block, device) self.da_lut, self.da_width = dsd_lut(layout, block, step, True, device) self.db_lut, self.db_width = dsd_lut(layout, block, step, False, device) if self.mode == 'dsd': self.c_lut, self.c_width = dsd_lut(layout, block, step, not self.trans_a, device) self.da_lut, self.da_width = sdd_lut(layout, block, device) self.db_lut, self.db_width = dsd_lut(layout, block, step, self.trans_a, device) if self.mode == 'dds': self.c_lut, self.c_width = dsd_lut(layout, block, step, self.trans_b, device) self.da_lut, self.da_width = dsd_lut(layout, block, step, not self.trans_b, device) self.db_lut, self.db_width = sdd_lut(layout, block, device) def __call__(self, a, b, out=None): c = _matmul.apply( a, b, self.trans_a, self.trans_b, self.trans_c, self.mode, self.spdims, self.block, self.c_lut, self.c_width, self.da_lut, self.da_width, self.db_lut, self.db_width, out ) return c triton-2.0.0/python/triton/ops/blocksparse/softmax.py000066400000000000000000000173251440023377100230450ustar00rootroot00000000000000import torch import triton import triton.language as tl def num_warps(n): if n <= 128: return 1 if n <= 256: return 2 if n <= 512: return 4 if n <= 4096: return 8 return 16 @triton.jit def _blocksparse_softmax_fwd( Out, A, stride_xz, LUT, R, extent, stride_zr, stride_hr, # relative attention scale, is_causal, ROW_SIZE: tl.constexpr, BLOCK_SIZE: tl.constexpr, IS_DENSE: tl.constexpr, ): h = tl.program_id(0) m = tl.program_id(1) z = tl.program_id(2) # create index ranges hm = h * tl.num_programs(1) + m lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE # extract information from LUT header = LUT + (hm // BLOCK_SIZE) * 2 size = tl.load(header + 0) offset = tl.load(header + 1) # pointer offset off_a = z * stride_xz off_a += (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE # block indx off_a += (m % BLOCK_SIZE) * BLOCK_SIZE # row indx # do not need to read column 
indices in the dense case if IS_DENSE: ns = tl.arange(0, ROW_SIZE) else: off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE start_n = tl.load(LUT + off_lut + block_n, mask=block_n < size, other=0) ns = start_n * BLOCK_SIZE + lane_n # load X mask = block_n < size a = tl.load(A + off_a + lane_n, mask=mask, other=-float("inf")) a = a.to(tl.float32) # compute out = a out *= scale # apply relative attention if R is not None: R += z * stride_zr R += h * stride_hr off_lo = (extent - m - 1) + ns mask_lo = (off_lo >= 0) & (off_lo < extent) rel_logits = tl.load(R + m * extent + off_lo, mask=mask_lo, other=0.0) out += rel_logits out = out.to(tl.float32) # apply causal mask out = tl.where((ns > m) & is_causal, -float("inf"), out) # computation out = tl.softmax(out) # write-back tl.store(Out + off_a + lane_n, out, mask=mask) @triton.jit def _blocksparse_softmax_bwd( DA, stride_zdx, DOut, stride_zdout, Out, stride_zout, scale, LUT, DR, extent, stride_zr, stride_hr, stride_er, is_causal, ROW_SIZE: tl.constexpr, BLOCK_SIZE: tl.constexpr, IS_DENSE: tl.constexpr, ): h = tl.program_id(0) m = tl.program_id(1) z = tl.program_id(2) # create index ranges hm = h * tl.num_programs(1) + m lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE # extract information from LUT header = LUT + (hm // BLOCK_SIZE) * 2 size = tl.load(header + 0) offset = tl.load(header + 1) # row-col offset off_mn = (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE off_mn += (m % BLOCK_SIZE) * BLOCK_SIZE mask = block_n < size # pointers As = Out + z * stride_zout + off_mn DOuts = DOut + z * stride_zdout + off_mn # do not need to read column indices in the dense case if IS_DENSE: ns = tl.arange(0, ROW_SIZE) else: off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE start_n = tl.load(LUT + off_lut + block_n, mask=mask, other=0) ns = start_n * BLOCK_SIZE + lane_n # load data a = tl.load(As + lane_n, mask=mask, other=0.0) a = a.to(tl.float32) dout = tl.load(DOuts + lane_n, mask=mask, other=0.0) dout = dout.to(tl.float32) # compute a = tl.where((ns > m) & is_causal & (a == a), 0., a) da = a * (dout - tl.sum(a * dout, 0)) # apply relative attention if DR is not None: DR += z * stride_zr DR += h * stride_hr off_lo = (extent - m - 1) + ns mask_lo = (off_lo >= 0) & (off_lo < extent) & mask tl.store(DR + m * extent + off_lo, da, mask=mask_lo) da = da * scale # convert da # write-back DAs = DA + z * stride_zdx + off_mn tl.store(DAs + lane_n, da, mask=mask) class _softmax(torch.autograd.Function): @staticmethod def make_lut(layout, block, device): _empty = torch.tensor([], dtype=torch.int64, device=layout.device) sizes = _empty.clone() # sizes along rows for h in range(layout.shape[0]): sizes = torch.cat((sizes, layout[h, :, :].sum(-1))) total_sizes = sizes * block # offsets in block format offsets = torch.zeros_like(sizes) offsets[1:] = torch.cumsum(sizes[:-1], dim=0) # block indices columns = layout.nonzero(as_tuple=False)[:, 2] header = torch.stack((sizes, offsets), dim=1).view(-1) lut = torch.cat((header, columns)).type(torch.int32).to(device) return lut, int(total_sizes.max()) @staticmethod def forward( ctx, a, scale, rel_logits, is_causal, spdims, block, lut, maxlut, is_dense ): if scale is not None and isinstance(scale, torch.Tensor): assert scale.device.type == "cpu" scale = scale.item() M = a.shape[0] grid = [spdims[0], spdims[1] * block, M] rel_shape = (1, 1, 1, 1) if rel_logits is None else rel_logits.shape rel_strides = (1, 1, 1, 1) if rel_logits is None else 
rel_logits.stride() # enqueue kernel out = torch.empty_like(a) _blocksparse_softmax_fwd[grid]( out, a, a.stride(0), lut, rel_logits, rel_shape[-1], rel_strides[0], rel_strides[1], # relative attn scale, is_causal, BLOCK_SIZE=block, ROW_SIZE=triton.next_power_of_2(maxlut), IS_DENSE=is_dense, num_warps=num_warps(maxlut) ) # save to context # ctx.mark_dirty(x) ctx.save_for_backward(out, lut) ctx.spdims = spdims ctx.block = block ctx.maxlut = maxlut ctx.scale = scale ctx.rel_shape = rel_shape ctx.rel_strides = rel_strides ctx.rel_dtype = a.dtype ctx.is_dense = is_dense ctx.is_causal = is_causal return out @staticmethod def backward(ctx, dout): # retrieve from context out, lut = ctx.saved_tensors # relative logits gradients dr = None if ctx.needs_input_grad[3]: dr = torch.zeros(ctx.rel_shape, dtype=ctx.rel_dtype, device=out.device) # run kernel M = out.shape[0] grid = (ctx.spdims[0], ctx.spdims[1] * ctx.block, M) da = torch.empty_like(dout) _blocksparse_softmax_bwd[grid]( da, da.stride(0), dout, dout.stride(0), out, out.stride(0), ctx.scale, lut, dr, ctx.rel_shape[-1], ctx.rel_strides[0], ctx.rel_strides[1], ctx.rel_strides[2], ctx.is_causal, BLOCK_SIZE=ctx.block, ROW_SIZE=triton.next_power_of_2(ctx.maxlut), IS_DENSE=ctx.is_dense, num_warps=num_warps(ctx.maxlut) ) return (da, None, None, dr, None, None, None, None, None, None, None, None, None, None, None, None, None, None ) class softmax: def __init__(self, layout, block, device, is_dense=False): self.spdims = layout.shape self.layout = layout self.block = block self.lut, self.maxlut = _softmax.make_lut(self.layout, self.block, device) self.is_dense = is_dense def __call__(self, a, *, scale=1.0, rel_logits=None, is_causal=False): if rel_logits is not None and rel_logits.dtype != a.dtype: raise ValueError(f"relative position embedding must be {a.dtype}") a = _softmax.apply( a, scale, rel_logits, is_causal, self.spdims, self.block, self.lut, self.maxlut, self.is_dense, ) return a triton-2.0.0/python/triton/ops/cross_entropy.py000066400000000000000000000070011440023377100217530ustar00rootroot00000000000000import torch import triton import triton.language as tl def next_power_of_2(n): n -= 1 n |= n >> 1 n |= n >> 2 n |= n >> 4 n |= n >> 8 n |= n >> 16 n += 1 return n def num_warps(N): if N < 2048: return 4 elif N < 8192: return 8 return 16 @triton.heuristics({'num_warps': lambda nargs: num_warps(nargs['N'])}) @triton.heuristics({'BLOCK': lambda nargs: next_power_of_2(nargs['N'])}) @triton.jit def _forward(LOGITS, PROBS, IDX, LOSS, N, BLOCK: tl.constexpr): row = tl.program_id(0) cols = tl.arange(0, BLOCK) idx = tl.load(IDX + row) # pointers to logit and probs LOGITS = LOGITS + row * N + cols WRIT_PROBS = PROBS + row * N + cols READ_PROBS = PROBS + row * N + idx # write-back negative log-probs logits = tl.load(LOGITS, mask=cols < N, other=-float('inf')) logits = logits.to(tl.float32) logits = logits - tl.max(logits, 0) probs = tl.log(tl.sum(tl.exp(logits), 0)) - logits tl.store(WRIT_PROBS, probs, mask=cols < N) # There is a bug in the compiler, which fails to insert a barrier here. # We add it explicitly for now. Will be fixed soon. 
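    # (The barrier makes the store to WRIT_PROBS above visible before the
    #  load from READ_PROBS below, since both point into the same PROBS buffer.)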
tl.debug_barrier() # write-back loss probs = tl.load(READ_PROBS) tl.store(LOSS + row, probs) @triton.heuristics({'num_warps': lambda nargs: num_warps(nargs['N'])}) @triton.heuristics({'BLOCK': lambda nargs: next_power_of_2(nargs['N'])}) @triton.jit def _backward(PROBS, IDX, DPROBS, N, BLOCK: tl.constexpr): row = tl.program_id(0) cols = tl.arange(0, BLOCK) idx = tl.load(IDX + row) # pointers to probs PROBS = PROBS + row * N + cols # We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] # and we have -log(p[k]) stored in PROBS, so this is easy probs = -tl.load(PROBS, mask=cols < N, other=float('inf')) probs = tl.exp(probs.to(tl.float32)) delta = cols == idx # write result in-place in PROBS dout = tl.load(DPROBS + row) din = (probs - delta) * dout tl.store(PROBS, din.to(PROBS.dtype.element_ty), mask=cols < N) class _cross_entropy(torch.autograd.Function): @classmethod def forward(cls, ctx, logits, indices): # make sure we can use triton assert (indices.dtype == torch.int64), "Indices are expected to be of type long." # make kernel device, dtype = logits.device, logits.dtype n_cols = logits.shape[-1] # run the kernel result = torch.empty_like(indices, dtype=dtype, device=device) neg_logprobs = torch.empty_like(logits, dtype=dtype, device=device) grid = lambda opt: (logits.numel() // n_cols, ) _forward[grid](logits, neg_logprobs, indices, result, n_cols) # save for backward ctx.save_for_backward(neg_logprobs, indices) return result @classmethod def backward(cls, ctx, dneg_logprobs): """We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] so we initialize the gradient as neg_logprobs, so we can just exponentiate to get p[k], which is most of what we need... neg_logprobs will be modified in place to become the gradient we want """ # load saved tensors neg_logprobs, indices = ctx.saved_tensors # run the kernel # neg_logprobs will be modified in place to become our gradient: n_cols = neg_logprobs.shape[-1] grid = lambda opt: (neg_logprobs.numel() // n_cols, ) _backward[grid](neg_logprobs, indices, dneg_logprobs, n_cols) return neg_logprobs, None cross_entropy = _cross_entropy.apply triton-2.0.0/python/triton/ops/flash_attention.py000066400000000000000000000233631440023377100222350ustar00rootroot00000000000000""" Fused Attention =============== This is a Triton implementation of the Flash Attention algorithm (see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) """ import torch import triton import triton.language as tl @triton.jit def _fwd_kernel( Q, K, V, sm_scale, L, M, Out, stride_qz, stride_qh, stride_qm, stride_qk, stride_kz, stride_kh, stride_kn, stride_kk, stride_vz, stride_vh, stride_vk, stride_vn, stride_oz, stride_oh, stride_om, stride_on, Z, H, N_CTX, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, ): start_m = tl.program_id(0) off_hz = tl.program_id(1) # initialize offsets offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) offs_n = tl.arange(0, BLOCK_N) offs_d = tl.arange(0, BLOCK_DMODEL) off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk off_k = off_hz * stride_qh + offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kk off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk # Initialize pointers to Q, K, V q_ptrs = Q + off_q k_ptrs = K + off_k v_ptrs = V + off_v # initialize pointer to m and l m_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_prev = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, 
BLOCK_DMODEL], dtype=tl.float32) # load q: it will stay in SRAM throughout q = tl.load(q_ptrs) # loop over k, v and update accumulator for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): # -- compute qk ---- k = tl.load(k_ptrs) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) qk += tl.dot(q, k) qk *= sm_scale qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) # compute new m m_curr = tl.maximum(tl.max(qk, 1), m_prev) # correct old l l_prev *= tl.exp(m_prev - m_curr) # attention weights p = tl.exp(qk - m_curr[:, None]) l_curr = tl.sum(p, 1) + l_prev # rescale operands of matmuls l_rcp = 1. / l_curr p *= l_rcp acc *= (l_prev * l_rcp)[:, None] # update acc p = p.to(tl.float16) v = tl.load(v_ptrs) acc += tl.dot(p, v) # update m_i and l_i l_prev = l_curr m_prev = m_curr # update pointers k_ptrs += BLOCK_N * stride_kn v_ptrs += BLOCK_N * stride_vk # rematerialize offsets to save registers start_m = tl.program_id(0) offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) # write back l and m l_ptrs = L + off_hz * N_CTX + offs_m m_ptrs = M + off_hz * N_CTX + offs_m tl.store(l_ptrs, l_prev) tl.store(m_ptrs, m_prev) # initialize pointers to output offs_n = tl.arange(0, BLOCK_DMODEL) off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on out_ptrs = Out + off_o tl.store(out_ptrs, acc) @triton.jit def _bwd_preprocess( Out, DO, L, NewDO, Delta, BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr, ): off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) off_n = tl.arange(0, D_HEAD) # load o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) denom = tl.load(L + off_m).to(tl.float32) # compute do = do / denom[:, None] delta = tl.sum(o * do, axis=1) # write-back tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) tl.store(Delta + off_m, delta) @triton.jit def _bwd_kernel( Q, K, V, sm_scale, Out, DO, DQ, DK, DV, L, M, D, stride_qz, stride_qh, stride_qm, stride_qk, stride_kz, stride_kh, stride_kn, stride_kk, stride_vz, stride_vh, stride_vk, stride_vn, Z, H, N_CTX, num_block, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, ): off_hz = tl.program_id(0) off_z = off_hz // H off_h = off_hz % H # offset pointers for batch/head Q += off_z * stride_qz + off_h * stride_qh K += off_z * stride_qz + off_h * stride_qh V += off_z * stride_qz + off_h * stride_qh DO += off_z * stride_qz + off_h * stride_qh DQ += off_z * stride_qz + off_h * stride_qh DK += off_z * stride_qz + off_h * stride_qh DV += off_z * stride_qz + off_h * stride_qh for start_n in range(0, num_block): lo = start_n * BLOCK_M # initialize row/col offsets offs_qm = lo + tl.arange(0, BLOCK_M) offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) offs_m = tl.arange(0, BLOCK_N) offs_k = tl.arange(0, BLOCK_DMODEL) # initialize pointers to value-like data q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) # pointer to row-wise quantities in value-like data D_ptrs = D + off_hz * N_CTX m_ptrs = M + off_hz * N_CTX # initialize dv amd dk dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) # k and v 
stay in SRAM throughout k = tl.load(k_ptrs) v = tl.load(v_ptrs) # loop over rows for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): offs_m_curr = start_m + offs_m # load q, k, v, do on-chip q = tl.load(q_ptrs) # recompute p = softmax(qk, dim=-1).T # NOTE: `do` is pre-divided by `l`; no normalization here qk = tl.dot(q, tl.trans(k)) qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) m = tl.load(m_ptrs + offs_m_curr) p = tl.exp(qk * sm_scale - m[:, None]) # compute dv do = tl.load(do_ptrs) dv += tl.dot(tl.trans(p.to(tl.float16)), do) # compute dp = dot(v, do) Di = tl.load(D_ptrs + offs_m_curr) dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] dp += tl.dot(do, tl.trans(v)) # compute ds = p * (dp - delta[:, None]) ds = p * dp * sm_scale # compute dk = dot(ds.T, q) dk += tl.dot(tl.trans(ds.to(tl.float16)), q) # compute dq dq = tl.load(dq_ptrs) dq += tl.dot(ds.to(tl.float16), k) tl.store(dq_ptrs, dq) # increment pointers dq_ptrs += BLOCK_M * stride_qm q_ptrs += BLOCK_M * stride_qm do_ptrs += BLOCK_M * stride_qm # write-back dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) tl.store(dv_ptrs, dv) tl.store(dk_ptrs, dk) class _attention(torch.autograd.Function): @staticmethod def forward(ctx, q, k, v, sm_scale): # only support for Ampere now capability = torch.cuda.get_device_capability() if capability[0] < 8: raise RuntimeError("Flash attention currently only supported for compute capability < 80") BLOCK = 128 # shape constraints Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] assert Lq == Lk and Lk == Lv # assert Lk in {16, 32, 64, 128} assert Lk in {64} # TODO: fix other cases o = torch.empty_like(q) grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1], 1) L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) num_warps = 4 if Lk <= 64 else 8 _fwd_kernel[grid]( q, k, v, sm_scale, L, m, o, q.stride(0), q.stride(1), q.stride(2), q.stride(3), k.stride(0), k.stride(1), k.stride(2), k.stride(3), v.stride(0), v.stride(1), v.stride(2), v.stride(3), o.stride(0), o.stride(1), o.stride(2), o.stride(3), q.shape[0], q.shape[1], q.shape[2], BLOCK_M=BLOCK, BLOCK_N=BLOCK, BLOCK_DMODEL=Lk, num_warps=num_warps, num_stages=2, ) ctx.save_for_backward(q, k, v, o, L, m) ctx.grid = grid ctx.sm_scale = sm_scale ctx.BLOCK_DMODEL = Lk return o @staticmethod def backward(ctx, do): BLOCK = 128 q, k, v, o, l, m = ctx.saved_tensors do = do.contiguous() dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.empty_like(k) dv = torch.empty_like(v) do_scaled = torch.empty_like(do) delta = torch.empty_like(l) _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )]( o, do, l, do_scaled, delta, BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL, ) _bwd_kernel[(ctx.grid[1],)]( q, k, v, ctx.sm_scale, o, do_scaled, dq, dk, dv, l, m, delta, q.stride(0), q.stride(1), q.stride(2), q.stride(3), k.stride(0), k.stride(1), k.stride(2), k.stride(3), v.stride(0), v.stride(1), v.stride(2), v.stride(3), q.shape[0], q.shape[1], q.shape[2], ctx.grid[0], BLOCK_M=BLOCK, BLOCK_N=BLOCK, BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8, num_stages=1, ) return dq, dk, dv, None attention = _attention.apply triton-2.0.0/python/triton/ops/matmul.py000066400000000000000000000152661440023377100203550ustar00rootroot00000000000000import torch import triton import triton.language as tl from 
.matmul_perf_model import early_config_prune, estimate_matmul_time def init_to_zero(name): return lambda nargs: nargs[name].zero_() def get_configs_io_bound(): configs = [] for num_stages in [2, 3, 4, 5, 6]: for block_m in [16, 32]: for block_k in [32, 64]: for block_n in [32, 64, 128, 256]: num_warps = 2 if block_n <= 64 else 4 configs.append( triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, num_stages=num_stages, num_warps=num_warps)) # split_k for split_k in [2, 4, 8, 16]: configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) return configs @triton.autotune( configs=[ # basic configs for compute-bound matmuls triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), # good for int8 triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), ] + get_configs_io_bound(), key=['M', 'N', 'K'], prune_configs_by={ 'early_config_prune': early_config_prune, 'perf_model': estimate_matmul_time, 'top_k': 10 }, ) @triton.heuristics({ 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, }) @triton.jit def _kernel(A, B, C, M, N, K, stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, ACC_TYPE: tl.constexpr ): # matrix multiplication pid = tl.program_id(0) pid_z = tl.program_id(1) grid_m = (M + BLOCK_M - 1) // BLOCK_M grid_n = (N + BLOCK_N - 1) // BLOCK_N # re-order program ID for better L2 performance width = GROUP_M * grid_n group_id = pid // width group_size = min(grid_m - group_id * GROUP_M, GROUP_M) pid_m = group_id * GROUP_M + 
(pid % group_size) pid_n = (pid % width) // (group_size) # do matrix multiplication rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) # pointers A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) for k in range(K, 0, -BLOCK_K * SPLIT_K): if EVEN_K: a = tl.load(A) b = tl.load(B) else: a = tl.load(A, mask=rk[None, :] < k, other=0.) b = tl.load(B, mask=rk[:, None] < k, other=0.) acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk acc = acc.to(C.dtype.element_ty) # rematerialize rm and rn to save registers rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) mask = (rm < M)[:, None] & (rn < N)[None, :] # handles write-back with reduction-splitting if SPLIT_K == 1: tl.store(C, acc, mask=mask) else: tl.atomic_add(C, acc, mask=mask) class _matmul(torch.autograd.Function): kernel = _kernel _locks = {} @staticmethod def _call(a, b): device = a.device # handle non-contiguous inputs if necessary if a.stride(0) > 1 and a.stride(1) > 1: a = a.contiguous() if b.stride(0) > 1 and b.stride(1) > 1: b = b.contiguous() # checks constraints assert a.shape[1] == b.shape[0], "incompatible dimensions" M, K = a.shape _, N = b.shape # allocates output c = torch.empty((M, N), device=device, dtype=a.dtype) # accumulator types ACC_TYPE = tl.float32 if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 # launch kernel grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) _kernel[grid](a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), GROUP_M=8, ACC_TYPE=ACC_TYPE) return c @staticmethod def forward(ctx, a, b): return _matmul._call(a, b) matmul = _matmul.apply triton-2.0.0/python/triton/ops/matmul_perf_model.py000066400000000000000000000146241440023377100225460ustar00rootroot00000000000000import heapq import torch import triton import triton._C.libtriton.triton as _triton from triton.testing import get_dram_gbps, get_max_simd_tflops, get_max_tensorcore_tflops def get_tensorcore_tflops(backend, device, num_ctas, num_warps, dtype): ''' return compute throughput in TOPS ''' total_warps = num_ctas * min(num_warps, 4) triton.compiler.init_cuda_utils() num_subcores = triton.compiler.cuda_utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs tflops = min(num_subcores, total_warps) / num_subcores * get_max_tensorcore_tflops(dtype, backend, device) return tflops def get_simd_tflops(backend, device, num_ctas, num_warps, dtype): ''' return compute throughput in TOPS ''' total_warps = num_ctas * min(num_warps, 4) num_subcores = triton.compiler.cuda_utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs tflops = min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, backend, device) return tflops def get_tflops(backend, device, num_ctas, num_warps, dtype): capability = torch.cuda.get_device_capability(device) if capability[0] < 8 and dtype == torch.float32: return get_simd_tflops(backend, device, num_ctas, num_warps, dtype) return get_tensorcore_tflops(backend, device, num_ctas, 
num_warps, dtype) def estimate_matmul_time( # backend, device, num_warps, num_stages, A, B, C, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, debug=False, **kwargs ): ''' return estimated running time in ms = max(compute, loading) + store ''' backend = _triton.runtime.backend.CUDA device = torch.cuda.current_device() dtype = A.dtype dtsize = A.element_size() num_cta_m = triton.cdiv(M, BLOCK_M) num_cta_n = triton.cdiv(N, BLOCK_N) num_cta_k = SPLIT_K num_ctas = num_cta_m * num_cta_n * num_cta_k # If the input is smaller than the block size M, N = max(M, BLOCK_M), max(N, BLOCK_N) # time to compute total_ops = 2 * M * N * K / (1024 * 1024 * 1024) # GOPS tput = get_tflops(backend, device, num_ctas, num_warps, dtype) compute_ms = total_ops / tput # time to load data num_sm = triton.compiler.cuda_utils.get_device_properties(device)["multiprocessor_count"] active_cta_ratio = min(1, num_ctas / num_sm) active_cta_ratio_bw1 = min(1, num_ctas / 32) # 32 active ctas are enough to saturate active_cta_ratio_bw2 = max(min(1, (num_ctas - 32) / (108 - 32)), 0) # 32-108, remaining 5% dram_bw = get_dram_gbps(backend, device) * (active_cta_ratio_bw1 * 0.95 + active_cta_ratio_bw2 * 0.05) # in GB/s l2_bw = dram_bw * 4 # rough estimation (should be 4.7 for A100?) # assume 80% of (following) loads are in L2 cache load_a_dram = M * K * dtsize * (1 + 0.2 * (num_cta_n - 1)) load_a_l2 = M * K * dtsize * 0.8 * (num_cta_n - 1) load_b_dram = N * K * dtsize * (1 + 0.2 * (num_cta_m - 1)) load_b_l2 = N * K * dtsize * 0.8 * (num_cta_m - 1) # total total_dram = (load_a_dram + load_b_dram) / (1024 * 1024) # MB total_l2 = (load_a_l2 + load_b_l2) / (1024 * 1024) # loading time in ms load_ms = total_dram / dram_bw + total_l2 / l2_bw # estimate storing time store_bw = dram_bw * 0.6 # :o store_c_dram = M * N * dtsize * SPLIT_K / (1024 * 1024) # MB if SPLIT_K == 1: store_ms = store_c_dram / store_bw else: reduce_bw = store_bw store_ms = store_c_dram / reduce_bw # c.zero_() zero_ms = M * N * 2 / (1024 * 1024) / store_bw store_ms += zero_ms total_time_ms = max(compute_ms, load_ms) + store_ms if debug: print(f'Total time: {total_time_ms}ms, compute time: {compute_ms}ms, ' f'loading time: {load_ms}ms, store time: {store_ms}ms, ' f'Activate CTAs: {active_cta_ratio*100}%') return total_time_ms def early_config_prune(configs, named_args): device = torch.cuda.current_device() capability = torch.cuda.get_device_capability() # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages dtsize = named_args['A'].element_size() dtype = named_args['A'].dtype # 1. 
make sure we have enough smem pruned_configs = [] for config in configs: kw = config.kwargs BLOCK_M, BLOCK_N, BLOCK_K, num_stages = \ kw['BLOCK_M'], kw['BLOCK_N'], kw['BLOCK_K'], config.num_stages # TODO: move to `cuda_utils` submodule triton.compiler.init_cuda_utils() max_shared_memory = triton.compiler.cuda_utils.get_device_properties(device)["max_shared_mem"] required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize if required_shared_memory <= max_shared_memory: pruned_configs.append(config) configs = pruned_configs # Some dtypes do not allow atomic_add if dtype not in [torch.float16, torch.float32]: configs = [config for config in configs if config.kwargs['SPLIT_K'] == 1] # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps) configs_map = {} for config in configs: kw = config.kwargs BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages = \ kw['BLOCK_M'], kw['BLOCK_N'], kw['BLOCK_K'], kw['SPLIT_K'], config.num_warps, config.num_stages key = (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps) if key in configs_map: configs_map[key].append((config, num_stages)) else: configs_map[key] = [(config, num_stages)] pruned_configs = [] for k, v in configs_map.items(): BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps = k if capability[0] >= 8: # compute cycles (only works for ampere GPUs) mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16) mma_cycles = mmas / min(4, num_warps) * 8 ldgsts_latency = 300 # Does this matter? optimal_num_stages = ldgsts_latency / mma_cycles # nearest stages, prefer large #stages nearest = heapq.nsmallest(2, v, key=lambda x: 10 + abs(x[1] - optimal_num_stages) if (x[1] - optimal_num_stages) < 0 else x[1] - optimal_num_stages) for n in nearest: pruned_configs.append(n[0]) else: # Volta & Turing only supports num_stages <= 2 random_config = v[0][0] random_config.num_stages = 2 pruned_configs.append(random_config) return pruned_configs triton-2.0.0/python/triton/runtime/000077500000000000000000000000001440023377100173545ustar00rootroot00000000000000triton-2.0.0/python/triton/runtime/__init__.py000066400000000000000000000004111440023377100214610ustar00rootroot00000000000000from .autotuner import Config, Heuristics, autotune, heuristics from .jit import JITFunction, KernelInterface, version_key __all__ = [ "Config", "Heuristics", "autotune", "heuristics", "JITFunction", "KernelInterface", "version_key", ] triton-2.0.0/python/triton/runtime/autotuner.py000066400000000000000000000225531440023377100217630ustar00rootroot00000000000000from __future__ import annotations import builtins import time from typing import Dict from ..compiler import OutOfResources from ..testing import do_bench from .jit import KernelInterface class Autotuner(KernelInterface): def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None): ''' :param prune_configs_by: a dict of functions that are used to prune configs, fields: 'perf_model': performance model used to predicate running time with different configs, returns running time 'top_k': number of configs to bench 'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs. 
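            For example, `triton.ops.matmul` above passes
            prune_configs_by={'early_config_prune': early_config_prune,
                              'perf_model': estimate_matmul_time,
                              'top_k': 10}.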
''' if not configs: self.configs = [Config({}, num_warps=4, num_stages=2)] else: self.configs = configs self.key_idx = [arg_names.index(k) for k in key] self.cache = {} # hook to reset all required tensor to zeros before relaunching a kernel self.hook = lambda args: 0 if reset_to_zero is not None: self.reset_idx = [arg_names.index(k) for k in reset_to_zero] def _hook(args): for i in self.reset_idx: args[i].zero_() self.hook = _hook self.arg_names = arg_names # prune configs if prune_configs_by: perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k'] if 'early_config_prune' in prune_configs_by: early_config_prune = prune_configs_by['early_config_prune'] else: perf_model, top_k, early_config_prune = None, None, None self.perf_model, self.configs_top_k = perf_model, top_k self.early_config_prune = early_config_prune self.fn = fn def _bench(self, *args, config, **meta): # check for conflicts, i.e. meta-parameters both provided # as kwargs and by the autotuner conflicts = meta.keys() & config.kwargs.keys() if conflicts: raise ValueError( f"Conflicting meta-parameters: {', '.join(conflicts)}." " Make sure that you don't re-define auto-tuned symbols." ) # augment meta-parameters with tunable ones current = dict(meta, **config.kwargs) def kernel_call(): if config.pre_hook: config.pre_hook(self.nargs) self.hook(args) self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current) try: return do_bench(kernel_call) except OutOfResources: return float('inf') def run(self, *args, **kwargs): self.nargs = dict(zip(self.arg_names, args)) if len(self.configs) > 1: key = tuple(args[i] for i in self.key_idx) if key not in self.cache: # prune configs pruned_configs = self.prune_configs(kwargs) bench_start = time.time() timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs} bench_end = time.time() self.bench_time = bench_end - bench_start self.cache[key] = builtins.min(timings, key=timings.get) self.hook(args) self.configs_timings = timings config = self.cache[key] else: config = self.configs[0] self.best_config = config if config.pre_hook is not None: config.pre_hook(self.nargs) return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs) def prune_configs(self, kwargs): pruned_configs = self.configs if self.early_config_prune: pruned_configs = self.early_config_prune(self.configs, self.nargs) if self.perf_model: top_k = self.configs_top_k if isinstance(top_k, float) and top_k <= 1.0: top_k = int(len(self.configs) * top_k) if len(pruned_configs) > top_k: est_timing = { config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages, num_warps=config.num_warps) for config in pruned_configs } pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] return pruned_configs def warmup(self, *args, **kwargs): self.nargs = dict(zip(self.arg_names, args)) for config in self.prune_configs(kwargs): self.fn.warmup( *args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs, ) self.nargs = None class Config: """ An object that represents a possible kernel configuration for the auto-tuner to try. :ivar meta: a dictionary of meta-parameters to pass to the kernel as keyword arguments. :type meta: dict[Str, Any] :ivar num_warps: the number of warps to use for the kernel when compiled for GPUs. 
For example, if `num_warps=8`, then each kernel instance will be automatically parallelized to cooperatively execute using `8 * 32 = 256` threads. :type num_warps: int :ivar num_stages: the number of stages that the compiler should use when software-pipelining loops. Mostly useful for matrix multiplication workloads on SM80+ GPUs. :type num_stages: int :ivar pre_hook: a function that will be called before the kernel is called. Parameters of this function are args. """ def __init__(self, kwargs, num_warps=4, num_stages=2, pre_hook=None): self.kwargs = kwargs self.num_warps = num_warps self.num_stages = num_stages self.pre_hook = pre_hook def __str__(self): res = [] for k, v in self.kwargs.items(): res.append(f'{k}: {v}') res.append(f'num_warps: {self.num_warps}') res.append(f'num_stages: {self.num_stages}') return ', '.join(res) def autotune(configs, key, prune_configs_by=None, reset_to_zero=None): """ Decorator for auto-tuning a :code:`triton.jit`'d function. .. highlight:: python .. code-block:: python @triton.autotune(configs=[ triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4), triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8), ], key=['x_size'] # the two above configs will be evaluated anytime # the value of x_size changes ) @triton.jit def kernel(x_ptr, x_size, **META): BLOCK_SIZE = META['BLOCK_SIZE'] :note: When all the configurations are evaluated, the kernel will run multiple time. This means that whatever value the kernel updates will be updated multiple times. To avoid this undesired behavior, you can use the `reset_to_zero` argument, which reset the value of the provided tensor to `zero` before running any configuration. :param configs: a list of :code:`triton.Config` objects :type configs: list[triton.Config] :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs. :type key: list[str] :param prune_configs_by: a dict of functions that are used to prune configs, fields: 'perf_model': performance model used to predicate running time with different configs, returns running time 'top_k': number of configs to bench 'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs. :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs. :type reset_to_zero: list[str] """ def decorator(fn): return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, prune_configs_by) return decorator class Heuristics(KernelInterface): def __init__(self, fn, arg_names, values) -> None: self.fn = fn self.values = values self.arg_names = arg_names def run(self, *args, **kwargs): for v, heur in self.values.items(): kwargs[v] = heur({**dict(zip(self.arg_names, args)), **kwargs}) return self.fn.run(*args, **kwargs) def heuristics(values): """ Decorator for specifying how the values of certain meta-parameters may be computed. This is useful for cases where auto-tuning is prohibitevely expensive, or just not applicable. .. highlight:: python .. code-block:: python @triton.heuristics(values={'BLOCK_SIZE': lambda args: 2 ** int(math.ceil(math.log2(args[1])))}) @triton.jit def kernel(x_ptr, x_size, **META): BLOCK_SIZE = META['BLOCK_SIZE'] # smallest power-of-two >= x_size .param values: a dictionary of meta-parameter names and functions that compute the value of the meta-parameter. each such function takes a list of positional arguments as input. 
.type values: dict[str, Callable[[list[Any]], Any]] """ def decorator(fn): return Heuristics(fn, fn.arg_names, values) return decorator triton-2.0.0/python/triton/runtime/jit.py000066400000000000000000000416211440023377100205200ustar00rootroot00000000000000from __future__ import annotations, division import ast import functools import hashlib import inspect import os import subprocess import textwrap from collections import defaultdict, namedtuple from typing import Callable, Generic, Iterable, Optional, TypeVar, Union, cast, overload import torch import triton from triton.utils import MockTensor try: from torch._C import _cuda_getCurrentRawStream as get_cuda_stream except ImportError: get_cuda_stream = lambda dev_idx: torch.cuda.current_stream(dev_idx).cuda_stream T = TypeVar('T') # ----------------------------------------------------------------------------- # Dependencies Finder # ----------------------------------------------------------------------------- class DependenciesFinder(ast.NodeVisitor): """ This AST visitor is used to find dependencies of a JITFunction. This can be used to invalidate a JITFunction's hash when its source code -- or that of its dependencies -- changes. """ def __init__(self, globals, src) -> None: super().__init__() self.ret = hashlib.md5(src.encode("utf-8")).hexdigest() self.globals = globals def visit_Name(self, node): return self.globals.get(node.id, None) def visit_Attribute(self, node): lhs = self.visit(node.value) while isinstance(lhs, ast.Attribute): lhs = self.visit(lhs.value) if lhs is None or lhs is triton: return None return getattr(lhs, node.attr) def visit_Call(self, node): func = self.visit(node.func) if func is None: return if inspect.isbuiltin(func): return if func.__module__ and func.__module__.startswith('triton.'): return assert isinstance(func, JITFunction) if func.hash is None: tree = ast.parse(func.src) finder = DependenciesFinder(func.__globals__, func.src) finder.visit(tree) func.hash = finder.ret self.ret = (self.ret + func.hash).encode("utf-8") self.ret = hashlib.md5(self.ret).hexdigest() # ----------------------------------------------------------------------------- # JITFunction # ----------------------------------------------------------------------------- @functools.lru_cache() def version_key(): import pkgutil contents = [] # frontend with open(__file__, "rb") as f: contents += [hashlib.md5(f.read()).hexdigest()] with open(triton.compiler.__file__, "rb") as f: contents += [hashlib.md5(f.read()).hexdigest()] # backend with open(triton._C.libtriton.__file__, "rb") as f: contents += [hashlib.md5(f.read()).hexdigest()] # language language_path = os.path.join(*triton.__path__, 'language') for lib in pkgutil.iter_modules([language_path]): with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: contents += [hashlib.md5(f.read()).hexdigest()] # ptxas version try: ptxas_version = hashlib.md5(subprocess.check_output(["ptxas", "--version"])).hexdigest() except Exception: ptxas_version = '' return '-'.join(triton.__version__) + '-' + ptxas_version + '-' + '-'.join(contents) class KernelInterface(Generic[T]): run: T def __getitem__(self, grid) -> T: """ A JIT function is launched with: fn[grid](*args, **kwargs). Hence JITFunction.__getitem__ returns a callable proxy that memorizes the grid. 
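        E.g. ``kernel[(grid_0,)](x, N, BLOCK=1024)`` is shorthand for
        ``kernel.run(x, N, BLOCK=1024, grid=(grid_0,))`` (the argument names
        here are purely illustrative).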
""" return cast(T, functools.partial(cast(Callable, self.run), grid=grid)) class JITFunction(KernelInterface[T]): # Hook for inspecting compiled functions and modules cache_hook = None divisibility = 16 @staticmethod def _key_of(arg): if hasattr(arg, "dtype"): return arg.dtype elif isinstance(arg, bool): return "i1" elif isinstance(arg, int): if -2**31 <= arg and arg <= 2**31 - 1: return "i32" elif 2**31 <= arg and arg <= 2**32 - 1: return "u32" elif 2**63 <= arg and arg <= 2**64 - 1: return "u64" else: return "i64" elif isinstance(arg, float): return 'fp32' elif arg is None: return None else: raise TypeError(f'Unsupported type {type(arg)} for {arg}') @staticmethod def _spec_of(arg): if hasattr(arg, "data_ptr"): return (arg.data_ptr() % JITFunction.divisibility == 0) elif isinstance(arg, int): return (arg % 16 == 0, arg == 1) return (arg is None, ) def _get_config(self, *args): def is_divisible_by_16(x): if hasattr(x, "data_ptr"): return x.data_ptr() % JITFunction.divisibility == 0 elif isinstance(x, int): return x % JITFunction.divisibility == 0 if x is None: return True return False divisible_by_16 = {i for i, arg in enumerate(args) if is_divisible_by_16(arg) and i not in self.do_not_specialize} equal_to_1 = {i for i, arg in enumerate(args) if isinstance(arg, int) and arg == 1 and i not in self.do_not_specialize} return namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"])(tuple(divisible_by_16), tuple(equal_to_1)) # return _triton.code_gen.instance_descriptor(divisible_by_16, equal_to_1) @staticmethod def _type_of(key): if isinstance(key, (torch.dtype, triton.language.dtype)): ty = { torch.bool: 'i1', torch.float16: 'fp16', torch.bfloat16: 'bf16', torch.float32: 'fp32', torch.float64: 'fp64', torch.uint8: 'u8', torch.int8: 'i8', torch.int16: 'i16', torch.int32: 'i32', torch.int64: 'i64', triton.language.uint8: 'u8', triton.language.uint16: 'u16', triton.language.uint32: 'u32', triton.language.uint64: 'u64', triton.language.float8: 'fp8', triton.language.float16: 'fp16', triton.language.bfloat16: 'bf16', triton.language.float32: 'fp32', }[key] return f'*{ty}' if key is None: return '*i8' assert isinstance(key, str) return key def _make_signature(self, sig_key): signature = ",".join([self._type_of(k) for i, k in enumerate(sig_key)]) return signature def _make_constants(self, constexpr_key): constants = dict(zip(self.constexprs, constexpr_key)) return constants def _call_hook(self, key, signature, device, constants, num_warps, num_stages, extern_libs, configs): if JITFunction.cache_hook is None: return False name = self.fn.__name__ module = self.fn.__module__ arg_reprs = ', '.join([f'{name}: {ty}' for name, ty in zip(self.arg_names, key[1])]) repr = f"{name}[num_warps={num_warps}, num_stages={num_stages}]({arg_reprs})" key = str(key) class LegacyCompiler: def __init__(self, module, name): self.module = module self.name = name pass kwargs = dict(signature=signature, device=device, constants=constants, num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, configs=configs) return JITFunction.cache_hook(key=key, repr=repr, fn=LegacyCompiler(module, name), compile={"key": key, **kwargs}, is_manual_warmup=False, already_compiled=False) def _make_launcher(self): regular_args = [f'{arg}' for i, arg in enumerate(self.arg_names) if i not in self.constexprs] constexpr_args = [f'{arg}' for i, arg in enumerate(self.arg_names) if i in self.constexprs] args = ', '.join(regular_args) # cache key for regular argument type sig_keys = ', '.join([f'_key_of({arg})' for arg in 
regular_args]) # cache key for constexpr argument values constexpr_keys = ', '.join(constexpr_args) # cache key for argument specialization specializations = [] for i, arg in enumerate(regular_args): if i in self.do_not_specialize: continue specializations += [f'({arg}.data_ptr() % {JITFunction.divisibility} == 0) if hasattr({arg}, "data_ptr") ' f'else ({arg} % {JITFunction.divisibility} == 0, {arg} == 1) if isinstance({arg}, int) ' f'else (False,)'] spec_keys = ', '.join(specializations) grid_args = ','.join([f'"{arg}": {arg}' for arg in self.arg_names]) src = f""" def {self.fn.__name__}({', '.join(self.arg_names)}, grid, num_warps=4, num_stages=3, extern_libs=None, stream=None, warmup=False): sig_key = {sig_keys}, constexpr_key = {f'{constexpr_keys},' if len(constexpr_keys) > 0 else ()} spec_key = {f'{spec_keys},' if len(spec_keys) > 0 else ()} key = (version_key, sig_key, constexpr_key, spec_key) if not extern_libs is None: key = (key, tuple(extern_libs.items())) assert num_warps > 0 and (num_warps & (num_warps - 1)) == 0, "num_warps must be a power of 2" if callable(grid): grid = grid({{{grid_args}}}) grid_size = len(grid) grid_0 = grid[0] grid_1 = grid[1] if grid_size > 1 else 1 grid_2 = grid[2] if grid_size > 2 else 1 device = torch.cuda.current_device() torch.cuda.set_device(device) if stream is None and not warmup: stream = get_cuda_stream(device) try: bin = cache[device][key] if not warmup: bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, triton.compiler.CompiledKernel.launch_enter_hook, triton.compiler.CompiledKernel.launch_exit_hook, bin, {args}) return bin # kernel not cached -- compile except KeyError: # build dict of constant values args = [{args}] all_args = {', '.join([f'{arg}' for arg in self.arg_names])}, configs = self._get_config(*all_args), constants = self._make_constants(constexpr_key) constants.update({{i: None for i, arg in enumerate(all_args) if arg is None}}) constants.update({{i: 1 for i in configs[0].equal_to_1}}) # build kernel signature -- doesn't include specialized arguments signature = {{ i: self._type_of(_key_of(arg)) for i, arg in enumerate(all_args) if i not in self.constexprs }} # build stub signature -- includes arguments that are specialized for i, arg in constants.items(): if callable(arg): raise TypeError(f"Callable constexpr at index {{i}} is not supported") if not self._call_hook(key, signature, device, constants, num_warps, num_stages, extern_libs, configs): bin = triton.compile(self, signature=signature, device=device, constants=constants, num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, configs=configs) if not warmup: bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, triton.compiler.CompiledKernel.launch_enter_hook, triton.compiler.CompiledKernel.launch_exit_hook, bin, *args) self.cache[device][key] = bin return bin return None """ scope = {"version_key": version_key(), "get_cuda_stream": get_cuda_stream, "self": self, "_spec_of": self._spec_of, "_key_of": self._key_of, "cache": self.cache, "triton": triton, "torch": torch} exec(src, scope) return scope[self.fn.__name__] def __init__(self, fn, version=None, do_not_specialize=None): self.fn = fn self.module = fn.__module__ self.version = version # function signature information signature = inspect.signature(fn) self.arg_names = [v.name for v in signature.parameters.values()] self.has_defaults = any(v.default != inspect._empty for v in signature.parameters.values()) # specialization hints 
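        # arguments listed here (by name or by positional index) are excluded from
        # the value-based specialization done in _get_config and _make_launcher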
self.do_not_specialize = [] if do_not_specialize is None else do_not_specialize self.do_not_specialize = {self.arg_names.index(arg) if isinstance(arg, str) else arg for arg in self.do_not_specialize} # function source code (without decorators) self.src = textwrap.dedent(inspect.getsource(fn)) self.src = self.src[self.src.find("def"):] # cache of just-in-time compiled kernels self.cache = defaultdict(dict) self.hash = None # JITFunction can be instantiated as kernel # when called with a grid using __getitem__ self.kernel_decorators = [] self.kernel = None # annotations self.annotations = {self.arg_names.index(name): ty for name, ty in fn.__annotations__.items()} self.__annotations__ = fn.__annotations__ # index of constexprs self.constexprs = [self.arg_names.index(ann) for ann in self.__annotations__.keys()] # launcher self.run = self._make_launcher() # re-use docs of wrapped function self.__doc__ = fn.__doc__ self.__name__ = fn.__name__ self.__globals__ = fn.__globals__ self.__module__ = fn.__module__ @property def cache_key(self): # TODO : hash should be attribute of `self` if self.hash is None: dependencies_finder = DependenciesFinder(globals=self.__globals__, src=self.src) dependencies_finder.visit(self.parse()) self.hash = dependencies_finder.ret + version_key() return self.hash def warmup(self, *args, **kwargs): return self.run(*map(MockTensor.wrap_dtype, args), **kwargs, warmup=True) # we do not parse `src` in the constructor because # the user might want to monkey-patch self.src dynamically. # Our unit tests do this, for example. def parse(self): tree = ast.parse(self.src) assert isinstance(tree, ast.Module) assert len(tree.body) == 1 assert isinstance(tree.body[0], ast.FunctionDef) return tree def __call__(self, *args, **kwargs): raise RuntimeError("Cannot call @triton.jit'd outside of the scope of a kernel") def __setattr__(self, name, value): # - when kernel decorators change, cached kernel # needs to be cleared if name == 'kernel_decorators': self.kernel = None super(JITFunction, self).__setattr__(name, value) # - when `.src` attribute is set, cache path needs # to be reinitialized if name == 'src': self.hash = None def __repr__(self): return f"JITFunction({self.module}:{self.fn.__name__})" # ----------------------------------------------------------------------------- # `jit` decorator # ----------------------------------------------------------------------------- @overload def jit(fn: T) -> JITFunction[T]: ... @overload def jit( *, version=None, do_not_specialize: Optional[Iterable[int]] = None, ) -> Callable[[T], JITFunction[T]]: ... def jit( fn: Optional[T] = None, *, version=None, do_not_specialize: Optional[Iterable[int]] = None, ) -> Union[JITFunction[T], Callable[[T], JITFunction[T]]]: """ Decorator for JIT-compiling a function using the Triton compiler. :note: When a jit'd function is called, :code:`torch.tensor` arguments are implicitly converted to pointers using the :code:`.data_ptr()` method. :note: This function will be compiled and run on the GPU. 
It will only have access to: * python primitives, * builtins within the triton package, * arguments to this function, * other jit'd functions :param fn: the function to be jit-compiled :type fn: Callable """ def decorator(fn: T) -> JITFunction[T]: assert callable(fn) return JITFunction( fn, version=version, do_not_specialize=do_not_specialize, ) if fn is not None: return decorator(fn) else: return decorator class TensorWrapper: def __init__(self, base, dtype): self.dtype = dtype self.base = base self.is_cuda = base.is_cuda self.device = base.device def data_ptr(self): return self.base.data_ptr() def __str__(self) -> str: return f'TensorWrapper[{self.dtype}]({self.base})' def reinterpret(tensor, dtype): if isinstance(tensor, TensorWrapper): if dtype == tensor.base.dtype: # Reinterpreting to the original interpretation; return the base. return tensor.base else: # Reinterpreting a wrapped tensor to a different type. return TensorWrapper(tensor.base, dtype) elif isinstance(tensor, torch.Tensor): # A new wrapper is needed around an unwrapped tensor. return TensorWrapper(tensor, dtype) else: raise TypeError(f'Cannot reinterpret a {type(tensor)}.') triton-2.0.0/python/triton/testing.py000066400000000000000000000415321440023377100177250ustar00rootroot00000000000000import functools import os import subprocess import sys from contextlib import contextmanager import torch import triton._C.libtriton.triton as _triton from .compiler import OutOfResources try: import triton._C.libtriton.cutlass as _cutlass has_cutlass = True except ImportError: _cutlass = None has_cutlass = False # TODO: move to separate module import triton def catch_oor(kernel, pytest_handle=None): try: res = kernel() except OutOfResources as e: if pytest_handle: pytest_handle.skip(str(e)) return None return res def sparsify_tensor(x, mask, block): ret = torch.empty((x.size(0), mask.sum(), block, block), dtype=x.dtype, device=x.device) for idx, (h, i, j) in enumerate(zip(*mask.nonzero(as_tuple=True))): ret[:, idx, :, :] = x[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] return ret def make_pair(shape, device="cuda", alpha=1e-2, beta=0., trans=False, data=None, dtype=torch.float32): if data is None: data = torch.randn(shape, dtype=torch.float32, requires_grad=True, device=device) ref_ret = data ref_ret = ref_ret * alpha + beta ref_ret = ref_ret.half().to(dtype) if trans: ref_ret = ref_ret.t().requires_grad_() ref_ret = ref_ret.detach().requires_grad_() tri_ret = ref_ret.clone().detach().requires_grad_() return ref_ret, tri_ret def cutlass_matmul(a, b): if _cutlass is None: raise RuntimeError("Cannot find cutlass library") M, N = a.shape[0], b.shape[1] Ka, Kb = a.shape[1], b.shape[0] assert Ka == Kb assert a.dtype == b.dtype assert a.device == b.device # allocate output c = torch.empty_strided((M, N), (1, M), dtype=a.dtype, device=a.device) # run function dtype = str(a.dtype).split('.')[-1] _cutlass.matmul(a.data_ptr(), b.data_ptr(), c.data_ptr(), M, N, Ka, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), dtype, dtype, dtype, a.device.index, torch.cuda.current_stream(a.device).cuda_stream) return c def mask_tensor(x, mask, block, value=0): ret = x.clone() for h, i, j in zip(*(mask == 0).nonzero(as_tuple=True)): ret[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] = value return ret def assert_almost_equal(x, y, decimal=2, err_msg=''): import numpy.testing as npt if isinstance(x, torch.Tensor): if x.dtype == torch.bfloat16: x = x.float() x = x.cpu().detach().numpy() if isinstance(y, 
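# --- Editor's note (illustrative sketch): how `TensorWrapper`/`reinterpret`
# above are meant to be used. No data is copied; the wrapper only changes the
# element type a @triton.jit kernel will see for the same storage, which is how
# the test suite exercises dtypes that torch does not expose (this assumes
# `reinterpret` is re-exported as `triton.reinterpret`, as the tests use it).
import torch
import triton
import triton.language as tl

x = torch.randint(0, 127, (1024,), dtype=torch.int8)
x_u8 = triton.reinterpret(x, tl.uint8)              # wrap: same storage, new dtype
assert x_u8.data_ptr() == x.data_ptr()              # pointer is forwarded to the base
assert triton.reinterpret(x_u8, torch.int8) is x    # unwrap back to the original tensor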
torch.Tensor): if y.dtype == torch.bfloat16: y = y.float() y = y.cpu().detach().numpy() npt.assert_array_almost_equal(x, y, err_msg=err_msg, decimal=decimal) def allclose(x, y, tol=1e-2): if x.dtype != y.dtype: raise RuntimeError(f'{x.dtype} did not match with {x.dtype}') if x.shape != y.shape: raise RuntimeError(f'{x.shape} did not match with {y.shape}') if x.dtype == torch.bool: return torch.sum(x ^ y) == 0 if x.dtype in [torch.int8, torch.int16, torch.int32, torch.int64]: tol = 0 diff = abs(x - y) x_max = torch.max(x) y_max = torch.max(y) err = torch.max(diff) / torch.max(x_max, y_max) return err <= tol def nvsmi(attrs): attrs = ','.join(attrs) cmd = ['nvidia-smi', '-i', '0', '--query-gpu=' + attrs, '--format=csv,noheader,nounits'] out = subprocess.check_output(cmd) ret = out.decode(sys.stdout.encoding).split(',') ret = [int(x) for x in ret] return ret def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=(0.5, 0.2, 0.8), record_clocks=False, fast_flush=False): """ Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with the 20-th and 80-th performance percentile. :param fn: Function to benchmark :type fn: Callable :param warmup: Warmup time (in ms) :type warmup: int :param rep: Repetition time (in ms) :type rep: int :param grad_to_none: Reset the gradient of the provided tensor to None :type grad_to_none: torch.tensor, optional :param percentiles: Performance percentile to return in addition to the median. :type percentiles: list[float] :param fast_flush: Use faster kernel to flush L2 between measurements :type fast_flush: bool """ # Estimate the runtime of the function fn() torch.cuda.synchronize() start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) start_event.record() for _ in range(5): fn() end_event.record() torch.cuda.synchronize() estimate_ms = start_event.elapsed_time(end_event) / 5 # compute number of warmup and repeat n_warmup = max(1, int(warmup / estimate_ms)) n_repeat = max(1, int(rep / estimate_ms)) # We maintain a buffer of 256 MB that we clear # before each kernel call to make sure that the L2 # doesn't contain any input data before the run start_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)] end_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)] if fast_flush: cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda') else: cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda') # Warm-up for _ in range(n_warmup): fn() # Benchmark for i in range(n_repeat): # we don't want `fn` to accumulate gradient values # if it contains a backward pass. So we clear the # provided gradients if grad_to_none is not None: for x in grad_to_none: x.grad = None # we clear the L2 cache before each run cache.zero_() # record time of `fn` start_event[i].record() fn() end_event[i].record() # Record clocks torch.cuda.synchronize() times = torch.tensor([s.elapsed_time(e) for s, e in zip(start_event, end_event)]) if percentiles: percentiles = torch.quantile(times, torch.tensor(percentiles)).tolist() return tuple(percentiles) else: return torch.mean(times).item() class Benchmark: """ This class is used by the :code:`perf_report` function to generate line plots with a concise API. 
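# --- Editor's note (illustrative usage of `do_bench` above): with the default
# `percentiles=(0.5, 0.2, 0.8)` it returns the median runtime in milliseconds
# followed by the 20th and 80th percentiles. The matmul workload and the FLOP
# accounting below are only an example.
import torch
import triton

a = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
b = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)

ms, ms_p20, ms_p80 = triton.testing.do_bench(lambda: torch.matmul(a, b))
tflops = 2 * 4096**3 / (ms * 1e-3) / 1e12   # 2*M*N*K flops over the median time
print(f"median {ms:.3f} ms  ->  {tflops:.1f} TFLOP/s")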
""" def __init__( self, x_names, x_vals, line_arg, line_vals, line_names, plot_name, args, xlabel='', ylabel='', x_log=False, y_log=False, color=None, styles=None, ): """ Constructor :param x_names: Name of the arguments that should appear on the x axis of the plot. If the list contains more than one element, all the arguments are assumed to have the same value. :type x_names: List[str] :param x_vals: List of values to use for the arguments in :code:`x_names`. :type x_vals: List[Any] :param line_arg: Argument name for which different values correspond to different lines in the plot. :type line_arg: str :param line_vals: List of values to use for the arguments in :code:`line_arg`. :type line_vals: List[str] :param line_names: Label names for the different lines. :type line_names: List[str] :param plot_name: Name of the plot. :type plot_name: str :param args: List of arguments to remain fixed throughout the benchmark. :type args: List[str] :param xlabel: Label for the x axis of the plot. :type xlabel: str, optional :param ylabel: Label for the y axis of the plot. :type ylabel: str, optional :param x_log: Whether the x axis should be log scale. :type x_log: bool, optional :param y_log: Whether the y axis should be log scale. :type y_log: bool, optional """ self.x_names = x_names self.x_vals = x_vals self.x_log = x_log self.line_arg = line_arg self.line_vals = line_vals self.line_names = line_names self.y_log = y_log self.styles = styles # plot info self.xlabel = xlabel self.ylabel = ylabel self.plot_name = plot_name self.args = args class Mark: def __init__(self, fn, benchmarks): self.fn = fn self.benchmarks = benchmarks def _run(self, bench, save_path, show_plots, print_data): import os import matplotlib.pyplot as plt import pandas as pd y_mean = bench.line_names y_min = [f'{x}-min' for x in bench.line_names] y_max = [f'{x}-max' for x in bench.line_names] df = pd.DataFrame(columns=[bench.x_names[0]] + y_mean + y_min + y_max) for x in bench.x_vals: x_args = {x_name: x for x_name in bench.x_names} row_mean, row_min, row_max = [], [], [] for y in bench.line_vals: ret = self.fn(**x_args, **{bench.line_arg: y}, **bench.args) try: y_mean, y_min, y_max = ret except TypeError: y_mean, y_min, y_max = ret, None, None row_mean += [y_mean] row_min += [y_min] row_max += [y_max] df.loc[len(df)] = [x] + row_mean + row_min + row_max if bench.plot_name: plt.figure() ax = plt.subplot() x = bench.x_names[0] for i, y in enumerate(bench.line_names): y_min, y_max = df[y + '-min'], df[y + '-max'] col = bench.styles[i][0] if bench.styles else None sty = bench.styles[i][1] if bench.styles else None ax.plot(df[x], df[y], label=y, color=col, ls=sty) if y_min is not None and y_max is not None: ax.fill_between(df[x], y_min, y_max, alpha=0.15, color=col) ax.legend() xlabel = bench.xlabel if bench.xlabel else " = ".join(bench.x_names) ax.set_xlabel(xlabel) ax.set_ylabel(bench.ylabel) # ax.set_title(bench.plot_name) ax.set_xscale("log" if bench.x_log else "linear") ax.set_yscale("log" if bench.y_log else "linear") if show_plots: plt.show() if save_path: plt.savefig(os.path.join(save_path, f"{bench.plot_name}.png")) df = df[[bench.x_names[0]] + bench.line_names] if print_data: print(bench.plot_name + ':') print(df) if save_path: df.to_csv(os.path.join(save_path, f"{bench.plot_name}.csv"), float_format='%.1f', index=False) def run(self, show_plots=False, print_data=False, save_path=''): has_single_bench = isinstance(self.benchmarks, Benchmark) benchmarks = [self.benchmarks] if has_single_bench else self.benchmarks if 
save_path: html = open(os.path.join(save_path, "results.html"), "w") html.write("\n") for bench in benchmarks: self._run(bench, save_path, show_plots, print_data) if save_path: html.write(f"\n") if save_path: html.write("\n") def perf_report(benchmarks): """ Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value. :param benchmarks: Benchmarking configurations. :type benchmarks: List of :class:`Benchmark` """ wrapper = lambda fn: Mark(fn, benchmarks) return wrapper def get_dram_gbps(backend=None, device=None): ''' return DRAM bandwidth in GB/s ''' # assert backend == CUDA if not backend: backend = _triton.runtime.backend.CUDA if not device: device = torch.cuda.current_device() mem_clock_khz = triton.compiler.cuda_utils.get_device_properties(device)["mem_clock_rate"] # in kHz bus_width = triton.compiler.cuda_utils.get_device_properties(device)["mem_bus_width"] bw_gbps = mem_clock_khz * bus_width * 2 / 1e6 / 8 # In GB/s return bw_gbps def get_max_tensorcore_tflops(dtype: torch.dtype, backend=None, device=None, clock_rate=None): if not backend: backend = _triton.runtime.backend.CUDA if not device: device = torch.cuda.current_device() triton.compiler.init_cuda_utils() num_subcores = triton.compiler.cuda_utils.get_device_properties(device)["multiprocessor_count"] * 4 if not clock_rate: clock_rate = triton.compiler.cuda_utils.get_device_properties(device)["sm_clock_rate"] # in kHz capability = torch.cuda.get_device_capability(device) if capability[0] < 8: assert dtype == torch.float16 ops_per_sub_core = 256 # 2 4x4x4 Tensor Cores else: if dtype == torch.float32: ops_per_sub_core = 256 elif dtype in [torch.float16, torch.bfloat16]: ops_per_sub_core = 512 elif dtype == torch.int8: ops_per_sub_core = 1024 else: raise RuntimeError("dtype not supported") tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9 return tflops # create decorator that wraps test function into # a cuda-memcheck system call def cuda_memcheck(**target_kwargs): def decorator(test_fn): @functools.wraps(test_fn) def wrapper(*args, **kwargs): import psutil ppid_name = psutil.Process(os.getppid()).name() run_cuda_memcheck = target_kwargs.items() <= kwargs.items() if run_cuda_memcheck and ppid_name != "cuda-memcheck": path = os.path.realpath(test_fn.__globals__["__file__"]) # get path of current file env = {"PATH": os.environ["PATH"], "PYTORCH_NO_CUDA_MEMORY_CACHING": "1"} assert 'request' in kwargs, "memcheck'ed test must have a (possibly unused) `request` fixture" test_id = kwargs['request'].node.callspec.id cmd = f"{path}::{test_fn.__name__}[{test_id}]" out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env) assert out.returncode == 0, "cuda-memcheck returned an error: bounds checking failed" assert "ERROR SUMMARY: 0 errors" in str(out.stdout) else: test_fn(*args, **kwargs) return wrapper return decorator def nvsmi_attr(attrs): attrs = ",".join(attrs) cmd = [ "nvidia-smi", "-i", "0", "--query-gpu=" + attrs, "--format=csv,noheader,nounits", ] out = subprocess.check_output(cmd) ret = out.decode(sys.stdout.encoding).split(",") ret = [int(x) for x in ret] return ret @contextmanager def set_gpu_clock(ref_sm_clock=1350, ref_mem_clock=1215): try: subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "1"]) subprocess.check_output( [ "nvidia-smi", "-i", "0", f"--lock-gpu-clocks={ref_sm_clock},{ref_sm_clock}", ] ) subprocess.check_output( [ "nvidia-smi", "-i", "0", f"--lock-memory-clocks={ref_mem_clock},{ref_mem_clock}", ] ) 
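# --- Editor's note (illustrative sketch, requires a CUDA device): the peak
# helpers defined above give the roofline numbers that measured `do_bench`
# results are usually compared against. The printed interpretation is an
# example, not library behaviour.
import torch
import triton

peak_gbps = triton.testing.get_dram_gbps()
peak_tflops = triton.testing.get_max_tensorcore_tflops(torch.float16)
print(f"peak DRAM bandwidth: {peak_gbps:.0f} GB/s, "
      f"peak fp16 tensor-core throughput: {peak_tflops:.0f} TFLOP/s")
# A kernel reaching a large fraction of `peak_gbps` is memory bound; compare
# compute-heavy kernels against `peak_tflops` instead.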
cur_sm_clock = nvsmi_attr(["clocks.current.sm"])[0] cur_mem_clock = nvsmi_attr(["clocks.current.memory"])[0] assert abs(cur_sm_clock - ref_sm_clock) < 10, f"GPU SMs must run at {ref_sm_clock} MHz" assert abs(cur_mem_clock - ref_mem_clock) < 10, f"GPU SMs must run at {ref_mem_clock} MHz" tflops = 1e-6 * 2 * 108 * 4 * 256 * ref_sm_clock gbps = 640 * 2 * ref_mem_clock * 1e-3 yield tflops, gbps finally: subprocess.check_output(["nvidia-smi", "-i", "0", "-pm", "0"]) subprocess.check_output(["nvidia-smi", "-i", "0", "-rgc"]) subprocess.check_output(["nvidia-smi", "-i", "0", "-rmc"]) def get_max_simd_tflops(dtype: torch.dtype, backend=None, device=None): if not backend: backend = _triton.runtime.backend.CUDA if not device: device = torch.cuda.current_device() num_subcores = _triton.runtime.num_sm(backend, device) * 4 # on recent GPUs clock_rate = _triton.runtime.clock_rate(backend, device) # in kHz cc = _triton.runtime.cc(backend, device) if cc < 80: if dtype == torch.float32: ops_per_sub_core = 32 # 2*16 elif dtype == torch.float16: ops_per_sub_core = 64 else: raise RuntimeError("dtype not supported") else: if dtype == torch.float32: ops_per_sub_core = 32 elif dtype in [torch.float16, torch.bfloat16]: ops_per_sub_core = 64 else: raise RuntimeError("dtype not supported") tflops = num_subcores * clock_rate * ops_per_sub_core * 1e-9 return tflops triton-2.0.0/python/triton/third_party/000077500000000000000000000000001440023377100202225ustar00rootroot00000000000000triton-2.0.0/python/triton/third_party/cuda/000077500000000000000000000000001440023377100211365ustar00rootroot00000000000000triton-2.0.0/python/triton/third_party/cuda/include/000077500000000000000000000000001440023377100225615ustar00rootroot00000000000000triton-2.0.0/python/triton/third_party/cuda/include/cuda.h000077500000000000000000030074571440023377100236710ustar00rootroot00000000000000/* * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. 
Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ #ifndef __cuda_cuda_h__ #define __cuda_cuda_h__ #include #ifdef _MSC_VER typedef unsigned __int32 cuuint32_t; typedef unsigned __int64 cuuint64_t; #else #include typedef uint32_t cuuint32_t; typedef uint64_t cuuint64_t; #endif #if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) #define __CUDA_DEPRECATED #elif defined(_MSC_VER) #define __CUDA_DEPRECATED __declspec(deprecated) #elif defined(__GNUC__) #define __CUDA_DEPRECATED __attribute__((deprecated)) #else #define __CUDA_DEPRECATED #endif #if defined(CUDA_FORCE_API_VERSION) #error "CUDA_FORCE_API_VERSION is no longer supported." #endif #if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) #define __CUDA_API_PER_THREAD_DEFAULT_STREAM #define __CUDA_API_PTDS(api) api ## _ptds #define __CUDA_API_PTSZ(api) api ## _ptsz #else #define __CUDA_API_PTDS(api) api #define __CUDA_API_PTSZ(api) api #endif #define cuDeviceTotalMem cuDeviceTotalMem_v2 #define cuCtxCreate cuCtxCreate_v2 #define cuCtxCreate_v3 cuCtxCreate_v3 #define cuModuleGetGlobal cuModuleGetGlobal_v2 #define cuMemGetInfo cuMemGetInfo_v2 #define cuMemAlloc cuMemAlloc_v2 #define cuMemAllocPitch cuMemAllocPitch_v2 #define cuMemFree cuMemFree_v2 #define cuMemGetAddressRange cuMemGetAddressRange_v2 #define cuMemAllocHost cuMemAllocHost_v2 #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) #define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) #define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) #define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) #define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) #define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) #define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) #define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) #define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) #define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) #define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) #define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) #define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) #define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) #define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) #define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) #define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) 
#define cuArrayCreate cuArrayCreate_v2 #define cuArrayGetDescriptor cuArrayGetDescriptor_v2 #define cuArray3DCreate cuArray3DCreate_v2 #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 #define cuTexRefSetAddress cuTexRefSetAddress_v2 #define cuTexRefGetAddress cuTexRefGetAddress_v2 #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 #define cuCtxDestroy cuCtxDestroy_v2 #define cuCtxPopCurrent cuCtxPopCurrent_v2 #define cuCtxPushCurrent cuCtxPushCurrent_v2 #define cuStreamDestroy cuStreamDestroy_v2 #define cuEventDestroy cuEventDestroy_v2 #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 #define cuLinkCreate cuLinkCreate_v2 #define cuLinkAddData cuLinkAddData_v2 #define cuLinkAddFile cuLinkAddFile_v2 #define cuMemHostRegister cuMemHostRegister_v2 #define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) #define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 #define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 #define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 #define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2 #define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 #define cuGraphInstantiate cuGraphInstantiate_v2 #if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) #define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) #define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies) #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) #define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32) #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) #define cuStreamWaitValue64 
__CUDA_API_PTSZ(cuStreamWaitValue64) #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) #define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload) #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) #define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync) #define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync) #define cuMemAllocAsync __CUDA_API_PTSZ(cuMemAllocAsync) #define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync) #endif /** * \file cuda.h * \brief Header file for the CUDA Toolkit application programming interface. * * \file cudaGL.h * \brief Header file for the OpenGL interoperability functions of the * low-level CUDA driver application programming interface. * * \file cudaD3D9.h * \brief Header file for the Direct3D 9 interoperability functions of the * low-level CUDA driver application programming interface. */ /** * \defgroup CUDA_TYPES Data types used by CUDA driver * @{ */ /** * CUDA API version number */ #define CUDA_VERSION 11060 #ifdef __cplusplus extern "C" { #endif /** * CUDA device pointer * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. */ #if defined(_WIN64) || defined(__LP64__) typedef unsigned long long CUdeviceptr_v2; #else typedef unsigned int CUdeviceptr_v2; #endif typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device pointer */ typedef int CUdevice_v1; /**< CUDA device */ typedef CUdevice_v1 CUdevice; /**< CUDA device */ typedef struct CUctx_st *CUcontext; /**< CUDA context */ typedef struct CUmod_st *CUmodule; /**< CUDA module */ typedef struct CUfunc_st *CUfunction; /**< CUDA function */ typedef struct CUarray_st *CUarray; /**< CUDA array */ typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ typedef struct CUevent_st *CUevent; /**< CUDA event */ typedef struct CUstream_st *CUstream; /**< CUDA stream */ typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ typedef unsigned long long CUtexObject_v1; /**< An opaque value that represents a CUDA texture object */ typedef CUtexObject_v1 CUtexObject; /**< An opaque value that represents a CUDA texture object */ typedef unsigned long long CUsurfObject_v1; /**< An opaque value that represents a CUDA surface object */ typedef CUsurfObject_v1 CUsurfObject; /**< An opaque value that represents a CUDA surface object */ typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ typedef struct CUmemPoolHandle_st *CUmemoryPool; /**< CUDA memory pool */ typedef struct CUuserObject_st *CUuserObject; /**< CUDA user object for graphs */ #ifndef CU_UUID_HAS_BEEN_DEFINED #define 
CU_UUID_HAS_BEEN_DEFINED typedef struct CUuuid_st { /**< CUDA definition of UUID */ char bytes[16]; } CUuuid; #endif /** * CUDA IPC handle size */ #define CU_IPC_HANDLE_SIZE 64 /** * CUDA IPC event handle */ typedef struct CUipcEventHandle_st { char reserved[CU_IPC_HANDLE_SIZE]; } CUipcEventHandle_v1; typedef CUipcEventHandle_v1 CUipcEventHandle; /** * CUDA IPC mem handle */ typedef struct CUipcMemHandle_st { char reserved[CU_IPC_HANDLE_SIZE]; } CUipcMemHandle_v1; typedef CUipcMemHandle_v1 CUipcMemHandle; /** * CUDA Ipc Mem Flags */ typedef enum CUipcMem_flags_enum { CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ } CUipcMem_flags; /** * CUDA Mem Attach Flags */ typedef enum CUmemAttach_flags_enum { CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ } CUmemAttach_flags; /** * Context creation flags */ typedef enum CUctx_flags_enum { CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling * \deprecated This flag was deprecated as of CUDA 4.0 * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ CU_CTX_SCHED_MASK = 0x07, CU_CTX_MAP_HOST = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 * and it no longer has any effect. All contexts * as of CUDA 3.2 behave as though the flag is enabled. */ CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ CU_CTX_FLAGS_MASK = 0x1f } CUctx_flags; /** * Stream creation flags */ typedef enum CUstream_flags_enum { CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ } CUstream_flags; /** * Legacy stream handle * * Stream handle that can be passed as a CUstream to use an implicit stream * with legacy synchronization behavior. * * See details of the \link_sync_behavior */ #define CU_STREAM_LEGACY ((CUstream)0x1) /** * Per-thread stream handle * * Stream handle that can be passed as a CUstream to use an implicit stream * with per-thread synchronization behavior. * * See details of the \link_sync_behavior */ #define CU_STREAM_PER_THREAD ((CUstream)0x2) /** * Event creation flags */ typedef enum CUevent_flags_enum { CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ } CUevent_flags; /** * Event record flags */ typedef enum CUevent_record_flags_enum { CU_EVENT_RECORD_DEFAULT = 0x0, /**< Default event record flag */ CU_EVENT_RECORD_EXTERNAL = 0x1 /**< When using stream capture, create an event record node * instead of the default behavior. This flag is invalid * when used outside of capture. 
*/ } CUevent_record_flags; /** * Event wait flags */ typedef enum CUevent_wait_flags_enum { CU_EVENT_WAIT_DEFAULT = 0x0, /**< Default event wait flag */ CU_EVENT_WAIT_EXTERNAL = 0x1 /**< When using stream capture, create an event wait node * instead of the default behavior. This flag is invalid * when used outside of capture.*/ } CUevent_wait_flags; /** * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 */ typedef enum CUstreamWaitValue_flags_enum { CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit values). Note this is a cyclic comparison which ignores wraparound. (Default behavior.) */ CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be queried with ::cuDeviceGetAttribute() and ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This means that, if a remote write operation is guaranteed to have reached the device before the wait can be satisfied, that write is guaranteed to be visible to downstream device work. The device is permitted to reorder remote writes internally. For example, this flag would be required if two remote writes arrive in a defined order, the wait is satisfied by the second write, and downstream work needs to observe the first write. Support for this operation is restricted to selected platforms and can be queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ } CUstreamWaitValue_flags; /** * Flags for ::cuStreamWriteValue32 */ typedef enum CUstreamWriteValue_flags_enum { CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued before it, as a performance optimization. Normally, ::cuStreamWriteValue32 will provide a memory fence before the write, which has similar semantics to __threadfence_system() but is scoped to the stream rather than a CUDA thread. */ } CUstreamWriteValue_flags; /** * Operations for ::cuStreamBatchMemOp */ typedef enum CUstreamBatchMemOpType_enum { CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a standalone operation. */ } CUstreamBatchMemOpType; /** * Per-operation parameters for ::cuStreamBatchMemOp */ typedef union CUstreamBatchMemOpParams_union { CUstreamBatchMemOpType operation; struct CUstreamMemOpWaitValueParams_st { CUstreamBatchMemOpType operation; CUdeviceptr address; union { cuuint32_t value; cuuint64_t value64; }; unsigned int flags; CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ } waitValue; struct CUstreamMemOpWriteValueParams_st { CUstreamBatchMemOpType operation; CUdeviceptr address; union { cuuint32_t value; cuuint64_t value64; }; unsigned int flags; CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. 
*/ } writeValue; struct CUstreamMemOpFlushRemoteWritesParams_st { CUstreamBatchMemOpType operation; unsigned int flags; } flushRemoteWrites; cuuint64_t pad[6]; } CUstreamBatchMemOpParams_v1; typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; /** * Occupancy calculator flag */ typedef enum CUoccupancy_flags_enum { CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ } CUoccupancy_flags; /** * Flags for ::cuStreamUpdateCaptureDependencies */ typedef enum CUstreamUpdateCaptureDependencies_flags_enum { CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */ CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1 /**< Replace the dependency set with the new nodes */ } CUstreamUpdateCaptureDependencies_flags; /** * Array formats */ typedef enum CUarray_format_enum { CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */ CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */ CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */ CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */ CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */ CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */ CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */ CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */ CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/ CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/ CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */ CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned 
normalized block-compressed (BC5 compression) format */ CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ } CUarray_format; /** * Texture reference addressing modes */ typedef enum CUaddress_mode_enum { CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ } CUaddress_mode; /** * Texture reference filtering modes */ typedef enum CUfilter_mode_enum { CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ } CUfilter_mode; /** * Device properties */ typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. 
*/ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. 
*/ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. 
*/ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. 
*/ CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. */ CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. 
*/ CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */ CU_DEVICE_ATTRIBUTE_MAX } CUdevice_attribute; /** * Legacy device properties */ typedef struct CUdevprop_st { int maxThreadsPerBlock; /**< Maximum number of threads per block */ int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ int sharedMemPerBlock; /**< Shared memory available per block in bytes */ int totalConstantMemory; /**< Constant memory available on device in bytes */ int SIMDWidth; /**< Warp size in threads */ int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ int regsPerBlock; /**< 32-bit registers available per block */ int clockRate; /**< Clock frequency in kilohertz */ int textureAlign; /**< Alignment requirement for textures */ } CUdevprop_v1; typedef CUdevprop_v1 CUdevprop; /** * Pointer information */ typedef enum CUpointer_attribute_enum { CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, /**< A device ordinal of a device on which a pointer was allocated or registered */ CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/ CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, /**< Starting address for this requested pointer */ CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, /**< Size of the address range for this requested pointer */ CU_POINTER_ATTRIBUTE_MAPPED = 13, /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/ CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/ CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/ CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16, /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */ CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/ } CUpointer_attribute; /** * Function properties */ typedef enum CUfunction_attribute_enum { /** * The maximum number of threads per block, beyond which a launch of the * function would fail. This number depends on both the function and the * device on which the function is currently loaded. 
*/ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, /** * The size in bytes of statically-allocated shared memory required by * this function. This does not include dynamically-allocated shared * memory requested by the user at runtime. */ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, /** * The size in bytes of user-allocated constant memory required by this * function. */ CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, /** * The size in bytes of local memory used by each thread of this function. */ CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, /** * The number of registers used by each thread of this function. */ CU_FUNC_ATTRIBUTE_NUM_REGS = 4, /** * The PTX virtual architecture version for which the function was * compiled. This value is the major PTX version * 10 + the minor PTX * version, so a PTX version 1.3 function would return the value 13. * Note that this may return the undefined value of 0 for cubins * compiled prior to CUDA 3.0. */ CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, /** * The binary architecture version for which the function was compiled. * This value is the major binary version * 10 + the minor binary version, * so a binary version 1.3 function would return the value 13. Note that * this will return a value of 10 for legacy cubins that do not have a * properly-encoded binary architecture version. */ CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, /** * The attribute to indicate whether the function has been compiled with * user specified option "-Xptxas --dlcm=ca" set . */ CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, /** * The maximum size in bytes of dynamically-allocated shared memory that can be used by * this function. If the user-specified dynamic shared memory size is larger than this * value, the launch will fail. * See ::cuFuncSetAttribute */ CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, /** * On devices where the L1 cache and shared memory use the same hardware resources, * this sets the shared memory carveout preference, in percent of the total shared memory. * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. * This is only a hint, and the driver can choose a different ratio if required to execute the function. * See ::cuFuncSetAttribute */ CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, CU_FUNC_ATTRIBUTE_MAX } CUfunction_attribute; /** * Function cache configurations */ typedef enum CUfunc_cache_enum { CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ } CUfunc_cache; /** * Shared memory configurations */ typedef enum CUsharedconfig_enum { CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ } CUsharedconfig; /** * Shared memory carveout configurations. 
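 * (Illustrative note, not part of the original header: with a hypothetical, already-loaded ::CUfunction \p kernel, a carveout is typically requested through the function-attribute API mentioned below, e.g.
 * \code
 *   cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
 * \endcode
 * The driver treats the requested value as a hint.)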
These may be passed to ::cuFuncSetAttribute */ typedef enum CUshared_carveout_enum { CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ } CUshared_carveout; /** * Memory types */ typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ } CUmemorytype; /** * Compute Modes */ typedef enum CUcomputemode_enum { CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ } CUcomputemode; /** * Memory advise values */ typedef enum CUmem_advise_enum { CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occasionally be written to */ CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ } CUmem_advise; typedef enum CUmem_range_attribute_enum { CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occasionally be written to */ CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ } CUmem_range_attribute; /** * Online compiler and linker options */ typedef enum CUjit_option_enum { /** * Max number of registers that a thread may use.\n * Option type: unsigned int\n * Applies to: compiler only */ CU_JIT_MAX_REGISTERS = 0, /** * IN: Specifies minimum number of threads per block to target compilation * for\n * OUT: Returns the number of threads the compiler actually targeted. * This restricts the resource utilization of the compiler (e.g. max * registers) such that a block with the given number of threads should be * able to launch based on register limitations.
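 * Illustrative only (not from the original header; \p ptxImage is a hypothetical NUL-terminated PTX string): JIT options such as this one are passed to ::cuModuleLoadDataEx as parallel option/value arrays:
 * \code
 *   CUjit_option opt[1] = { CU_JIT_THREADS_PER_BLOCK };
 *   int threads = 256;
 *   void *optVal[1] = { (void *)(uintptr_t)threads };
 *   CUmodule mod;
 *   cuModuleLoadDataEx(&mod, ptxImage, 1, opt, optVal);
 * \endcode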
Note, this option does not * currently take into account any other resource limitations, such as * shared memory utilization.\n * Cannot be combined with ::CU_JIT_TARGET.\n * Option type: unsigned int\n * Applies to: compiler only */ CU_JIT_THREADS_PER_BLOCK, /** * Overwrites the option value with the total wall clock time, in * milliseconds, spent in the compiler and linker\n * Option type: float\n * Applies to: compiler and linker */ CU_JIT_WALL_TIME, /** * Pointer to a buffer in which to print any log messages * that are informational in nature (the buffer size is specified via * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n * Option type: char *\n * Applies to: compiler and linker */ CU_JIT_INFO_LOG_BUFFER, /** * IN: Log buffer size in bytes. Log messages will be capped at this size * (including null terminator)\n * OUT: Amount of log buffer filled with messages\n * Option type: unsigned int\n * Applies to: compiler and linker */ CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, /** * Pointer to a buffer in which to print any log messages that * reflect errors (the buffer size is specified via option * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n * Option type: char *\n * Applies to: compiler and linker */ CU_JIT_ERROR_LOG_BUFFER, /** * IN: Log buffer size in bytes. Log messages will be capped at this size * (including null terminator)\n * OUT: Amount of log buffer filled with messages\n * Option type: unsigned int\n * Applies to: compiler and linker */ CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, /** * Level of optimizations to apply to generated code (0 - 4), with 4 * being the default and highest level of optimizations.\n * Option type: unsigned int\n * Applies to: compiler only */ CU_JIT_OPTIMIZATION_LEVEL, /** * No option value required. Determines the target based on the current * attached context (default)\n * Option type: No option value needed\n * Applies to: compiler and linker */ CU_JIT_TARGET_FROM_CUCONTEXT, /** * Target is chosen based on supplied ::CUjit_target. Cannot be * combined with ::CU_JIT_THREADS_PER_BLOCK.\n * Option type: unsigned int for enumerated type ::CUjit_target\n * Applies to: compiler and linker */ CU_JIT_TARGET, /** * Specifies choice of fallback strategy if matching cubin is not found. * Choice is based on supplied ::CUjit_fallback. 
This option cannot be * used with cuLink* APIs as the linker requires exact matches.\n * Option type: unsigned int for enumerated type ::CUjit_fallback\n * Applies to: compiler only */ CU_JIT_FALLBACK_STRATEGY, /** * Specifies whether to create debug information in output (-g) * (0: false, default)\n * Option type: int\n * Applies to: compiler and linker */ CU_JIT_GENERATE_DEBUG_INFO, /** * Generate verbose log messages (0: false, default)\n * Option type: int\n * Applies to: compiler and linker */ CU_JIT_LOG_VERBOSE, /** * Generate line number information (-lineinfo) (0: false, default)\n * Option type: int\n * Applies to: compiler only */ CU_JIT_GENERATE_LINE_INFO, /** * Specifies whether to enable caching explicitly (-dlcm) \n * Choice is based on supplied ::CUjit_cacheMode_enum.\n * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n * Applies to: compiler only */ CU_JIT_CACHE_MODE, /** * The below jit options are used for internal purposes only, in this version of CUDA */ CU_JIT_NEW_SM3X_OPT, CU_JIT_FAST_COMPILE, /** * Array of device symbol names that will be relocated to the corresponding * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n * When loading a device module, driver will relocate all encountered * unresolved symbols to the host addresses.\n * It is only allowed to register symbols that correspond to unresolved * global variables.\n * It is illegal to register the same device symbol at multiple addresses.\n * Option type: const char **\n * Applies to: dynamic linker only */ CU_JIT_GLOBAL_SYMBOL_NAMES, /** * Array of host addresses that will be used to relocate corresponding * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n * Option type: void **\n * Applies to: dynamic linker only */ CU_JIT_GLOBAL_SYMBOL_ADDRESSES, /** * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n * Option type: unsigned int\n * Applies to: dynamic linker only */ CU_JIT_GLOBAL_SYMBOL_COUNT, /** * Enable link-time optimization (-dlto) for device code (0: false, default).\n * This option is not supported on 32-bit platforms.\n * Option type: int\n * Applies to: compiler and linker */ CU_JIT_LTO, /** * Control single-precision denormals (-ftz) support (0: false, default). * 1 : flushes denormal values to zero * 0 : preserves denormal values * Option type: int\n * Applies to: link-time optimization specified with CU_JIT_LTO */ CU_JIT_FTZ, /** * Control single-precision floating-point division and reciprocals * (-prec-div) support (1: true, default). * 1 : Enables the IEEE round-to-nearest mode * 0 : Enables the fast approximation mode * Option type: int\n * Applies to: link-time optimization specified with CU_JIT_LTO */ CU_JIT_PREC_DIV, /** * Control single-precision floating-point square root * (-prec-sqrt) support (1: true, default). * 1 : Enables the IEEE round-to-nearest mode * 0 : Enables the fast approximation mode * Option type: int\n * Applies to: link-time optimization specified with CU_JIT_LTO */ CU_JIT_PREC_SQRT, /** * Enable/Disable the contraction of floating-point multiplies * and adds/subtracts into floating-point multiply-add (-fma) * operations (1: Enable, default; 0: Disable).
* Option type: int\n * Applies to: link-time optimization specified with CU_JIT_LTO */ CU_JIT_FMA, CU_JIT_NUM_OPTIONS } CUjit_option; /** * Online compilation targets */ typedef enum CUjit_target_enum { CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ CU_TARGET_COMPUTE_86 = 86 /**< Compute device class 8.6.*/ } CUjit_target; /** * Cubin matching fallback strategies */ typedef enum CUjit_fallback_enum { CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ } CUjit_fallback; /** * Caching modes for dlcm */ typedef enum CUjit_cacheMode_enum { CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ } CUjit_cacheMode; /** * Device code formats */ typedef enum CUjitInputType_enum { /** * Compiled device-class-specific device code\n * Applicable options: none */ CU_JIT_INPUT_CUBIN = 0, /** * PTX source code\n * Applicable options: PTX compiler options */ CU_JIT_INPUT_PTX, /** * Bundle of multiple cubins and/or PTX of some device code\n * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY */ CU_JIT_INPUT_FATBINARY, /** * Host object with embedded device code\n * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY */ CU_JIT_INPUT_OBJECT, /** * Archive of host objects with embedded device code\n * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY */ CU_JIT_INPUT_LIBRARY, /** * High-level intermediate code for link-time optimization\n * Applicable options: NVVM compiler options, PTX compiler options */ CU_JIT_INPUT_NVVM, CU_JIT_NUM_INPUT_TYPES } CUjitInputType; typedef struct CUlinkState_st *CUlinkState; /** * Flags to register a graphics resource */ typedef enum CUgraphicsRegisterFlags_enum { CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 } CUgraphicsRegisterFlags; /** * Flags for mapping and unmapping interop resources */ typedef enum CUgraphicsMapResourceFlags_enum { CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 } CUgraphicsMapResourceFlags; /** * Array indices for cube faces */ typedef enum CUarray_cubemap_face_enum { CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ 
CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ } CUarray_cubemap_face; /** * Limits */ typedef enum CUlimit_enum { CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */ CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */ CU_LIMIT_MAX } CUlimit; /** * Resource types */ typedef enum CUresourcetype_enum { CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */ CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ } CUresourcetype;
#ifdef _WIN32
#define CUDA_CB __stdcall
#else
#define CUDA_CB
#endif
/** * CUDA host function * \param userData Argument value passed to the function */ typedef void (CUDA_CB *CUhostFn)(void *userData); /** * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members. */ typedef enum CUaccessProperty_enum { CU_ACCESS_PROPERTY_NORMAL = 0, /**< Normal cache persistence. */ CU_ACCESS_PROPERTY_STREAMING = 1, /**< Streaming access is less likely to persist from cache. */ CU_ACCESS_PROPERTY_PERSISTING = 2 /**< Persisting access is more likely to persist in cache.*/ } CUaccessProperty; /** * Specifies an access policy for a window, a contiguous extent of memory * beginning at base_ptr and ending at base_ptr + num_bytes. * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. * Partition into many segments and assign segments such that: * sum of "hit segments" / window == approx. ratio. * sum of "miss segments" / window == approx 1-ratio. * Segments and ratio specifications are fitted to the capabilities of * the architecture. * Accesses in a hit segment apply the hitProp access policy. * Accesses in a miss segment apply the missProp access policy. */ typedef struct CUaccessPolicyWindow_st { void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ CUaccessProperty missProp; /**< ::CUaccessProperty set for miss.
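 * As a hedged illustration (not part of the original header; \p basePtr and \p windowBytes are hypothetical), a window is typically filled in as
 * \code
 *   CUaccessPolicyWindow window = {0};
 *   window.base_ptr  = basePtr;
 *   window.num_bytes = windowBytes;
 *   window.hitRatio  = 0.6f;
 *   window.hitProp   = CU_ACCESS_PROPERTY_PERSISTING;
 *   window.missProp  = CU_ACCESS_PROPERTY_STREAMING;
 * \endcode
 * and then handed to a stream or kernel-node attribute (see ::CUstreamAttrValue and ::CUkernelNodeAttrValue below).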
Must be either NORMAL or STREAMING */ } CUaccessPolicyWindow_v1; typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; /** * GPU kernel node parameters */ typedef struct CUDA_KERNEL_NODE_PARAMS_st { CUfunction func; /**< Kernel to launch */ unsigned int gridDimX; /**< Width of grid in blocks */ unsigned int gridDimY; /**< Height of grid in blocks */ unsigned int gridDimZ; /**< Depth of grid in blocks */ unsigned int blockDimX; /**< X dimension of each thread block */ unsigned int blockDimY; /**< Y dimension of each thread block */ unsigned int blockDimZ; /**< Z dimension of each thread block */ unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ void **kernelParams; /**< Array of pointers to kernel parameters */ void **extra; /**< Extra options */ } CUDA_KERNEL_NODE_PARAMS_v1; typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS; /** * Memset node parameters */ typedef struct CUDA_MEMSET_NODE_PARAMS_st { CUdeviceptr dst; /**< Destination device pointer */ size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ unsigned int value; /**< Value to be set */ unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ size_t width; /**< Width of the row in elements */ size_t height; /**< Number of rows */ } CUDA_MEMSET_NODE_PARAMS_v1; typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; /** * Host node parameters */ typedef struct CUDA_HOST_NODE_PARAMS_st { CUhostFn fn; /**< The function to call when the node executes */ void* userData; /**< Argument to pass to the function */ } CUDA_HOST_NODE_PARAMS_v1; typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; /** * Graph node types */ typedef enum CUgraphNodeType_enum { CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ CU_GRAPH_NODE_TYPE_WAIT_EVENT = 6, /**< External event wait node */ CU_GRAPH_NODE_TYPE_EVENT_RECORD = 7, /**< External event record node */ CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */ CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */ CU_GRAPH_NODE_TYPE_MEM_FREE = 11 /**< Memory Free Node */ } CUgraphNodeType; typedef enum CUsynchronizationPolicy_enum { CU_SYNC_POLICY_AUTO = 1, CU_SYNC_POLICY_SPIN = 2, CU_SYNC_POLICY_YIELD = 3, CU_SYNC_POLICY_BLOCKING_SYNC = 4 } CUsynchronizationPolicy; /** * Graph kernel node Attributes */ typedef enum CUkernelNodeAttrID_enum { CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */ CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). */ } CUkernelNodeAttrID; /** * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute */ typedef union CUkernelNodeAttrValue_union { CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). 
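 * Illustrative only (not part of the original header; \p node is a hypothetical kernel node in an existing graph):
 * \code
 *   CUkernelNodeAttrValue val;
 *   val.cooperative = 1;
 *   cuKernelNodeSetAttribute(node, CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE, &val);
 * \endcode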
*/ } CUkernelNodeAttrValue_v1; typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; /** * Possible stream capture statuses returned by ::cuStreamIsCapturing */ typedef enum CUstreamCaptureStatus_enum { CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that has been invalidated, but not terminated */ } CUstreamCaptureStatus; /** * Possible modes for stream capture thread interactions. For more details see * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode */ typedef enum CUstreamCaptureMode_enum { CU_STREAM_CAPTURE_MODE_GLOBAL = 0, CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, CU_STREAM_CAPTURE_MODE_RELAXED = 2 } CUstreamCaptureMode; /** * Stream Attributes */ typedef enum CUstreamAttrID_enum { CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */ CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< ::CUsynchronizationPolicy for work queued up in this stream */ } CUstreamAttrID; /** * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute */ typedef union CUstreamAttrValue_union { CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ CUsynchronizationPolicy syncPolicy; /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ } CUstreamAttrValue_v1; typedef CUstreamAttrValue_v1 CUstreamAttrValue; /** * Flags to specify search options. For more details see ::cuGetProcAddress */ typedef enum CUdriverProcAddress_flags_enum { CU_GET_PROC_ADDRESS_DEFAULT = 0, /**< Default search mode for driver symbols. */ CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0, /**< Search for legacy versions of driver symbols. */ CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1 /**< Search for per-thread versions of driver symbols. */ } CUdriverProcAddress_flags; /** * Execution Affinity Types */ typedef enum CUexecAffinityType_enum { CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0, /**< Create a context with limited SMs. */ CU_EXEC_AFFINITY_TYPE_MAX } CUexecAffinityType; /** * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ typedef struct CUexecAffinitySmCount_st { unsigned int val; /**< The number of SMs the context is limited to use. */ } CUexecAffinitySmCount_v1; typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; /** * Execution Affinity Parameters */ typedef struct CUexecAffinityParam_st { CUexecAffinityType type; union { CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ } param; } CUexecAffinityParam_v1; typedef CUexecAffinityParam_v1 CUexecAffinityParam; /** * Error codes */ typedef enum cudaError_enum { /** * The API call returned with no errors. In the case of query calls, this * also means that the operation being queried is complete (see * ::cuEventQuery() and ::cuStreamQuery()). */ CUDA_SUCCESS = 0, /** * This indicates that one or more of the parameters passed to the API call * is not within an acceptable range of values. */ CUDA_ERROR_INVALID_VALUE = 1, /** * The API call failed because it was unable to allocate enough memory to * perform the requested operation. */ CUDA_ERROR_OUT_OF_MEMORY = 2, /** * This indicates that the CUDA driver has not been initialized with * ::cuInit() or that initialization has failed. */ CUDA_ERROR_NOT_INITIALIZED = 3, /** * This indicates that the CUDA driver is in the process of shutting down. 
*/ CUDA_ERROR_DEINITIALIZED = 4, /** * This indicates profiler is not initialized for this run. This can * happen when the application is running with external profiling tools * like visual profiler. */ CUDA_ERROR_PROFILER_DISABLED = 5, /** * \deprecated * This error return is deprecated as of CUDA 5.0. It is no longer an error * to attempt to enable/disable the profiling via ::cuProfilerStart or * ::cuProfilerStop without initialization. */ CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, /** * \deprecated * This error return is deprecated as of CUDA 5.0. It is no longer an error * to call cuProfilerStart() when profiling is already enabled. */ CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, /** * \deprecated * This error return is deprecated as of CUDA 5.0. It is no longer an error * to call cuProfilerStop() when profiling is already disabled. */ CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, /** * This indicates that the CUDA driver that the application has loaded is a * stub library. Applications that run with the stub rather than a real * driver loaded will result in CUDA API returning this error. */ CUDA_ERROR_STUB_LIBRARY = 34, /** * This indicates that no CUDA-capable devices were detected by the installed * CUDA driver. */ CUDA_ERROR_NO_DEVICE = 100, /** * This indicates that the device ordinal supplied by the user does not * correspond to a valid CUDA device or that the action requested is * invalid for the specified device. */ CUDA_ERROR_INVALID_DEVICE = 101, /** * This error indicates that the Grid license is not applied. */ CUDA_ERROR_DEVICE_NOT_LICENSED = 102, /** * This indicates that the device kernel image is invalid. This can also * indicate an invalid CUDA module. */ CUDA_ERROR_INVALID_IMAGE = 200, /** * This most frequently indicates that there is no context bound to the * current thread. This can also be returned if the context passed to an * API call is not a valid handle (such as a context that has had * ::cuCtxDestroy() invoked on it). This can also be returned if a user * mixes different API versions (i.e. 3010 context with 3020 API calls). * See ::cuCtxGetApiVersion() for more details. */ CUDA_ERROR_INVALID_CONTEXT = 201, /** * This indicated that the context being supplied as a parameter to the * API call was already the active context. * \deprecated * This error return is deprecated as of CUDA 3.2. It is no longer an * error to attempt to push the active context via ::cuCtxPushCurrent(). */ CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, /** * This indicates that a map or register operation has failed. */ CUDA_ERROR_MAP_FAILED = 205, /** * This indicates that an unmap or unregister operation has failed. */ CUDA_ERROR_UNMAP_FAILED = 206, /** * This indicates that the specified array is currently mapped and thus * cannot be destroyed. */ CUDA_ERROR_ARRAY_IS_MAPPED = 207, /** * This indicates that the resource is already mapped. */ CUDA_ERROR_ALREADY_MAPPED = 208, /** * This indicates that there is no kernel image available that is suitable * for the device. This can occur when a user specifies code generation * options for a particular CUDA source file that do not include the * corresponding device configuration. */ CUDA_ERROR_NO_BINARY_FOR_GPU = 209, /** * This indicates that a resource has already been acquired. */ CUDA_ERROR_ALREADY_ACQUIRED = 210, /** * This indicates that a resource is not mapped. */ CUDA_ERROR_NOT_MAPPED = 211, /** * This indicates that a mapped resource is not available for access as an * array. 
*/ CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, /** * This indicates that a mapped resource is not available for access as a * pointer. */ CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, /** * This indicates that an uncorrectable ECC error was detected during * execution. */ CUDA_ERROR_ECC_UNCORRECTABLE = 214, /** * This indicates that the ::CUlimit passed to the API call is not * supported by the active device. */ CUDA_ERROR_UNSUPPORTED_LIMIT = 215, /** * This indicates that the ::CUcontext passed to the API call can * only be bound to a single CPU thread at a time but is already * bound to a CPU thread. */ CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, /** * This indicates that peer access is not supported across the given * devices. */ CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, /** * This indicates that a PTX JIT compilation failed. */ CUDA_ERROR_INVALID_PTX = 218, /** * This indicates an error with OpenGL or DirectX context. */ CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, /** * This indicates that an uncorrectable NVLink error was detected during the * execution. */ CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, /** * This indicates that the PTX JIT compiler library was not found. */ CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, /** * This indicates that the provided PTX was compiled with an unsupported toolchain. */ CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222, /** * This indicates that the PTX JIT compilation was disabled. */ CUDA_ERROR_JIT_COMPILATION_DISABLED = 223, /** * This indicates that the ::CUexecAffinityType passed to the API call is not * supported by the active device. */ CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, /** * This indicates that the device kernel source is invalid. This includes * compilation/linker errors encountered in device code or user error. */ CUDA_ERROR_INVALID_SOURCE = 300, /** * This indicates that the file specified was not found. */ CUDA_ERROR_FILE_NOT_FOUND = 301, /** * This indicates that a link to a shared object failed to resolve. */ CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, /** * This indicates that initialization of a shared object failed. */ CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, /** * This indicates that an OS call failed. */ CUDA_ERROR_OPERATING_SYSTEM = 304, /** * This indicates that a resource handle passed to the API call was not * valid. Resource handles are opaque types like ::CUstream and ::CUevent. */ CUDA_ERROR_INVALID_HANDLE = 400, /** * This indicates that a resource required by the API call is not in a * valid state to perform the requested operation. */ CUDA_ERROR_ILLEGAL_STATE = 401, /** * This indicates that a named symbol was not found. Examples of symbols * are global/constant variable names, driver function names, texture names, * and surface names. */ CUDA_ERROR_NOT_FOUND = 500, /** * This indicates that asynchronous operations issued previously have not * completed yet. This result is not actually an error, but must be indicated * differently than ::CUDA_SUCCESS (which indicates completion). Calls that * may return this value include ::cuEventQuery() and ::cuStreamQuery(). */ CUDA_ERROR_NOT_READY = 600, /** * While executing a kernel, the device encountered a * load or store instruction on an invalid memory address. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_ILLEGAL_ADDRESS = 700, /** * This indicates that a launch did not occur because it did not have * appropriate resources. 
This error usually indicates that the user has * attempted to pass too many arguments to the device kernel, or the * kernel launch specifies too many threads for the kernel's register * count. Passing arguments of the wrong size (i.e. a 64-bit pointer * when a 32-bit int is expected) is equivalent to passing too many * arguments and can also result in this error. */ CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, /** * This indicates that the device kernel took too long to execute. This can * only occur if timeouts are enabled - see the device attribute * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_LAUNCH_TIMEOUT = 702, /** * This error indicates a kernel launch that uses an incompatible texturing * mode. */ CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, /** * This error indicates that a call to ::cuCtxEnablePeerAccess() is * trying to re-enable peer access to a context which has already * had peer access to it enabled. */ CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, /** * This error indicates that ::cuCtxDisablePeerAccess() is * trying to disable peer access which has not been enabled yet * via ::cuCtxEnablePeerAccess(). */ CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, /** * This error indicates that the primary context for the specified device * has already been initialized. */ CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, /** * This error indicates that the context current to the calling thread * has been destroyed using ::cuCtxDestroy, or is a primary context which * has not yet been initialized. */ CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, /** * A device-side assert triggered during kernel execution. The context * cannot be used anymore, and must be destroyed. All existing device * memory allocations from this context are invalid and must be * reconstructed if the program is to continue using CUDA. */ CUDA_ERROR_ASSERT = 710, /** * This error indicates that the hardware resources required to enable * peer access have been exhausted for one or more of the devices * passed to ::cuCtxEnablePeerAccess(). */ CUDA_ERROR_TOO_MANY_PEERS = 711, /** * This error indicates that the memory range passed to ::cuMemHostRegister() * has already been registered. */ CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, /** * This error indicates that the pointer passed to ::cuMemHostUnregister() * does not correspond to any currently registered memory region. */ CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, /** * While executing a kernel, the device encountered a stack error. * This can be due to stack corruption or exceeding the stack size limit. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_HARDWARE_STACK_ERROR = 714, /** * While executing a kernel, the device encountered an illegal instruction. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, /** * While executing a kernel, the device encountered a load or store instruction * on a memory address which is not aligned. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. 
To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_MISALIGNED_ADDRESS = 716, /** * While executing a kernel, the device encountered an instruction * which can only operate on memory locations in certain address spaces * (global, shared, or local), but was supplied a memory address not * belonging to an allowed address space. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, /** * While executing a kernel, the device program counter wrapped its address space. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_INVALID_PC = 718, /** * An exception occurred on the device while executing a kernel. Common * causes include dereferencing an invalid device pointer and accessing * out of bounds shared memory. Less common cases can be system specific - more * information about these cases can be found in the system specific user guide. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ CUDA_ERROR_LAUNCH_FAILED = 719, /** * This error indicates that the number of blocks launched per grid for a kernel that was * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. */ CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, /** * This error indicates that the attempted operation is not permitted. */ CUDA_ERROR_NOT_PERMITTED = 800, /** * This error indicates that the attempted operation is not supported * on the current system or device. */ CUDA_ERROR_NOT_SUPPORTED = 801, /** * This error indicates that the system is not yet ready to start any CUDA * work. To continue using CUDA, verify the system configuration is in a * valid state and all required driver daemons are actively running. * More information about this error can be found in the system specific * user guide. */ CUDA_ERROR_SYSTEM_NOT_READY = 802, /** * This error indicates that there is a mismatch between the versions of * the display driver and the CUDA driver. Refer to the compatibility documentation * for supported versions. */ CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, /** * This error indicates that the system was upgraded to run with forward compatibility * but the visible hardware detected by CUDA does not support this configuration. * Refer to the compatibility documentation for the supported hardware matrix or ensure * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES * environment variable. */ CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, /** * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. */ CUDA_ERROR_MPS_CONNECTION_FAILED = 805, /** * This error indicates that the remote procedure call between the MPS server and the MPS client failed.
*/ CUDA_ERROR_MPS_RPC_FAILURE = 806, /** * This error indicates that the MPS server is not ready to accept new MPS client requests. * This error can be returned when the MPS server is in the process of recovering from a fatal failure. */ CUDA_ERROR_MPS_SERVER_NOT_READY = 807, /** * This error indicates that the hardware resources required to create MPS client have been exhausted. */ CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808, /** * This error indicates that the hardware resources required to support device connections have been exhausted. */ CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809, /** * This error indicates that the operation is not permitted when * the stream is capturing. */ CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, /** * This error indicates that the current capture sequence on the stream * has been invalidated due to a previous error. */ CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, /** * This error indicates that the operation would have resulted in a merge * of two independent capture sequences. */ CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, /** * This error indicates that the capture was not initiated in this stream. */ CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, /** * This error indicates that the capture sequence contains a fork that was * not joined to the primary stream. */ CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, /** * This error indicates that a dependency would have been created which * crosses the capture sequence boundary. Only implicit in-stream ordering * dependencies are allowed to cross the boundary. */ CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, /** * This error indicates a disallowed implicit dependency on a current capture * sequence from cudaStreamLegacy. */ CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, /** * This error indicates that the operation is not permitted on an event which * was last recorded in a capturing stream. */ CUDA_ERROR_CAPTURED_EVENT = 907, /** * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a * different thread. */ CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, /** * This error indicates that the timeout specified for the wait operation has lapsed. */ CUDA_ERROR_TIMEOUT = 909, /** * This error indicates that the graph update was not performed because it included * changes which violated constraints specific to instantiated graph update. */ CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, /** * This indicates that an async error has occurred in a device outside of CUDA. * If CUDA was waiting for an external device's signal before consuming shared data, * the external device signaled an error indicating that the data is not valid for * consumption. This leaves the process in an inconsistent state and any further CUDA * work will return the same error. To continue using CUDA, the process must be * terminated and relaunched. */ CUDA_ERROR_EXTERNAL_DEVICE = 911, /** * This indicates that an unknown internal error has occurred.
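 * Illustrative only (not part of the original header): callers commonly translate any ::CUresult, including this one, into a printable name before reporting it:
 * \code
 *   CUresult err = cuCtxSynchronize();
 *   if (err != CUDA_SUCCESS) {
 *     const char *name = NULL;
 *     cuGetErrorName(err, &name);
 *   }
 * \endcode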
*/ CUDA_ERROR_UNKNOWN = 999 } CUresult; /** * P2P Attributes */ typedef enum CUdevice_P2PAttribute_enum { CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enabled */ CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */ CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */ } CUdevice_P2PAttribute; /** * CUDA stream callback * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. * \param status ::CUDA_SUCCESS or any persistent error on the stream. * \param userData User parameter provided at registration. */ typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); /** * Block size to per-block dynamic shared memory mapping for a certain * kernel \param blockSize Block size of the kernel. * * \return The dynamic shared memory needed by a block. */ typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); /** * If set, host memory is portable between CUDA contexts. * Flag for ::cuMemHostAlloc() */
#define CU_MEMHOSTALLOC_PORTABLE 0x01
/** * If set, host memory is mapped into CUDA address space and * ::cuMemHostGetDevicePointer() may be called on the host pointer. * Flag for ::cuMemHostAlloc() */
#define CU_MEMHOSTALLOC_DEVICEMAP 0x02
/** * If set, host memory is allocated as write-combined - fast to write, * faster to DMA, slow to read except via SSE4 streaming load instruction * (MOVNTDQA). * Flag for ::cuMemHostAlloc() */
#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04
/** * If set, host memory is portable between CUDA contexts. * Flag for ::cuMemHostRegister() */
#define CU_MEMHOSTREGISTER_PORTABLE 0x01
/** * If set, host memory is mapped into CUDA address space and * ::cuMemHostGetDevicePointer() may be called on the host pointer. * Flag for ::cuMemHostRegister() */
#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02
/** * If set, the passed memory pointer is treated as pointing to some * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. * On Windows the flag is a no-op. * On Linux that memory is marked as non cache-coherent for the GPU and * is expected to be physically contiguous. It may return * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED * is returned. * Flag for ::cuMemHostRegister() */
#define CU_MEMHOSTREGISTER_IOMEMORY 0x04
/** * If set, the passed memory pointer is treated as pointing to memory that is * considered read-only by the device. On platforms without * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is * required in order to register memory mapped to the CPU as read-only. Support * for the use of this flag can be queried from the device attribute * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with * a current context associated with a device that does not have this attribute * set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
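 * A minimal sketch (not part of the original header; \p buf and \p bytes are hypothetical) of registering a read-only host buffer:
 * \code
 *   cuMemHostRegister(buf, bytes, CU_MEMHOSTREGISTER_READ_ONLY);
 * \endcode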
*/ #define CU_MEMHOSTREGISTER_READ_ONLY 0x08 /** * 2D memory copy parameters */ typedef struct CUDA_MEMCPY2D_st { size_t srcXInBytes; /**< Source X in bytes */ size_t srcY; /**< Source Y */ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ const void *srcHost; /**< Source host pointer */ CUdeviceptr srcDevice; /**< Source device pointer */ CUarray srcArray; /**< Source array reference */ size_t srcPitch; /**< Source pitch (ignored when src is array) */ size_t dstXInBytes; /**< Destination X in bytes */ size_t dstY; /**< Destination Y */ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ void *dstHost; /**< Destination host pointer */ CUdeviceptr dstDevice; /**< Destination device pointer */ CUarray dstArray; /**< Destination array reference */ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ size_t Height; /**< Height of 2D memory copy */ } CUDA_MEMCPY2D_v2; typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; /** * 3D memory copy parameters */ typedef struct CUDA_MEMCPY3D_st { size_t srcXInBytes; /**< Source X in bytes */ size_t srcY; /**< Source Y */ size_t srcZ; /**< Source Z */ size_t srcLOD; /**< Source LOD */ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ const void *srcHost; /**< Source host pointer */ CUdeviceptr srcDevice; /**< Source device pointer */ CUarray srcArray; /**< Source array reference */ void *reserved0; /**< Must be NULL */ size_t srcPitch; /**< Source pitch (ignored when src is array) */ size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ size_t dstXInBytes; /**< Destination X in bytes */ size_t dstY; /**< Destination Y */ size_t dstZ; /**< Destination Z */ size_t dstLOD; /**< Destination LOD */ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ void *dstHost; /**< Destination host pointer */ CUdeviceptr dstDevice; /**< Destination device pointer */ CUarray dstArray; /**< Destination array reference */ void *reserved1; /**< Must be NULL */ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ size_t Height; /**< Height of 3D memory copy */ size_t Depth; /**< Depth of 3D memory copy */ } CUDA_MEMCPY3D_v2; typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; /** * 3D memory cross-context copy parameters */ typedef struct CUDA_MEMCPY3D_PEER_st { size_t srcXInBytes; /**< Source X in bytes */ size_t srcY; /**< Source Y */ size_t srcZ; /**< Source Z */ size_t srcLOD; /**< Source LOD */ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ const void *srcHost; /**< Source host pointer */ CUdeviceptr srcDevice; /**< Source device pointer */ CUarray srcArray; /**< Source array reference */ CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ size_t srcPitch; /**< Source pitch (ignored when src is array) */ size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ size_t dstXInBytes; /**< Destination X in bytes */ size_t dstY; /**< Destination Y */ size_t dstZ; /**< Destination Z */ size_t dstLOD; /**< Destination LOD */ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ void *dstHost; /**< Destination host pointer */ CUdeviceptr dstDevice; /**< Destination device 
pointer */ CUarray dstArray; /**< Destination array reference */ CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ size_t Height; /**< Height of 3D memory copy */ size_t Depth; /**< Depth of 3D memory copy */ } CUDA_MEMCPY3D_PEER_v1; typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; /** * Array descriptor */ typedef struct CUDA_ARRAY_DESCRIPTOR_st { size_t Width; /**< Width of array */ size_t Height; /**< Height of array */ CUarray_format Format; /**< Array format */ unsigned int NumChannels; /**< Channels per array element */ } CUDA_ARRAY_DESCRIPTOR_v2; typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; /** * 3D array descriptor */ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st { size_t Width; /**< Width of 3D array */ size_t Height; /**< Height of 3D array */ size_t Depth; /**< Depth of 3D array */ CUarray_format Format; /**< Array format */ unsigned int NumChannels; /**< Channels per array element */ unsigned int Flags; /**< Flags */ } CUDA_ARRAY3D_DESCRIPTOR_v2; typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; /** * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers */ #define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1 /** * CUDA array sparse properties */ typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { struct { unsigned int width; /**< Width of sparse tile in elements */ unsigned int height; /**< Height of sparse tile in elements */ unsigned int depth; /**< Depth of sparse tile in elements */ } tileExtent; /** * First mip level at which the mip tail begins. */ unsigned int miptailFirstLevel; /** * Total size of the mip tail. 
*/ unsigned long long miptailSize; /** * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL */ unsigned int flags; unsigned int reserved[4]; } CUDA_ARRAY_SPARSE_PROPERTIES_v1; typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; /** * CUDA array memory requirements */ typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st { size_t size; /**< Total required memory size */ size_t alignment; /**< alignment requirement */ unsigned int reserved[4]; } CUDA_ARRAY_MEMORY_REQUIREMENTS_v1; typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS; /** * CUDA Resource descriptor */ typedef struct CUDA_RESOURCE_DESC_st { CUresourcetype resType; /**< Resource type */ union { struct { CUarray hArray; /**< CUDA array */ } array; struct { CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ } mipmap; struct { CUdeviceptr devPtr; /**< Device pointer */ CUarray_format format; /**< Array format */ unsigned int numChannels; /**< Channels per array element */ size_t sizeInBytes; /**< Size in bytes */ } linear; struct { CUdeviceptr devPtr; /**< Device pointer */ CUarray_format format; /**< Array format */ unsigned int numChannels; /**< Channels per array element */ size_t width; /**< Width of the array in elements */ size_t height; /**< Height of the array in elements */ size_t pitchInBytes; /**< Pitch between two rows in bytes */ } pitch2D; struct { int reserved[32]; } reserved; } res; unsigned int flags; /**< Flags (must be zero) */ } CUDA_RESOURCE_DESC_v1; typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; /** * Texture descriptor */ typedef struct CUDA_TEXTURE_DESC_st { CUaddress_mode addressMode[3]; /**< Address modes */ CUfilter_mode filterMode; /**< Filter mode */ unsigned int flags; /**< Flags */ unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ float mipmapLevelBias; /**< Mipmap level bias */ float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ float borderColor[4]; /**< Border Color */ int reserved[12]; } CUDA_TEXTURE_DESC_v1; typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; /** * Resource view format */ typedef enum CUresourceViewFormat_enum { CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ 
CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ } CUresourceViewFormat; /** * Resource view descriptor */ typedef struct CUDA_RESOURCE_VIEW_DESC_st { CUresourceViewFormat format; /**< Resource view format */ size_t width; /**< Width of the resource view */ size_t height; /**< Height of the resource view */ size_t depth; /**< Depth of the resource view */ unsigned int firstMipmapLevel; /**< First defined mipmap level */ unsigned int lastMipmapLevel; /**< Last defined mipmap level */ unsigned int firstLayer; /**< First layer index */ unsigned int lastLayer; /**< Last layer index */ unsigned int reserved[16]; } CUDA_RESOURCE_VIEW_DESC_v1; typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; /** * GPU Direct v3 tokens */ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { unsigned long long p2pToken; unsigned int vaSpaceToken; } CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1; typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; /** * Access flags that specify the level of access the current context's device has * on the memory referenced. */ typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum { CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE = 0x0, /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */ CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ = 0x1, /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. 
*/ CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3 /**< Read-write access, the device has full read-write access to the memory */ } CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS; /** * Kernel launch parameters */ typedef struct CUDA_LAUNCH_PARAMS_st { CUfunction function; /**< Kernel to launch */ unsigned int gridDimX; /**< Width of grid in blocks */ unsigned int gridDimY; /**< Height of grid in blocks */ unsigned int gridDimZ; /**< Depth of grid in blocks */ unsigned int blockDimX; /**< X dimension of each thread block */ unsigned int blockDimY; /**< Y dimension of each thread block */ unsigned int blockDimZ; /**< Z dimension of each thread block */ unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ CUstream hStream; /**< Stream identifier */ void **kernelParams; /**< Array of pointers to kernel parameters */ } CUDA_LAUNCH_PARAMS_v1; typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; /** * External memory handle types */ typedef enum CUexternalMemoryHandleType_enum { /** * Handle is an opaque file descriptor */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, /** * Handle is an opaque shared NT handle */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, /** * Handle is an opaque, globally shared handle */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, /** * Handle is a D3D12 heap object */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, /** * Handle is a D3D12 committed resource */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, /** * Handle is a shared NT handle to a D3D11 resource */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, /** * Handle is a globally shared handle to a D3D11 resource */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, /** * Handle is an NvSciBuf object */ CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 } CUexternalMemoryHandleType; /** * Indicates that the external memory object is a dedicated resource */ #define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 /** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS * contains this flag, it indicates that signaling an external semaphore object * should skip performing appropriate memory synchronization operations over all * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, * which otherwise are performed by default to ensure data coherency with other * importers of the same NvSciBuf memory objects. */ #define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 /** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS * contains this flag, it indicates that waiting on an external semaphore object * should skip performing appropriate memory synchronization operations over all * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, * which otherwise are performed by default to ensure data coherency with other * importers of the same NvSciBuf memory objects. */ #define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 /** * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, * it indicates that application needs signaler specific NvSciSyncAttr * to be filled by ::cuDeviceGetNvSciSyncAttributes. */ #define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 /** * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, * it indicates that application needs waiter specific NvSciSyncAttr * to be filled by ::cuDeviceGetNvSciSyncAttributes. 
*/ #define CUDA_NVSCISYNC_ATTR_WAIT 0x2 /** * External memory handle descriptor */ typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { /** * Type of the handle */ CUexternalMemoryHandleType type; union { /** * File descriptor referencing the memory object. Valid * when type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD */ int fd; /** * Win32 handle referencing the semaphore object. Valid when * type is one of the following: * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT * Exactly one of 'handle' and 'name' must be non-NULL. If * type is one of the following: * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT * then 'name' must be NULL. */ struct { /** * Valid NT handle. Must be NULL if 'name' is non-NULL */ void *handle; /** * Name of a valid memory object. * Must be NULL if 'handle' is non-NULL. */ const void *name; } win32; /** * A handle representing an NvSciBuf Object. Valid when type * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF */ const void *nvSciBufObject; } handle; /** * Size of the memory allocation */ unsigned long long size; /** * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED */ unsigned int flags; unsigned int reserved[16]; } CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; /** * External memory buffer descriptor */ typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { /** * Offset into the memory object where the buffer's base is */ unsigned long long offset; /** * Size of the buffer */ unsigned long long size; /** * Flags reserved for future use. Must be zero. */ unsigned int flags; unsigned int reserved[16]; } CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1; typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC; /** * External memory mipmap descriptor */ typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { /** * Offset into the memory object where the base level of the * mipmap chain is. 
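 *
 * Illustrative sketch (not part of the original header): importing a memory
 * object described by the ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC above and mapping
 * a buffer from it with ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC. The file
 * descriptor \c fd and the size \c totalSize are assumed to come from the
 * exporting API; error checking is omitted.
 * \code
 *   CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc = {0};
 *   memDesc.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
 *   memDesc.handle.fd = fd;         // assumed: fd exported by another API
 *   memDesc.size      = totalSize;  // assumed: size of the exported object
 *
 *   CUexternalMemory extMem;
 *   cuImportExternalMemory(&extMem, &memDesc);
 *
 *   CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc = {0};
 *   bufDesc.offset = 0;
 *   bufDesc.size   = totalSize;
 *
 *   CUdeviceptr dptr;
 *   cuExternalMemoryGetMappedBuffer(&dptr, extMem, &bufDesc);
 * \endcode
 *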
*/ unsigned long long offset; /** * Format, dimension and type of base level of the mipmap chain */ CUDA_ARRAY3D_DESCRIPTOR arrayDesc; /** * Total number of levels in the mipmap chain */ unsigned int numLevels; unsigned int reserved[16]; } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; /** * External semaphore handle types */ typedef enum CUexternalSemaphoreHandleType_enum { /** * Handle is an opaque file descriptor */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, /** * Handle is an opaque shared NT handle */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, /** * Handle is an opaque, globally shared handle */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, /** * Handle is a shared NT handle referencing a D3D12 fence object */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, /** * Handle is a shared NT handle referencing a D3D11 fence object */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, /** * Opaque handle to NvSciSync Object */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, /** * Handle is a shared NT handle referencing a D3D11 keyed mutex object */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, /** * Handle is a globally shared handle referencing a D3D11 keyed mutex object */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, /** * Handle is an opaque file descriptor referencing a timeline semaphore */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, /** * Handle is an opaque shared NT handle referencing a timeline semaphore */ CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 } CUexternalSemaphoreHandleType; /** * External semaphore handle descriptor */ typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { /** * Type of the handle */ CUexternalSemaphoreHandleType type; union { /** * File descriptor referencing the semaphore object. Valid * when type is one of the following: * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD */ int fd; /** * Win32 handle referencing the semaphore object. Valid when * type is one of the following: * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 * Exactly one of 'handle' and 'name' must be non-NULL. If * type is one of the following: * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT * then 'name' must be NULL. */ struct { /** * Valid NT handle. Must be NULL if 'name' is non-NULL */ void *handle; /** * Name of a valid synchronization primitive. * Must be NULL if 'handle' is non-NULL. */ const void *name; } win32; /** * Valid NvSciSyncObj. Must be non NULL */ const void* nvSciSyncObj; } handle; /** * Flags reserved for the future. Must be zero. 
*/ unsigned int flags; unsigned int reserved[16]; } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; /** * External semaphore signal parameters */ typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { struct { /** * Parameters for fence objects */ struct { /** * Value of fence to be signaled */ unsigned long long value; } fence; union { /** * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. */ void *fence; unsigned long long reserved; } nvSciSync; /** * Parameters for keyed mutex objects */ struct { /** * Value of key to release the mutex with */ unsigned long long key; } keyedMutex; unsigned int reserved[12]; } params; /** * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to * signal a ::CUexternalSemaphore of type * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates * that while signaling the ::CUexternalSemaphore, no memory synchronization * operations should be performed for any external memory object imported * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. * For all other types of ::CUexternalSemaphore, flags must be zero. */ unsigned int flags; unsigned int reserved[16]; } CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1; typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; /** * External semaphore wait parameters */ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { struct { /** * Parameters for fence objects */ struct { /** * Value of fence to be waited on */ unsigned long long value; } fence; /** * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. */ union { void *fence; unsigned long long reserved; } nvSciSync; /** * Parameters for keyed mutex objects */ struct { /** * Value of key to acquire the mutex with */ unsigned long long key; /** * Timeout in milliseconds to wait to acquire the mutex */ unsigned int timeoutMs; } keyedMutex; unsigned int reserved[10]; } params; /** * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC * which indicates that while waiting for the ::CUexternalSemaphore, no memory * synchronization operations should be performed for any external memory * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. * For all other types of ::CUexternalSemaphore, flags must be zero. */ unsigned int flags; unsigned int reserved[16]; } CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; /** * Semaphore signal node parameters */ typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ } CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; /** * Semaphore wait node parameters */ typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. 
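 *
 * Illustrative sketch (not part of the original header): signaling an imported
 * external semaphore with the ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS structure
 * defined above. The handle \c extSem is assumed to come from
 * ::cuImportExternalSemaphore and \c stream from ::cuStreamCreate.
 * \code
 *   CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sigParams = {0};
 *   sigParams.params.fence.value = 1;   // assumed fence payload for this example
 *
 *   // Signal one semaphore on the given stream; flags stay zero for
 *   // non-NvSciSync semaphore types.
 *   cuSignalExternalSemaphoresAsync(&extSem, &sigParams, 1, stream);
 * \endcode
 *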
*/ const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ } CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; typedef unsigned long long CUmemGenericAllocationHandle_v1; typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; /** * Flags for specifying particular handle types */ typedef enum CUmemAllocationHandleType_enum { CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF } CUmemAllocationHandleType; /** * Specifies the memory protection flags for mapping. */ typedef enum CUmemAccess_flags_enum { CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF } CUmemAccess_flags; /** * Specifies the type of location */ typedef enum CUmemLocationType_enum { CU_MEM_LOCATION_TYPE_INVALID = 0x0, CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF } CUmemLocationType; /** * Defines the allocation types available */ typedef enum CUmemAllocationType_enum { CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, /** This allocation type is 'pinned', i.e. cannot migrate from its current * location while the application is actively using it */ CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF } CUmemAllocationType; /** * Flag for requesting different optimal and required granularities for an allocation. */ typedef enum CUmemAllocationGranularity_flags_enum { CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ } CUmemAllocationGranularity_flags; /** * Sparse subresource types */ typedef enum CUarraySparseSubresourceType_enum { CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 } CUarraySparseSubresourceType; /** * Memory operation types */ typedef enum CUmemOperationType_enum { CU_MEM_OPERATION_TYPE_MAP = 1, CU_MEM_OPERATION_TYPE_UNMAP = 2 } CUmemOperationType; /** * Memory handle types */ typedef enum CUmemHandleType_enum { CU_MEM_HANDLE_TYPE_GENERIC = 0 } CUmemHandleType; /** * Specifies the CUDA array or CUDA mipmapped array memory mapping information */ typedef struct CUarrayMapInfo_st { CUresourcetype resourceType; /**< Resource type */ union { CUmipmappedArray mipmap; CUarray array; } resource; CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */ union { struct { unsigned int level; /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */ unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. 
Otherwise, must be zero */ unsigned int offsetX; /**< Starting X offset in elements */ unsigned int offsetY; /**< Starting Y offset in elements */ unsigned int offsetZ; /**< Starting Z offset in elements */ unsigned int extentWidth; /**< Width in elements */ unsigned int extentHeight; /**< Height in elements */ unsigned int extentDepth; /**< Depth in elements */ } sparseLevel; struct { unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ unsigned long long offset; /**< Offset within mip tail */ unsigned long long size; /**< Extent in bytes */ } miptail; } subresource; CUmemOperationType memOperationType; /**< Memory operation type */ CUmemHandleType memHandleType; /**< Memory handle type */ union { CUmemGenericAllocationHandle memHandle; } memHandle; unsigned long long offset; /**< Offset within the memory */ unsigned int deviceBitMask; /**< Device ordinal bit mask */ unsigned int flags; /**< flags for future use, must be zero now. */ unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */ } CUarrayMapInfo_v1; typedef CUarrayMapInfo_v1 CUarrayMapInfo; /** * Specifies a memory location. */ typedef struct CUmemLocation_st { CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ int id; /**< identifier whose meaning depends on this location's ::CUmemLocationType. */ } CUmemLocation_v1; typedef CUmemLocation_v1 CUmemLocation; /** * Specifies compression attribute for an allocation. */ typedef enum CUmemAllocationCompType_enum { CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ } CUmemAllocationCompType; /** * This flag if set indicates that the memory will be used as a tile pool. */ #define CU_MEM_CREATE_USAGE_TILE_POOL 0x1 /** * Specifies the allocation properties for an allocation. */ typedef struct CUmemAllocationProp_st { /** Allocation type */ CUmemAllocationType type; /** requested ::CUmemAllocationHandleType */ CUmemAllocationHandleType requestedHandleTypes; /** Location of allocation */ CUmemLocation location; /** * Windows-specific POBJECT_ATTRIBUTES required when * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes structure * includes security attributes that define * the scope of which exported allocations may be transferred to other * processes. In all other cases, this field is required to be zero. */ void *win32HandleMetaData; struct { /** * Allocation hint for requesting compressible memory. * On devices that support Compute Data Compression, compressible * memory can be used to accelerate accesses to data with unstructured * sparsity and other compressible data patterns. Applications are * expected to query the allocation property of the handle obtained with * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to * validate if the obtained allocation is compressible or not. Note that * compressed memory may not be mappable on all devices. 
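 *
 * Illustrative sketch (not part of the original header): a minimal virtual
 * memory management sequence built on the ::CUmemAllocationProp structure
 * defined here, using ::cuMemGetAllocationGranularity, ::cuMemCreate,
 * ::cuMemAddressReserve, ::cuMemMap and ::cuMemSetAccess (::CUmemAccessDesc is
 * declared further below). \c bytes is the requested size; error checking is
 * omitted.
 * \code
 *   CUmemAllocationProp prop = {0};
 *   prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
 *   prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
 *   prop.location.id   = 0;                        // device ordinal 0 (assumed)
 *
 *   size_t gran;
 *   cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
 *   size_t size = ((bytes + gran - 1) / gran) * gran;  // round up to granularity
 *
 *   CUmemGenericAllocationHandle handle;
 *   cuMemCreate(&handle, size, &prop, 0);
 *
 *   CUdeviceptr ptr;
 *   cuMemAddressReserve(&ptr, size, 0, 0, 0);
 *   cuMemMap(ptr, size, 0, handle, 0);
 *
 *   CUmemAccessDesc access = {0};
 *   access.location = prop.location;
 *   access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
 *   cuMemSetAccess(ptr, size, &access, 1);
 * \endcode
 *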
*/ unsigned char compressionType; unsigned char gpuDirectRDMACapable; /** Bitmask indicating intended usage for this allocation */ unsigned short usage; unsigned char reserved[4]; } allocFlags; } CUmemAllocationProp_v1; typedef CUmemAllocationProp_v1 CUmemAllocationProp; /** * Memory access descriptor */ typedef struct CUmemAccessDesc_st { CUmemLocation location; /**< Location on which the request is to change its accessibility */ CUmemAccess_flags flags; /**< ::CUmemAccess_flags accessibility flags to set on the request */ } CUmemAccessDesc_v1; typedef CUmemAccessDesc_v1 CUmemAccessDesc; typedef enum CUgraphExecUpdateResult_enum { CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */ CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */ CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */ } CUgraphExecUpdateResult; /** * CUDA memory pool attributes */ typedef enum CUmemPool_attribute_enum { /** * (value type = int) * Allow cuMemAllocAsync to use memory asynchronously freed * in other streams as long as a stream ordering dependency * of the allocating stream on the free action exists. * CUDA events and null stream interactions can create the required * stream ordered dependencies. (default enabled) */ CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1, /** * (value type = int) * Allow reuse of already completed frees when there is no dependency * between the free and allocation. (default enabled) */ CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, /** * (value type = int) * Allow cuMemAllocAsync to insert new stream dependencies * in order to establish the stream ordering required to reuse * a piece of memory released by ::cuMemFreeAsync (default enabled). */ CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, /** * (value type = cuuint64_t) * Amount of reserved memory in bytes to hold onto before trying * to release memory back to the OS. When more than the release * threshold bytes of memory are held by the memory pool, the * allocator will try to release memory back to the OS on the * next call to stream, event or context synchronize. (default 0) */ CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, /** * (value type = cuuint64_t) * Amount of backing memory currently allocated for the mempool. */ CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, /** * (value type = cuuint64_t) * High watermark of backing memory allocated for the mempool since the * last time it was reset. High watermark can only be reset to zero. */ CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, /** * (value type = cuuint64_t) * Amount of memory from the pool that is currently in use by the application. 
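 *
 * Illustrative sketch (not part of the original header): raising the release
 * threshold of a device's default memory pool so that memory freed with
 * ::cuMemFreeAsync is retained for reuse by later ::cuMemAllocAsync calls.
 * \c dev, \c bytes and \c stream are assumed; error checking is omitted.
 * \code
 *   CUmemoryPool pool;
 *   cuDeviceGetDefaultMemPool(&pool, dev);
 *
 *   cuuint64_t threshold = (cuuint64_t)-1;   // keep everything cached in the pool
 *   cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
 *
 *   CUdeviceptr dptr;
 *   cuMemAllocAsync(&dptr, bytes, stream);
 *   cuMemFreeAsync(dptr, stream);            // memory stays in the pool for reuse
 * \endcode
 *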
*/ CU_MEMPOOL_ATTR_USED_MEM_CURRENT, /** * (value type = cuuint64_t) * High watermark of the amount of memory from the pool that was in use by the application since * the last time it was reset. High watermark can only be reset to zero. */ CU_MEMPOOL_ATTR_USED_MEM_HIGH } CUmemPool_attribute; /** * Specifies the properties of allocations made from the pool. */ typedef struct CUmemPoolProps_st { CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */ CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ CUmemLocation location; /**< Location where allocations should reside. */ /** * Windows-specific LPSECURITYATTRIBUTES required when * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines * the scope of which exported allocations may be tranferred to other * processes. In all other cases, this field is required to be zero. */ void *win32SecurityAttributes; unsigned char reserved[64]; /**< reserved for future use, must be 0 */ } CUmemPoolProps_v1; typedef CUmemPoolProps_v1 CUmemPoolProps; /** * Opaque data for exporting a pool allocation */ typedef struct CUmemPoolPtrExportData_st { unsigned char reserved[64]; } CUmemPoolPtrExportData_v1; typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData; /** * Memory allocation node parameters */ typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st { /** * in: location where the allocation should reside (specified in ::location). * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported. */ CUmemPoolProps poolProps; const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. */ size_t bytesize; /**< in: size in bytes of the requested allocation */ CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ } CUDA_MEM_ALLOC_NODE_PARAMS; typedef enum CUgraphMem_attribute_enum { /** * (value type = cuuint64_t) * Amount of memory, in bytes, currently associated with graphs */ CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, /** * (value type = cuuint64_t) * High watermark of memory, in bytes, associated with graphs since the * last time it was reset. High watermark can only be reset to zero. */ CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, /** * (value type = cuuint64_t) * Amount of memory, in bytes, currently allocated for use by * the CUDA graphs asynchronous allocator. */ CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, /** * (value type = cuuint64_t) * High watermark of memory, in bytes, currently allocated for use by * the CUDA graphs asynchronous allocator. */ CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH } CUgraphMem_attribute; /** * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only * waits for prior work in the stream corresponding to that GPU to complete before the * kernel begins execution. */ #define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 /** * If set, any subsequent work pushed in a stream that participated in a call to * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on * the GPU corresponding to that stream to complete before it begins execution. 
*/ #define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 /** * If set, the CUDA array is a collection of layers, where each layer is either a 1D * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number * of layers, not the depth of a 3D array. */ #define CUDA_ARRAY3D_LAYERED 0x01 /** * Deprecated, use CUDA_ARRAY3D_LAYERED */ #define CUDA_ARRAY3D_2DARRAY 0x01 /** * This flag must be set in order to bind a surface reference * to the CUDA array */ #define CUDA_ARRAY3D_SURFACE_LDST 0x02 /** * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The * width of such a CUDA array must be equal to its height, and Depth must be six. * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps * and Depth must be a multiple of six. */ #define CUDA_ARRAY3D_CUBEMAP 0x04 /** * This flag must be set in order to perform texture gather operations * on a CUDA array. */ #define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 /** * This flag if set indicates that the CUDA * array is a DEPTH_TEXTURE. */ #define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 /** * This flag indicates that the CUDA array may be bound as a color target * in an external graphics API */ #define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 /** * This flag if set indicates that the CUDA array or CUDA mipmapped array * is a sparse CUDA array or CUDA mipmapped array respectively */ #define CUDA_ARRAY3D_SPARSE 0x40 /** * This flag if set indicates that the CUDA array or CUDA mipmapped array * will allow deferred memory mapping */ #define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80 /** * Override the texref format with a format inferred from the array. * Flag for ::cuTexRefSetArray() */ #define CU_TRSA_OVERRIDE_FORMAT 0x01 /** * Read the texture as integers rather than promoting the values to floats * in the range [0,1]. * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() */ #define CU_TRSF_READ_AS_INTEGER 0x01 /** * Use normalized texture coordinates in the range [0,1) instead of [0,dim). * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() */ #define CU_TRSF_NORMALIZED_COORDINATES 0x02 /** * Perform sRGB->linear conversion during texture read. * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() */ #define CU_TRSF_SRGB 0x10 /** * Disable any trilinear filtering optimizations. * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() */ #define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 /** * Enable seamless cube map filtering. * Flag for ::cuTexObjectCreate() */ #define CU_TRSF_SEAMLESS_CUBEMAP 0x40 /** * End of array terminator for the \p extra parameter to * ::cuLaunchKernel */ #define CU_LAUNCH_PARAM_END ((void*)0x00) /** * Indicator that the next value in the \p extra parameter to * ::cuLaunchKernel will be a pointer to a buffer containing all kernel * parameters used for launching kernel \p f. This buffer needs to * honor all alignment/padding requirements of the individual parameters. * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no * effect. */ #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) /** * Indicator that the next value in the \p extra parameter to * ::cuLaunchKernel will be a pointer to a size_t which contains the * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified * in the \p extra array if the value associated with * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. 
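 *
 * Illustrative sketch (not part of the original header): passing packed kernel
 * arguments through the \p extra parameter of ::cuLaunchKernel instead of
 * \p kernelParams. \c argBuffer and \c argBufferSize are assumed to describe a
 * buffer laid out with the kernel's alignment and padding rules; \c f,
 * \c stream and the grid/block dimensions are assumed as well.
 * \code
 *   void *config[] = {
 *       CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
 *       CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
 *       CU_LAUNCH_PARAM_END
 *   };
 *   cuLaunchKernel(f, gridX, gridY, gridZ, blockX, blockY, blockZ,
 *                  0, stream,       // no dynamic shared memory
 *                  NULL, config);   // kernelParams unused when extra is given
 * \endcode
 *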
*/ #define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) /** * For texture references loaded into the module, use default texunit from * texture reference. */ #define CU_PARAM_TR_DEFAULT -1 /** * Device that represents the CPU */ #define CU_DEVICE_CPU ((CUdevice)-1) /** * Device that represents an invalid device */ #define CU_DEVICE_INVALID ((CUdevice)-2) /** * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS */ typedef enum CUflushGPUDirectRDMAWritesOptions_enum { CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */ CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ } CUflushGPUDirectRDMAWritesOptions; /** * Platform native ordering for GPUDirect RDMA writes */ typedef enum CUGPUDirectRDMAWritesOrdering_enum { CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */ CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. */ } CUGPUDirectRDMAWritesOrdering; /** * The scopes for ::cuFlushGPUDirectRDMAWrites */ typedef enum CUflushGPUDirectRDMAWritesScope_enum { CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ } CUflushGPUDirectRDMAWritesScope; /** * The targets for ::cuFlushGPUDirectRDMAWrites */ typedef enum CUflushGPUDirectRDMAWritesTarget_enum { CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. 
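 *
 * Illustrative sketch (not part of the original header): flushing outstanding
 * GPUDirect RDMA writes for the current context, assuming the device reports
 * ::CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST support.
 * \code
 *   cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
 *                              CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER);
 * \endcode
 *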
*/ } CUflushGPUDirectRDMAWritesTarget; /** * The additional write options for ::cuGraphDebugDotPrint */ typedef enum CUgraphDebugDot_flags_enum { CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /** Output all debug data as if every debug flag is enabled */ CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /** Use CUDA Runtime structures for output */ CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /** Adds CUDA_KERNEL_NODE_PARAMS values to output */ CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /** Adds CUDA_MEMCPY3D values to output */ CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /** Adds CUDA_MEMSET_NODE_PARAMS values to output */ CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /** Adds CUDA_HOST_NODE_PARAMS values to output */ CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /** Adds CUevent handle from record and wait nodes to output */ CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /** Adds CUkernelNodeAttrValue values to output */ CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /** Adds node handles and every kernel function handle to output */ CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /** Adds memory alloc node parameters to output */ CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12 /** Adds memory free node parameters to output */ } CUgraphDebugDot_flags; /** * Flags for user objects for graphs */ typedef enum CUuserObject_flags_enum { CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ } CUuserObject_flags; /** * Flags for retaining user object references for graphs */ typedef enum CUuserObjectRetain_flags_enum { CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ } CUuserObjectRetain_flags; /** * Flags for instantiating a graph */ typedef enum CUgraphInstantiate_flags_enum { CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. */ } CUgraphInstantiate_flags; /** @} */ /* END CUDA_TYPES */ #if defined(__GNUC__) #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) #pragma GCC visibility push(default) #endif #endif #ifdef _WIN32 #define CUDAAPI __stdcall #else #define CUDAAPI #endif /** * \defgroup CUDA_ERROR Error Handling * * ___MANBRIEF___ error handling functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the error handling functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Gets the string description of an error code * * Sets \p *pStr to the address of a NULL-terminated string description * of the error code \p error. * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE * will be returned and \p *pStr will be set to the NULL address. * * \param error - Error code to convert to string * \param pStr - Address of the string pointer. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::CUresult, * ::cudaGetErrorString */ CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); /** * \brief Gets the string representation of an error code enum name * * Sets \p *pStr to the address of a NULL-terminated string representation * of the name of the enum error code \p error. 
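 *
 * Illustrative sketch (not part of the original header): a simple error-check
 * helper built on ::cuGetErrorName and ::cuGetErrorString (requires <stdio.h>).
 * \code
 *   static void checkDriver(CUresult err, const char *what) {
 *       if (err != CUDA_SUCCESS) {
 *           const char *name = NULL, *desc = NULL;
 *           cuGetErrorName(err, &name);
 *           cuGetErrorString(err, &desc);
 *           fprintf(stderr, "%s failed: %s (%s)\n", what,
 *                   name ? name : "?", desc ? desc : "?");
 *       }
 *   }
 * \endcode
 *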
* If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE * will be returned and \p *pStr will be set to the NULL address. * * \param error - Error code to convert to string * \param pStr - Address of the string pointer. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::CUresult, * ::cudaGetErrorName */ CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); /** @} */ /* END CUDA_ERROR */ /** * \defgroup CUDA_INITIALIZE Initialization * * ___MANBRIEF___ initialization functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the initialization functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Initialize the CUDA driver API * * Initializes the driver API and must be called before any other function from * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() * has not been called, any function from the driver API will return * ::CUDA_ERROR_NOT_INITIALIZED. * * \param Flags - Initialization flag for CUDA. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE * \notefnerr */ CUresult CUDAAPI cuInit(unsigned int Flags); /** @} */ /* END CUDA_INITIALIZE */ /** * \defgroup CUDA_VERSION Version Management * * ___MANBRIEF___ version management functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the version management functions of the low-level * CUDA driver application programming interface. * * @{ */ /** * \brief Returns the latest CUDA version supported by driver * * Returns in \p *driverVersion the version of CUDA supported by * the driver. The version is returned as * (1000 × major + 10 × minor). For example, CUDA 9.2 * would be represented by 9020. * * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if * \p driverVersion is NULL. * * \param driverVersion - Returns the CUDA driver version * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::cudaDriverGetVersion, * ::cudaRuntimeGetVersion */ CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); /** @} */ /* END CUDA_VERSION */ /** * \defgroup CUDA_DEVICE Device Management * * ___MANBRIEF___ device management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the device management functions of the low-level * CUDA driver application programming interface. * * @{ */ /** * \brief Returns a handle to a compute device * * Returns in \p *device a device handle given an ordinal in the range [0, * ::cuDeviceGetCount()-1]. * * \param device - Returned device handle * \param ordinal - Device number to get handle for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGetLuid, * ::cuDeviceTotalMem, * ::cuDeviceGetExecAffinitySupport */ CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); /** * \brief Returns the number of compute-capable devices * * Returns in \p *count the number of devices with compute capability greater * than or equal to 2.0 that are available for execution. If there is no such * device, ::cuDeviceGetCount() returns 0. 
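 *
 * Illustrative sketch (not part of the original header): the usual start-up
 * sequence for the driver API, combining ::cuInit, ::cuDriverGetVersion,
 * ::cuDeviceGetCount and ::cuDeviceGet. Error checking is omitted.
 * \code
 *   cuInit(0);                      // Flags must currently be 0
 *
 *   int version = 0, count = 0;
 *   cuDriverGetVersion(&version);   // e.g. 11020 for CUDA 11.2
 *   cuDeviceGetCount(&count);
 *
 *   CUdevice dev;
 *   if (count > 0)
 *       cuDeviceGet(&dev, 0);       // first device ordinal
 * \endcode
 *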
* * \param count - Returned number of compute-capable devices * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGetLuid, * ::cuDeviceGet, * ::cuDeviceTotalMem, * ::cuDeviceGetExecAffinitySupport, * ::cudaGetDeviceCount */ CUresult CUDAAPI cuDeviceGetCount(int *count); /** * \brief Returns an identifier string for the device * * Returns an ASCII string identifying the device \p dev in the NULL-terminated * string pointed to by \p name. \p len specifies the maximum length of the * string that may be returned. * * \param name - Returned identifier string for the device * \param len - Maximum length of string to store in \p name * \param dev - Device to get identifier string for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetUuid, * ::cuDeviceGetLuid, * ::cuDeviceGetCount, * ::cuDeviceGet, * ::cuDeviceTotalMem, * ::cuDeviceGetExecAffinitySupport, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); /** * \brief Return a UUID for the device * * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will * supplant this version in CUDA 12.0; this version is retained for minor version compatibility. * * Returns a 16-octet UUID identifying the device \p dev in the structure * pointed to by \p uuid. * * \param uuid - Returned UUID * \param dev - Device to get identifier string for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetUuid_v2, * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetLuid, * ::cuDeviceGet, * ::cuDeviceTotalMem, * ::cuDeviceGetExecAffinitySupport, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); /** * \brief Return a UUID for the device (11.4+) * * Returns a 16-octet UUID identifying the device \p dev in the structure * pointed to by \p uuid. If the device is in MIG mode, returns its * MIG UUID which uniquely identifies the subscribed MIG compute instance. * * \param uuid - Returned UUID * \param dev - Device to get identifier string for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetLuid, * ::cuDeviceGet, * ::cuDeviceTotalMem, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); /** * \brief Return an LUID and device node mask for the device * * Return identifying information (\p luid and \p deviceNodeMask) to allow * matching the device with graphics APIs. 
* * \param luid - Returned LUID * \param deviceNodeMask - Returned device node mask * \param dev - Device to get identifier string for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGet, * ::cuDeviceTotalMem, * ::cuDeviceGetExecAffinitySupport, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); /** * \brief Returns the total amount of memory on the device * * Returns in \p *bytes the total amount of memory available on the device * \p dev in bytes. * * \param bytes - Returned memory available on device in bytes * \param dev - Device handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceGetExecAffinitySupport, * ::cudaMemGetInfo */ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); /** * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. * * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture * for given \p format and \p numChannels. * * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. * \param format - Texture format. * \param numChannels - Number of channels per texture element. * \param dev - Device handle. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cudaMemGetInfo, * ::cuDeviceTotalMem */ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); /** * \brief Returns information about the device * * Returns in \p *pi the integer value of the attribute \p attrib on device * \p dev. 
The supported attributes are: * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per * block; * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of * shared memory available to a thread block in bytes * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for * __constant__ variables in a CUDA C kernel in bytes * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the * memory copy functions that involve memory regions allocated through * ::cuMemAllocPitch() * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D * texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width * for a 1D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum * mipmapped 1D texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D * texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D * texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width * for a 2D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height * for a 2D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch * in bytes for a 2D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum * mipmapped 2D texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum * mipmapped 2D texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D * texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D * texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D * texture depth * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: * Alternate maximum 3D texture width, 0 if no alternate * maximum 3D texture size is supported * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: * Alternate maximum 3D texture height, 0 if no alternate * maximum 3D texture size is supported * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: * Alternate maximum 3D texture depth, 0 if no alternate * maximum 3D texture size is supported * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: * Maximum cubemap texture width or height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: * Maximum 1D layered texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: * Maximum layers in a 1D layered texture * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: * Maximum 2D layered texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: * Maximum 2D layered texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: * Maximum layers in a 2D layered texture * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: * Maximum cubemap layered texture width or height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: * Maximum layers in a cubemap layered texture 
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: * Maximum 1D surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: * Maximum 2D surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: * Maximum 2D surface height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: * Maximum 3D surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: * Maximum 3D surface height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: * Maximum 3D surface depth * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: * Maximum 1D layered surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: * Maximum layers in a 1D layered surface * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: * Maximum 2D layered surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: * Maximum 2D layered surface height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: * Maximum layers in a 2D layered surface * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: * Maximum cubemap surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: * Maximum cubemap layered surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: * Maximum layers in a cubemap layered surface * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit * registers available to a thread block * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture * base addresses aligned to ::textureAlign bytes do not need an offset * applied to texture fetches * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement * for 2D texture references bound to pitched memory * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy * memory between host and device while executing a kernel, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on * the device * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit * for kernels executed on the device, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the * memory subsystem, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host * memory into the CUDA address space, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently * in. Available modes are as follows: * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and * can have multiple CUDA contexts present at a single time. * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is * prohibited from creating new CUDA contexts. * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device * can have only one context used by a single process at a time. * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports * executing multiple kernels within the same context simultaneously, or 0 if * not. It is not guaranteed that multiple kernels will be resident * on the device concurrently so this feature should not be relied upon for * correctness. 
* - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the * device, 0 if error correction is disabled or not supported by the device * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier * of the device * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC * is only available on Tesla hardware running Windows Vista or later * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with * the host, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals * in L1 cache, 0 if caching globals in L1 cache is not supported by the device * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals * in L1 cache, 0 if caching locals in L1 cache is not supported by the device * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of * shared memory available to a multiprocessor in bytes; this amount is shared * by all thread blocks simultaneously resident on a multiprocessor * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit * registers available to a multiprocessor; this number is shared by all thread * blocks simultaneously resident on a multiprocessor * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory * on this system, 0 if allocating managed memory is not supported by the device on this system. * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices * associated with the same board. Devices on the same multi-GPU board will share the same identifier. * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if the link between the device and the host * supports native atomic operations. * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance * (in floating-point operations per second) to double precision performance. * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing * pageable memory without calling cudaHostRegister on it. * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory * concurrently with the CPU. * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered * memory at the same virtual address as the CPU. * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per-block shared memory size * supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. 
* For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's * page tables. * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate. * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays. 
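 *
 * Illustrative sketch (not part of the original header): querying a few of the
 * attributes listed above for a device handle \c dev. Error checking is omitted.
 * \code
 *   int major = 0, minor = 0, smCount = 0;
 *   cuDeviceGetAttribute(&major,   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
 *   cuDeviceGetAttribute(&minor,   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
 *   cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,     dev);
 *   // e.g. major == 8, minor == 0, smCount == 108 on an A100 (illustrative values)
 * \endcode
 *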
* * \param pi - Returned device attribute value * \param attrib - Device attribute to query * \param dev - Device handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem, * ::cuDeviceGetExecAffinitySupport, * ::cudaDeviceGetAttribute, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); /** * \brief Return NvSciSync attributes that this device can support. * * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList * can be used to create an NvSciSync object that matches this device's capabilities. * * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is * already set this API will return ::CUDA_ERROR_INVALID_VALUE. * * The applications should set \p nvSciSyncAttrList to a valid * NvSciSyncAttrList failing which this API will return * ::CUDA_ERROR_INVALID_HANDLE. * * The \p flags controls how applications intends to use * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to * signal an NvSciSync on this CUDA device. * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to * wait on an NvSciSync on this CUDA device. * * At least one of these flags must be set, failing which the API * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal * to one another: a developer may set both these flags that allows to * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. * * \param nvSciSyncAttrList - Return NvSciSync attributes supported. * \param dev - Valid Cuda Device to get NvSciSync attributes for. * \param flags - flags describing NvSciSync usage. * * \return * * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa * ::cuImportExternalSemaphore, * ::cuDestroyExternalSemaphore, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); /** * \brief Sets the current memory pool of a device * * The memory pool must be local to the specified device. * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. * By default, a device's current memory pool is its default memory pool. * * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different * than the one the stream runs on. * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync */ CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); /** * \brief Gets the current mempool for a device * * Returns the last pool provided to ::cuDeviceSetMemPool for this device * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. * By default the current mempool is the default mempool for a device. * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. 
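 *
 * A minimal sketch of the set/get pairing described above (illustrative only, not upstream
 * text; \c dev and \c pool are assumed to be a valid ::CUdevice and a ::CUmemoryPool created
 * on that device with ::cuMemPoolCreate, and error checking is omitted):
 * \code
    CUmemoryPool current = NULL;
    cuDeviceSetMemPool(dev, pool);      // make 'pool' the device's current mempool
    cuDeviceGetMemPool(&current, dev);  // 'current' now refers to 'pool'
 * \endcode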
* * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool */ CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); /** * \brief Returns the default mempool of a device * * The default mempool of a device contains device memory from that device. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate */ CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); /** * \brief Blocks until remote writes are visible to the specified scope * * Blocks until GPUDirect RDMA writes to the target context via mappings * created through APIs like nvidia_p2p_get_pages (see * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are * visible to the specified scope. * * If the scope equals or lies within the scope indicated by * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call * will be a no-op and can be safely omitted for performance. This can be * determined by comparing the numerical values between the two enums, with * smaller scopes having smaller values. * * Users may query support for this API via * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS. * * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * */ CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); /** @} */ /* END CUDA_DEVICE */ /** * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] * * ___MANBRIEF___ deprecated device management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the device management functions of the low-level * CUDA driver application programming interface. * * @{ */ /** * \brief Returns properties for a selected device * * \deprecated * * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). * * Returns in \p *prop the properties of device \p dev. 
The ::CUdevprop * structure is defined as: * * \code typedef struct CUdevprop_st { int maxThreadsPerBlock; int maxThreadsDim[3]; int maxGridSize[3]; int sharedMemPerBlock; int totalConstantMemory; int SIMDWidth; int memPitch; int regsPerBlock; int clockRate; int textureAlign } CUdevprop; * \endcode * where: * * - ::maxThreadsPerBlock is the maximum number of threads per block; * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; * - ::sharedMemPerBlock is the total amount of shared memory available per * block in bytes; * - ::totalConstantMemory is the total amount of constant memory available on * the device in bytes; * - ::SIMDWidth is the warp size; * - ::memPitch is the maximum pitch allowed by the memory copy functions that * involve memory regions allocated through ::cuMemAllocPitch(); * - ::regsPerBlock is the total number of registers available per block; * - ::clockRate is the clock frequency in kilohertz; * - ::textureAlign is the alignment requirement; texture base addresses that * are aligned to ::textureAlign bytes do not need an offset applied to * texture fetches. * * \param prop - Returned properties of device * \param dev - Device to get properties for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem */ __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); /** * \brief Returns the compute capability of the device * * \deprecated * * This function was deprecated as of CUDA 5.0 and its functionality superceded * by ::cuDeviceGetAttribute(). * * Returns in \p *major and \p *minor the major and minor revision numbers that * define the compute capability of the device \p dev. * * \param major - Major revision number * \param minor - Minor revision number * \param dev - Device handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem */ __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); /** @} */ /* END CUDA_DEVICE_DEPRECATED */ /** * \defgroup CUDA_PRIMARY_CTX Primary Context Management * * ___MANBRIEF___ primary context management functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the primary context management functions of the low-level * CUDA driver application programming interface. * * The primary context is unique per device and shared with the CUDA runtime API. * These functions allow integration with other libraries using CUDA. * * @{ */ /** * \brief Retain the primary context on the GPU * * Retains the primary context on the device. * Once the user successfully retains the primary context, the primary context * will be active and available to the user until the user releases it * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. 
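 *
 * A hedged usage sketch (added for illustration, not part of the upstream header): a common
 * pattern is to retain the primary context, bind it with ::cuCtxSetCurrent, and release it
 * when done. \c dev is assumed to be a valid ::CUdevice and error checking is omitted:
 * \code
    CUcontext primary = NULL;
    cuDevicePrimaryCtxRetain(&primary, dev);
    cuCtxSetCurrent(primary);        // make it current; retain does not push it on the stack
    // ... issue driver API calls in this context ...
    cuDevicePrimaryCtxRelease(dev);
 * \endcode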
* * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to * determine the compute mode of the device. * The nvidia-smi tool can be used to set the compute mode for * devices. Documentation for nvidia-smi can be obtained by passing a * -h option to it. * * Please note that the primary context always supports pinned allocations. Other * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). * * \param pctx - Returned context handle of the new context * \param dev - Device for which primary context is requested * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuDevicePrimaryCtxRelease, * ::cuDevicePrimaryCtxSetFlags, * ::cuCtxCreate, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); /** * \brief Release the primary context on the GPU * * Releases the primary context interop on the device. * A retained context should always be released once the user is done using * it. The context is automatically reset once the last reference to it is * released. This behavior is different when the primary context was retained * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary * context remains always active. * * Releasing a primary context that has not been previously retained will * fail with ::CUDA_ERROR_INVALID_CONTEXT. * * Please note that unlike ::cuCtxDestroy() this method does not pop the context * from stack in any circumstances. * * \param dev - Device which primary context is released * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuDevicePrimaryCtxRetain, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); /** * \brief Set flags for the primary context * * Sets the flags for the primary context on the device overwriting perviously * set ones. * * The three LSBs of the \p flags parameter can be used to control how the OS * thread, which owns the CUDA context at the time of an API call, interacts * with the OS scheduler when waiting for results from the GPU. Only one of * the scheduling flags can be set when creating a context. * * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for * results from the GPU. This can decrease latency when waiting for the GPU, * but may lower the performance of CPU threads if they are performing work in * parallel with the CUDA thread. * * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for * results from the GPU. This can increase latency when waiting for the GPU, * but can increase the performance of CPU threads performing work in parallel * with the GPU. 
* * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work. * * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work.
* Deprecated: This flag was deprecated as of CUDA 4.0 and was * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, * uses a heuristic based on the number of active CUDA contexts in the * process \e C and the number of logical processors in the system \e P. If * \e C > \e P, then CUDA will yield to other OS threads when waiting for * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC * for low-powered devices. * * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local * memory usage at the cost of potentially increased memory usage.
* Deprecated: This flag is deprecated and the behavior enabled * by this flag is now the default and cannot be disabled. * * \param dev - Device for which the primary context flags are set * \param flags - New flags for the device * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuDevicePrimaryCtxRetain, * ::cuDevicePrimaryCtxGetState, * ::cuCtxCreate, * ::cuCtxGetFlags, * ::cudaSetDeviceFlags */ CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); /** * \brief Get the state of the primary context * * Returns in \p *flags the flags for the primary context of \p dev, and in * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag * values. * * \param dev - Device to get primary context flags for * \param flags - Pointer to store flags * \param active - Pointer to store context state; 0 = inactive, 1 = active * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa * ::cuDevicePrimaryCtxSetFlags, * ::cuCtxGetFlags, * ::cudaGetDeviceFlags */ CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); /** * \brief Destroy all allocations and reset all state on the primary context * * Explicitly destroys and cleans up all resources associated with the current * device in the current process. * * Note that it is the responsibility of the calling function to ensure that no * other module in the process is using the device any more. For that reason * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() * even after resetting the device. * Resetting the primary context does not release it; an application that has * retained the primary context should explicitly release its usage. * * \param dev - Device for which primary context is destroyed * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE * \notefnerr * * \sa ::cuDevicePrimaryCtxRetain, * ::cuDevicePrimaryCtxRelease, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cudaDeviceReset */ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); /** @} */ /* END CUDA_PRIMARY_CTX */ /** * \brief Returns information about the execution affinity support of the device. * * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev.
* The supported types are: * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, * or 0 if not; * * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not * \param type - Execution affinity type to query * \param dev - Device handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem */ CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); /** * \defgroup CUDA_CTX Context Management * * ___MANBRIEF___ context management functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the context management functions of the low-level * CUDA driver application programming interface. * * Please note that some functions are described in * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. * * @{ */ /** * \brief Create a CUDA context * * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. * * Creates a new CUDA context and associates it with the calling thread. The * \p flags parameter is described below. The context is created with a usage * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or * when done using the context. If a context is already current to the thread, * it is supplanted by the newly created context and may be restored by a subsequent * call to ::cuCtxPopCurrent(). * * The three LSBs of the \p flags parameter can be used to control how the OS * thread, which owns the CUDA context at the time of an API call, interacts * with the OS scheduler when waiting for results from the GPU. Only one of * the scheduling flags can be set when creating a context. * * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for * results from the GPU. This can decrease latency when waiting for the GPU, * but may lower the performance of CPU threads if they are performing work in * parallel with the CUDA thread. * * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for * results from the GPU. This can increase latency when waiting for the GPU, * but can increase the performance of CPU threads performing work in parallel * with the GPU. * * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work. * * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work.
* Deprecated: This flag was deprecated as of CUDA 4.0 and was * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, * uses a heuristic based on the number of active CUDA contexts in the * process \e C and the number of logical processors in the system \e P. If * \e C > \e P, then CUDA will yield to other OS threads when waiting for * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC * for low-powered devices. * * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. * This flag must be set in order to allocate pinned host memory that is * accessible to the GPU. * * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local * memory usage at the cost of potentially increased memory usage.
* Deprecated: This flag is deprecated and the behavior enabled * by this flag is now the default and cannot be disabled. * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). * * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the * compute mode of the device. The nvidia-smi tool can be used to set * the compute mode for * devices. * Documentation for nvidia-smi can be obtained by passing a * -h option to it. * * \param pctx - Returned context handle of the new context * \param flags - Context creation flags * \param dev - Device to create context on * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); /** * \brief Create a CUDA context with execution affinity * * Creates a new CUDA context with execution affinity and associates it with * the calling thread. The \p paramsArray and \p flags parameter are described below. * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must * call ::cuCtxDestroy() or when done using the context. If a context is already * current to the thread, it is supplanted by the newly created context and may * be restored by a subsequent call to ::cuCtxPopCurrent(). * * The type and the amount of execution resource the context can use is limited by \p paramsArray * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, * the latter execution affinity parameter overrides the former execution affinity parameter. * The supported execution affinity types are: * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute * is only supported under Volta+ MPS. * * The three LSBs of the \p flags parameter can be used to control how the OS * thread, which owns the CUDA context at the time of an API call, interacts * with the OS scheduler when waiting for results from the GPU. Only one of * the scheduling flags can be set when creating a context. * * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for * results from the GPU. This can decrease latency when waiting for the GPU, * but may lower the performance of CPU threads if they are performing work in * parallel with the CUDA thread. * * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for * results from the GPU. This can increase latency when waiting for the GPU, * but can increase the performance of CPU threads performing work in parallel * with the GPU. 
* * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work. * * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work.
* Deprecated: This flag was deprecated as of CUDA 4.0 and was * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, * uses a heuristic based on the number of active CUDA contexts in the * process \e C and the number of logical processors in the system \e P. If * \e C > \e P, then CUDA will yield to other OS threads when waiting for * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC * for low-powered devices. * * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. * This flag must be set in order to allocate pinned host memory that is * accessible to the GPU. * * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local * memory usage at the cost of potentially increased memory usage.
* Deprecated: This flag is deprecated and the behavior enabled * by this flag is now the default and cannot be disabled. * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). * * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the * compute mode of the device. The nvidia-smi tool can be used to set * the compute mode for * devices. * Documentation for nvidia-smi can be obtained by passing a * -h option to it. * * \param pctx - Returned context handle of the new context * \param paramsArray - Execution affinity parameters * \param numParams - Number of execution affinity parameters * \param flags - Context creation flags * \param dev - Device to create context on * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::CUexecAffinityParam */ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); /** * \brief Destroy a CUDA context * * Destroys the CUDA context specified by \p ctx. The context \p ctx will be * destroyed regardless of how many threads it is current to. * It is the responsibility of the calling function to ensure that no API * call issues using \p ctx while ::cuCtxDestroy() is executing. * * Destroys and cleans up all resources associated with the context. * It is the caller's responsibility to ensure that the context or its resources * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. * * If \p ctx is current to the calling thread then \p ctx will also be * popped from the current thread's context stack (as though ::cuCtxPopCurrent() * were called). If \p ctx is current to other threads, then \p ctx will * remain current to those threads, and attempting to access \p ctx from * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. * * \param ctx - Context to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); /** * \brief Pushes a context on the current CPU thread * * Pushes the given context \p ctx onto the CPU thread's stack of current * contexts. The specified context becomes the CPU thread's current context, so * all CUDA functions that operate on the current context are affected. 
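 *
 * A small illustrative sketch of the push/pop pattern (not part of the upstream header;
 * \c ctx is assumed to be a valid ::CUcontext and error checking is omitted):
 * \code
    cuCtxPushCurrent(ctx);      // 'ctx' becomes current on this CPU thread
    // ... work that must run in 'ctx' ...
    CUcontext popped = NULL;
    cuCtxPopCurrent(&popped);   // restores the previously current context; popped == ctx
 * \endcode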
* * The previous current context may be made current again by calling * ::cuCtxDestroy() or ::cuCtxPopCurrent(). * * \param ctx - Context to push * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); /** * \brief Pops the current CUDA context from the current CPU thread. * * Pops the current CUDA context from the CPU thread and passes back the * old context handle in \p *pctx. That context may then be made current * to a different CPU thread by calling ::cuCtxPushCurrent(). * * If a context was current to the CPU thread before ::cuCtxCreate() or * ::cuCtxPushCurrent() was called, this function makes that context current to * the CPU thread again. * * \param pctx - Returned popped context handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); /** * \brief Binds the specified CUDA context to the calling CPU thread * * Binds the specified CUDA context to the calling CPU thread. * If \p ctx is NULL then the CUDA context previously bound to the * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. * * If there exists a CUDA context stack on the calling CPU thread, this * will replace the top of that stack with \p ctx. * If \p ctx is NULL then this will be equivalent to popping the top * of the calling CPU thread's CUDA context stack (or a no-op if the * calling CPU thread's CUDA context stack is empty). * * \param ctx - Context to bind to the calling CPU thread * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa * ::cuCtxGetCurrent, * ::cuCtxCreate, * ::cuCtxDestroy, * ::cudaSetDevice */ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); /** * \brief Returns the CUDA context bound to the calling CPU thread. * * Returns in \p *pctx the CUDA context bound to the calling CPU thread. * If no context is bound to the calling CPU thread then \p *pctx is * set to NULL and ::CUDA_SUCCESS is returned. * * \param pctx - Returned context handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * \notefnerr * * \sa * ::cuCtxSetCurrent, * ::cuCtxCreate, * ::cuCtxDestroy, * ::cudaGetDevice */ CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); /** * \brief Returns the device ID for the current context * * Returns in \p *device the ordinal of the current context's device. 
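 *
 * For illustration (not part of the upstream header), querying the calling thread's current
 * context and the ordinal of its device might look as follows; error checking is omitted:
 * \code
    CUcontext current = NULL;
    CUdevice dev;
    cuCtxGetCurrent(&current);  // NULL if no context is bound to this thread
    cuCtxGetDevice(&dev);       // device of the current context
 * \endcode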
* * \param device - Returned device ID for the current context * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cudaGetDevice */ CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); /** * \brief Returns the flags for the current context * * Returns in \p *flags the flags of the current context. See ::cuCtxCreate * for flag values. * * \param flags - Pointer to store flags of current context * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetCurrent, * ::cuCtxGetDevice, * ::cuCtxGetLimit, * ::cuCtxGetSharedMemConfig, * ::cuCtxGetStreamPriorityRange, * ::cudaGetDeviceFlags */ CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); /** * \brief Block for a context's tasks to complete * * Blocks until the device has completed all preceding requested tasks. * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the * CPU thread will block until the GPU context has finished its work. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cudaDeviceSynchronize */ CUresult CUDAAPI cuCtxSynchronize(void); /** * \brief Set resource limits * * Setting \p limit to \p value is a request by the application to update * the current limit maintained by the context. The driver is free to * modify the requested value to meet h/w requirements (this could be * clamping to minimum or maximum values, rounding up to nearest element * size, etc). The application can use ::cuCtxGetLimit() to find out exactly * what the limit has been set to. * * Setting each ::CUlimit has its own specific restrictions, so each is * discussed here. * * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. * The driver automatically increases the per-thread stack size * for each kernel launch as needed. This size isn't reset back to the * original value after each launch. Setting this value will take effect * immediately, and if necessary, the device will block until all preceding * requested tasks are complete. * * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE * must be performed before launching any kernel that uses the ::printf() * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. * * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used * by the ::malloc() and ::free() device system calls. Setting * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel * that uses the ::malloc() or ::free() device system calls, otherwise * ::CUDA_ERROR_INVALID_VALUE will be returned. 
* * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting * this limit must be performed before any launch of a kernel that uses the * device runtime and calls ::cudaDeviceSynchronize() above the default sync * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail * with error code ::cudaErrorSyncDepthExceeded if the limitation is * violated. This limit can be set smaller than the default or up the maximum * launch depth of 24. When setting this limit, keep in mind that additional * levels of sync depth require the driver to reserve large amounts of device * memory which can no longer be used for user allocations. If these * reservations of device memory fail, ::cuCtxSetLimit() will return * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being * returned. * * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of * outstanding device runtime launches that can be made from the current * context. A grid is outstanding from the point of launch up until the grid * is known to have been completed. Device runtime launches which violate * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when * ::cudaGetLastError() is called after launch. If more pending launches than * the default (2048 launches) are needed for a module using the device * runtime, this limit can be increased. Keep in mind that being able to * sustain additional pending launches will require the driver to reserve * larger amounts of device memory upfront which can no longer be used for * allocations. If these reservations fail, ::cuCtxSetLimit() will return * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being * returned. * * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. * Values can range from 0B to 128B. This is purely a performence hint and * it can be ignored or clamped depending on the platform. * * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes availabe for * persisting L2 cache. This is purely a performance hint and it can be * ignored or clamped depending on the platform. * * \param limit - Limit to set * \param value - Size of limit * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNSUPPORTED_LIMIT, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSynchronize, * ::cudaDeviceSetLimit */ CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); /** * \brief Returns resource limits * * Returns in \p *pvalue the current size of \p limit. The supported * ::CUlimit values are: * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the * ::printf() device system call. 
* - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the * ::malloc() and ::free() device system calls. * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread * can issue the device runtime call ::cudaDeviceSynchronize() to wait on * child grid launches to complete. * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding * device runtime launches that can be made from this context. * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes * * \param limit - Limit to query * \param pvalue - Returned size of limit * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNSUPPORTED_LIMIT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cudaDeviceGetLimit */ CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); /** * \brief Returns the preferred cache configuration for the current context. * * On devices where the L1 cache and shared memory use the same hardware * resources, this function returns through \p pconfig the preferred cache configuration * for the current context. This is only a preference. The driver will use * the requested configuration if possible, but it is free to choose a different * configuration if required to execute functions. * * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices * where the size of the L1 cache and shared memory are fixed. * * The supported cache configurations are: * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory * * \param pconfig - Returned cache configuration * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuFuncSetCacheConfig, * ::cudaDeviceGetCacheConfig */ CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); /** * \brief Sets the preferred cache configuration for the current context. * * On devices where the L1 cache and shared memory use the same hardware * resources, this sets through \p config the preferred cache configuration for * the current context. This is only a preference. The driver will use * the requested configuration if possible, but it is free to choose a different * configuration if required to execute the function. Any function preference * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide * setting. Setting the context-wide cache configuration to * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer * to not change the cache configuration unless required to launch the kernel. * * This setting does nothing on devices where the size of the L1 cache and * shared memory are fixed. 
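 *
 * As a brief sketch (illustrative only; error checking is omitted), a context that mostly runs
 * shared-memory-heavy kernels might request the shared-memory-preferring configuration and
 * read the setting back with ::cuCtxGetCacheConfig:
 * \code
    CUfunc_cache cfg;
    cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED);
    cuCtxGetCacheConfig(&cfg);  // cfg reports the preference currently in effect
 * \endcode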
* * Launching a kernel with a different preference than the most recent * preference setting may insert a device-side synchronization point. * * The supported cache configurations are: * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory * * \param config - Requested cache configuration * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuFuncSetCacheConfig, * ::cudaDeviceSetCacheConfig */ CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); /** * \brief Returns the current shared memory configuration for the current context. * * This function will return in \p pConfig the current size of shared memory banks * in the current context. On devices with configurable shared memory banks, * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all * subsequent kernel launches will by default use the new bank size. When * ::cuCtxGetSharedMemConfig is called on devices without configurable shared * memory, it will return the fixed bank size of the hardware. * * The returned bank configurations can be either: * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is * four bytes. * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will * eight bytes. * * \param pConfig - returned shared memory configuration * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuCtxGetSharedMemConfig, * ::cuFuncSetCacheConfig, * ::cudaDeviceGetSharedMemConfig */ CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); /** * \brief Sets the shared memory configuration for the current context. * * On devices with configurable shared memory banks, this function will set * the context's shared memory bank size which is used for subsequent kernel * launches. * * Changed the shared memory configuration between launches may insert a device * side synchronization point between those launches. * * Changing the shared memory bank size will not increase shared memory usage * or affect occupancy of kernels, but may have major effects on performance. * Larger bank sizes will allow for greater potential bandwidth to shared memory, * but will change what kinds of accesses to shared memory will result in bank * conflicts. * * This function will do nothing on devices with fixed shared memory bank size. * * The supported bank configurations are: * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial * setting (currently, four bytes). * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to * be natively four bytes. 
* - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to * be natively eight bytes. * * \param config - requested shared memory configuration * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuCtxGetSharedMemConfig, * ::cuFuncSetCacheConfig, * ::cudaDeviceSetSharedMemConfig */ CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); /** * \brief Gets the context's API version. * * Returns a version number in \p version corresponding to the capabilities of * the context (e.g. 3010 or 3020), which library developers can use to direct * callers to a specific API version. If \p ctx is NULL, returns the API version * used to create the currently bound context. * * Note that new API versions are only introduced when context capabilities are * changed that break binary compatibility, so the API version and driver version * may be different. For example, it is valid for the API version to be 3020 while * the driver version is 4020. * * \param ctx - Context to check * \param version - Pointer to version * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); /** * \brief Returns numerical values that correspond to the least and * greatest stream priorities. * * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond * to the least and greatest stream priorities respectively. Stream priorities * follow a convention where lower numbers imply greater priorities. The range of * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. * If the user attempts to create a stream with a priority value that is * outside the meaningful range as specified by this API, the priority is * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority * respectively. See ::cuStreamCreateWithPriority for details on creating a * priority stream. * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value * is not desired. * * This function will return '0' in both \p *leastPriority and \p *greatestPriority if * the current context's device does not support stream priorities * (see ::cuDeviceGetAttribute). 
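 *
 * A minimal sketch of the intended use (added for illustration, not upstream text; error
 * checking is omitted): query the meaningful range, then create a stream with the greatest
 * priority via ::cuStreamCreateWithPriority:
 * \code
    int least = 0, greatest = 0;
    CUstream stream = NULL;
    cuCtxGetStreamPriorityRange(&least, &greatest);
    cuStreamCreateWithPriority(&stream, CU_STREAM_NON_BLOCKING, greatest);
 * \endcode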
* * \param leastPriority - Pointer to an int in which the numerical value for least * stream priority is returned * \param greatestPriority - Pointer to an int in which the numerical value for greatest * stream priority is returned * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuStreamCreateWithPriority, * ::cuStreamGetPriority, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cudaDeviceGetStreamPriorityRange */ CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); /** * \brief Resets all persisting lines in cache to normal status. * * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal * status. Takes effect on function return. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa * ::CUaccessPolicyWindow */ CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); /** * \brief Returns the execution affinity setting for the current context. * * Returns in \p *pExecAffinity the current value of \p type. The supported * ::CUexecAffinityType values are: * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. * * \param type - Execution affinity type to query * \param pExecAffinity - Returned execution affinity * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY * \notefnerr * * \sa * ::CUexecAffinityParam */ CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); /** @} */ /* END CUDA_CTX */ /** * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] * * ___MANBRIEF___ deprecated context management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the deprecated context management functions of the low-level * CUDA driver application programming interface. * * @{ */ /** * \brief Increment a context's usage-count * * \deprecated * * Note that this function is deprecated and should not be used. * * Increments the usage count of the context and passes back a context handle * in \p *pctx that must be passed to ::cuCtxDetach() when the application is * done with the context. ::cuCtxAttach() fails if there is no context current * to the thread. * * Currently, the \p flags parameter must be 0. * * \param pctx - Returned context handle of the current context * \param flags - Context attach flags (must be 0) * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxDetach, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ __CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); /** * \brief Decrement a context's usage-count * * \deprecated * * Note that this function is deprecated and should not be used. * * Decrements the usage count of the context \p ctx, and destroys the context * if the usage count goes to 0. The context must be a handle that was passed * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the * calling thread. 
* * \param ctx - Context to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ __CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); /** @} */ /* END CUDA_CTX_DEPRECATED */ /** * \defgroup CUDA_MODULE Module Management * * ___MANBRIEF___ module management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the module management functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Loads a compute module * * Takes a filename \p fname and loads the corresponding module \p module into * the current context. The CUDA driver API does not attempt to lazily * allocate the resources needed by a module; if the memory for functions and * data (constant and global) needed by the module cannot be allocated, * ::cuModuleLoad() fails. The file should be a \e cubin file as output by * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. * * \param module - Returned module * \param fname - Filename of module to load * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_NOT_FOUND, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_FILE_NOT_FOUND, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); /** * \brief Load a module's data * * Takes a pointer \p image and loads the corresponding module \p module into * the current context. The pointer may be obtained by mapping a \e cubin or * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin * object into the executable resources and using operating system calls such * as Windows \c FindResource() to obtain the pointer. * * \param module - Returned module * \param image - Module data to load * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); /** * \brief Load a module's data with options * * Takes a pointer \p image and loads the corresponding module \p module into * the current context. 
The pointer may be obtained by mapping a \e cubin or * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin * object into the executable resources and using operating system calls such * as Windows \c FindResource() to obtain the pointer. Options are passed as * an array via \p options and any corresponding parameters are passed in * \p optionValues. The number of total options is supplied via \p numOptions. * Any outputs will be returned via \p optionValues. * * \param module - Returned module * \param image - Module data to load * \param numOptions - Number of options * \param options - Options for JIT * \param optionValues - Option values for JIT * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); /** * \brief Load a module's data * * Takes a pointer \p fatCubin and loads the corresponding module \p module * into the current context. The pointer represents a fat binary object, * which is a collection of different \e cubin and/or \e PTX files, all * representing the same device code, but compiled and optimized for different * architectures. * * Prior to CUDA 4.0, there was no documented API for constructing and using * fat binary objects by programmers. Starting with CUDA 4.0, fat binary * objects can be constructed by providing the -fatbin option to \b nvcc. * More information can be found in the \b nvcc document. * * \param module - Returned module * \param fatCubin - Fat binary to load * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_NOT_FOUND, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleUnload */ CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); /** * \brief Unloads a module * * Unloads a module \p hmod from the current context. * * \param hmod - Module to unload * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_destroy_ub * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary */ CUresult CUDAAPI cuModuleUnload(CUmodule hmod); /** * \brief Returns a function handle * * Returns in \p *hfunc the handle of the function of name \p name located in * module \p hmod. 
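 *
 * As an illustrative sketch only (not part of the original documentation; the
 * file name and kernel name below are hypothetical, and error checking is
 * omitted), a function handle is typically obtained after loading a module:
 *
 * \code
   CUmodule hmod;
   CUfunction hfunc;
   cuModuleLoad(&hmod, "kernels.cubin");            // hypothetical cubin file
   cuModuleGetFunction(&hfunc, hmod, "my_kernel");  // hypothetical kernel name
 * \endcode
 *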
If no function of that name exists, ::cuModuleGetFunction() * returns ::CUDA_ERROR_NOT_FOUND. * * \param hfunc - Returned function handle * \param hmod - Module to retrieve function from * \param name - Name of function to retrieve * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); /** * \brief Returns a global pointer from a module * * Returns in \p *dptr and \p *bytes the base pointer and size of the * global of name \p name located in module \p hmod. If no variable of that name * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both * parameters \p dptr and \p bytes are optional. If one of them is * NULL, it is ignored. * * \param dptr - Returned global device pointer * \param bytes - Returned global size in bytes * \param hmod - Module to retrieve global from * \param name - Name of global to retrieve * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload, * ::cudaGetSymbolAddress, * ::cudaGetSymbolSize */ CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); /** * \brief Returns a handle to a texture reference * * Returns in \p *pTexRef the handle of the texture reference of name \p name * in the module \p hmod. If no texture reference of that name exists, * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference * handle should not be destroyed, since it will be destroyed when the module * is unloaded. * * \param pTexRef - Returned texture reference * \param hmod - Module to retrieve texture reference from * \param name - Name of texture reference to retrieve * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetSurfRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload, * ::cudaGetTextureReference */ CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); /** * \brief Returns a handle to a surface reference * * Returns in \p *pSurfRef the handle of the surface reference of name \p name * in the module \p hmod. If no surface reference of that name exists, * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. 
* * \param pSurfRef - Returned surface reference * \param hmod - Module to retrieve surface reference from * \param name - Name of surface reference to retrieve * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload, * ::cudaGetSurfaceReference */ CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); /** * \brief Creates a pending JIT linker invocation. * * If the call is successful, the caller owns the returned CUlinkState, which * should eventually be destroyed with ::cuLinkDestroy. The * device code machine size (32 or 64 bit) will match the calling application. * * Both linker and compiler options may be specified. Compiler options will * be applied to inputs to this linker action which must be compiled from PTX. * The options ::CU_JIT_WALL_TIME, * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES * will accumulate data until the CUlinkState is destroyed. * * \p optionValues must remain valid for the life of the CUlinkState if output * options are used. No other references to inputs are maintained after this * call returns. * * \param numOptions Size of options arrays * \param options Array of linker and compiler options * \param optionValues Array of option values, each cast to void * * \param stateOut On success, this will contain a CUlinkState to specify * and complete this action * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND * \notefnerr * * \sa ::cuLinkAddData, * ::cuLinkAddFile, * ::cuLinkComplete, * ::cuLinkDestroy */ CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); /** * \brief Add an input to a pending linker invocation * * Ownership of \p data is retained by the caller. No reference is retained to any * inputs after this call returns. * * This method accepts only compiler options, which are used if the data must * be compiled from PTX, and does not accept any of * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. * * \param state A pending linker action. * \param type The type of the input data. * \param data The input data. PTX must be NULL-terminated. * \param size The length of the input data. * \param name An optional name for this input in log messages. * \param numOptions Size of options. * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). * \param optionValues Array of option values, each cast to void *. 
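 *
 * For illustration only (not part of the original documentation; the PTX
 * string, input name and option counts below are hypothetical, and error
 * checking is omitted), a typical sequence combining ::cuLinkCreate,
 * ::cuLinkAddData, ::cuLinkComplete and ::cuModuleLoadData might look like:
 *
 * \code
   CUlinkState state;
   void *cubin; size_t cubinSize;
   CUmodule mod;
   static const char ptx[] = "...";               // hypothetical NULL-terminated PTX
   cuLinkCreate(0, NULL, NULL, &state);
   cuLinkAddData(state, CU_JIT_INPUT_PTX, (void *)ptx, sizeof(ptx),
                 "my_ptx", 0, NULL, NULL);        // hypothetical input name
   cuLinkComplete(state, &cubin, &cubinSize);
   cuModuleLoadData(&mod, cubin);                 // load before destroying the state
   cuLinkDestroy(state);
 * \endcode
 *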
* * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU * * \sa ::cuLinkCreate, * ::cuLinkAddFile, * ::cuLinkComplete, * ::cuLinkDestroy */ CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues); /** * \brief Add a file input to a pending linker invocation * * No reference is retained to any inputs after this call returns. * * This method accepts only compiler options, which are used if the input * must be compiled from PTX, and does not accept any of * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. * * This method is equivalent to invoking ::cuLinkAddData on the contents * of the file. * * \param state A pending linker action * \param type The type of the input data * \param path Path to the input file * \param numOptions Size of options * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) * \param optionValues Array of option values, each cast to void * * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_FILE_NOT_FOUND * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU * * \sa ::cuLinkCreate, * ::cuLinkAddData, * ::cuLinkComplete, * ::cuLinkDestroy */ CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues); /** * \brief Complete a pending linker invocation * * Completes the pending linker action and returns the cubin image for the linked * device code, which can be used with ::cuModuleLoadData. The cubin is owned by * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. * This call does not destroy \p state. * * \param state A pending linker invocation * \param cubinOut On success, this will point to the output image * \param sizeOut Optional parameter to receive the size of the generated image * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuLinkCreate, * ::cuLinkAddData, * ::cuLinkAddFile, * ::cuLinkDestroy, * ::cuModuleLoadData */ CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); /** * \brief Destroys state for a JIT linker invocation. * * \param state State object for the linker invocation * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE * * \sa ::cuLinkCreate */ CUresult CUDAAPI cuLinkDestroy(CUlinkState state); /** @} */ /* END CUDA_MODULE */ /** * \defgroup CUDA_MEM Memory Management * * ___MANBRIEF___ memory management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the memory management functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Gets free and total memory * * Returns in \p *total the total amount of memory available to the the current context. * Returns in \p *free the amount of memory on the device that is free according to the OS. * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free. 
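 *
 * As an illustrative sketch only (not part of the original documentation;
 * error checking is omitted):
 *
 * \code
   size_t freeMem, totalMem;
   cuMemGetInfo(&freeMem, &totalMem);
   // freeMem and totalMem are reported in bytes for the current context's device
 * \endcode
 *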
 * In a multi-tenant situation, the free value returned is only an estimate and is
 * prone to a race condition: an allocation or free performed by a different process,
 * or by a different thread in the same process, between the time free memory is
 * estimated and the time it is reported will cause the reported free value to
 * deviate from the amount of memory that is actually free.
 *
 * The integrated GPU on Tegra shares memory with the CPU and other components
 * of the SoC. The free and total values returned by the API exclude
 * the SWAP memory space maintained by the OS on some platforms.
 * The OS may move some of the memory pages into swap area as the GPU or
 * CPU allocates or accesses memory. See the Tegra app note on how to calculate
 * total and free memory on Tegra.
 *
 * \param free - Returned free memory in bytes
 * \param total - Returned total memory in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
 * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
 * ::cudaMemGetInfo
 */
CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);

/**
 * \brief Allocates device memory
 *
 * Allocates \p bytesize bytes of linear memory on the device and returns in
 * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
 * aligned for any kind of variable. The memory is not cleared. If \p bytesize
 * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
 *
 * \param dptr - Returned device pointer
 * \param bytesize - Requested allocation size in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_OUT_OF_MEMORY
 * \notefnerr
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
 * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
 * ::cudaMalloc
 */
CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);

/**
 * \brief Allocates pitched device memory
 *
 * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
 * the device and returns in \p *dptr a pointer to the allocated memory. The
 * function may pad the allocation to ensure that corresponding pointers in
 * any given row will continue to meet the alignment requirements for
 * coalescing as the address is updated from row to row.
\p ElementSizeBytes * specifies the size of the largest reads and writes that will be performed * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced * memory transactions are not possible on other data sizes). If * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, * the kernel will run correctly, but possibly at reduced speed. The pitch * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the * allocation. The intended usage of pitch is as a separate parameter of the * allocation, used to compute addresses within the 2D array. Given the row * and column of an array element of type \b T, the address is computed as: * \code T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; * \endcode * * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is * recommended that programmers consider performing pitch allocations using * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is * especially true if the application will be performing 2D memory copies * between different regions of device memory (whether linear memory or CUDA * arrays). * * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed * to match or exceed the alignment requirement for texture binding with * ::cuTexRefSetAddress2D(). * * \param dptr - Returned device pointer * \param pPitch - Returned pitch of allocation in bytes * \param WidthInBytes - Requested allocation width in bytes * \param Height - Requested allocation height in rows * \param ElementSizeBytes - Size of largest reads/writes for range * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMallocPitch */ CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); /** * \brief Frees device memory * * Frees the memory space pointed to by \p dptr, which must have been returned * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). 
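 *
 * For illustration only (not part of the original documentation; the size is
 * arbitrary and error checking is omitted), an allocation is typically paired
 * with a matching free:
 *
 * \code
   CUdeviceptr dptr;
   cuMemAlloc(&dptr, 1024 * 1024);   // hypothetical 1 MiB allocation
   // ... use the allocation ...
   cuMemFree(dptr);
 * \endcode
 *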
* * \param dptr - Pointer to memory to free * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaFree */ CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); /** * \brief Get information on memory allocations * * Returns the base address in \p *pbase and size in \p *psize of the * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one * of them is NULL, it is ignored. * * \param pbase - Returned base address * \param psize - Returned size of device memory allocation * \param dptr - Device pointer to query * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_NOT_FOUND, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); /** * \brief Allocates page-locked host memory * * Allocates \p bytesize bytes of host memory that is page-locked and * accessible to the device. The driver tracks the virtual memory ranges * allocated with this function and automatically accelerates calls to * functions such as ::cuMemcpy(). Since the memory can be accessed directly by * the device, it can be read or written with much higher bandwidth than * pageable memory obtained with functions such as ::malloc(). Allocating * excessive amounts of memory with ::cuMemAllocHost() may degrade system * performance, since it reduces the amount of memory available to the system * for paging. As a result, this function is best used sparingly to allocate * staging areas for data exchange between host and device. * * Note all host memory allocated using ::cuMemHostAlloc() will automatically * be immediately accessible to all contexts on all devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). * The device pointer that may be used to access this host memory from those * contexts is always equal to the returned host pointer \p *pp. 
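 *
 * As an illustrative sketch only (not part of the original documentation;
 * sizes are arbitrary and error checking is omitted), page-locked memory is
 * commonly used as a staging buffer for host-to-device copies:
 *
 * \code
   void *hostBuf;
   CUdeviceptr devBuf;
   cuMemAllocHost(&hostBuf, 4096);
   cuMemAlloc(&devBuf, 4096);
   // ... fill hostBuf ...
   cuMemcpyHtoD(devBuf, hostBuf, 4096);
   cuMemFree(devBuf);
   cuMemFreeHost(hostBuf);
 * \endcode
 *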
* See \ref CUDA_UNIFIED for additional details. * * \param pp - Returned host pointer to page-locked memory * \param bytesize - Requested allocation size in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMallocHost */ CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); /** * \brief Frees page-locked host memory * * Frees the memory space pointed to by \p p, which must have been returned by * a previous call to ::cuMemAllocHost(). * * \param p - Pointer to memory to free * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaFreeHost */ CUresult CUDAAPI cuMemFreeHost(void *p); /** * \brief Allocates page-locked host memory * * Allocates \p bytesize bytes of host memory that is page-locked and accessible * to the device. The driver tracks the virtual memory ranges allocated with * this function and automatically accelerates calls to functions such as * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, * it can be read or written with much higher bandwidth than pageable memory * obtained with functions such as ::malloc(). Allocating excessive amounts of * pinned memory may degrade system performance, since it reduces the amount * of memory available to the system for paging. As a result, this function is * best used sparingly to allocate staging areas for data exchange between * host and device. * * The \p Flags parameter enables different options to be specified that * affect the allocation, as follows. * * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be * considered as pinned memory by all CUDA contexts, not just the one that * performed the allocation. * * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address * space. The device pointer to the memory may be obtained by calling * ::cuMemHostGetDevicePointer(). 
* * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined * (WC). WC memory can be transferred across the PCI Express bus more * quickly on some system configurations, but cannot be read efficiently by * most CPUs. WC memory is a good option for buffers that will be written by * the CPU and read by the GPU via mapped pinned memory or host->device * transfers. * * All of these flags are orthogonal to one another: a developer may allocate * memory that is portable, mapped and/or write-combined with no restrictions. * * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for * devices that do not support mapped pinned memory. The failure is deferred * to ::cuMemHostGetDevicePointer() because the memory may be mapped into * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. * * The memory allocated by this function must be freed with ::cuMemFreeHost(). * * Note all host memory allocated using ::cuMemHostAlloc() will automatically * be immediately accessible to all contexts on all devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer * that may be used to access this host memory from those contexts is always equal * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED * is specified, then the function ::cuMemHostGetDevicePointer() must be used * to query the device pointer, even if the context supports unified addressing. * See \ref CUDA_UNIFIED for additional details. * * \param pp - Returned host pointer to page-locked memory * \param bytesize - Requested allocation size in bytes * \param Flags - Flags for allocation request * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaHostAlloc */ CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); /** * \brief Passes back device pointer of mapped pinned memory * * Passes back the device pointer \p pdptr corresponding to the mapped, pinned * host buffer \p p allocated by ::cuMemHostAlloc. * * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP * flag was not specified at the time the memory was allocated, or if the * function is called on a GPU that does not support mapped pinned memory. * * For devices that have a non-zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory * can also be accessed from the device using the host pointer \p p. 
* The device pointer returned by ::cuMemHostGetDevicePointer() may or may not * match the original host pointer \p p and depends on the devices visible to the * application. If all devices visible to the application have a non-zero value for the * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() * will match the original pointer \p p. If any device visible to the application * has a zero value for the device attribute, the device pointer returned by * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, * but it will be suitable for use on all devices provided Unified Virtual Addressing * is enabled. In such systems, it is valid to access the memory using either pointer * on devices that have a non-zero value for the device attribute. Note however that * such devices should access the memory using only one of the two pointers and not both. * * \p Flags provides for future releases. For now, it must be set to 0. * * \param pdptr - Returned device pointer * \param p - Host pointer * \param Flags - Options (must be 0) * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaHostGetDevicePointer */ CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); /** * \brief Passes back flags that were used for a pinned allocation * * Passes back the flags \p pFlags that were specified when allocating * the pinned host buffer \p p allocated by ::cuMemHostAlloc. * * ::cuMemHostGetFlags() will fail if the pointer does not reside in * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). * * \param pFlags - Returned flags word * \param p - Host pointer * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::cuMemAllocHost, * ::cuMemHostAlloc, * ::cudaHostGetFlags */ CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); /** * \brief Allocates memory that will be automatically managed by the Unified Memory system * * Allocates \p bytesize bytes of managed memory on the device and returns in * \p *dptr a pointer to the allocated memory. If the device doesn't support * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support * for managed memory can be queried using the device attribute * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably * aligned for any kind of variable. The memory is not cleared. If \p bytesize * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer * is valid on the CPU and on all GPUs in the system that support managed memory. * All accesses to this pointer must obey the Unified Memory programming model. 
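 *
 * For illustration only (not part of the original documentation; the size is
 * arbitrary and error checking is omitted), a managed allocation can be
 * created and released as follows:
 *
 * \code
   CUdeviceptr managed;
   cuMemAllocManaged(&managed, 1 << 20, CU_MEM_ATTACH_GLOBAL);
   // the same pointer value is usable from the CPU and from supporting GPUs
   cuMemFree(managed);
 * \endcode
 *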
* * \p flags specifies the default stream association for this allocation. * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the * allocation should not be accessed from devices that have a zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to * ::cuStreamAttachMemAsync will be required to enable access on such devices. * * If the association is later changed via ::cuStreamAttachMemAsync to * a single stream, the default association as specifed during ::cuMemAllocManaged * is restored when that stream is destroyed. For __managed__ variables, the * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a * stream is an asynchronous operation, and as a result, the change to default * association won't happen until all work in the stream has completed. * * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. * * Device memory oversubscription is possible for GPUs that have a non-zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on * such GPUs may be evicted from device memory to host memory at any time by the Unified * Memory driver in order to make room for other allocations. * * In a multi-GPU system where all GPUs have a non-zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this * API returns and instead may be populated on access. In such systems, managed memory can * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to * maintain data locality and prevent excessive page faults to the extent possible. The application * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application * can also explicitly migrate memory to a desired processor's memory via * ::cuMemPrefetchAsync. * * In a multi-GPU system where all of the GPUs have a zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support * with each other, the physical storage for managed memory is created on the GPU which is active * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate * memory among such GPUs. * * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS * is zero for at least one of those GPUs, the location chosen for physical storage of managed * memory is system-dependent. * - On Linux, the location chosen will be device memory as long as the current set of active * contexts are on devices that either have peer-to-peer support with each other or have a * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. * If there is an active context on a GPU that does not have a non-zero value for that device * attribute and it does not have peer-to-peer support with the other devices that have active * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. 
* Note that this means that managed memory that is located in device memory is migrated to * host memory if a new context is created on a GPU that doesn't have a non-zero value for * the device attribute and does not support peer-to-peer with at least one of the other devices * that has an active context. This in turn implies that context creation may fail if there is * insufficient host memory to migrate all managed allocations. * - On Windows, the physical storage is always created in 'zero-copy' or host memory. * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to * restrict CUDA to only use those GPUs that have peer-to-peer support. * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a * non-zero value to force the driver to always use device memory for physical storage. * When this environment variable is set to a non-zero value, all contexts created in * that process on devices that support managed memory have to be peer-to-peer compatible * with each other. Context creation will fail if a context is created on a device that * supports managed memory and is not peer-to-peer compatible with any of the other * managed memory supporting devices on which contexts were previously created, even if * those contexts have been destroyed. These environment variables are described * in the CUDA programming guide under the "CUDA environment variables" section. * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. * * \param dptr - Returned device pointer * \param bytesize - Requested allocation size in bytes * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, * ::cudaMallocManaged */ CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); /** * \brief Returns a handle to a compute device * * Returns in \p *device a device handle given a PCI bus ID string. 
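 *
 * As an illustrative sketch only (not part of the original documentation; the
 * PCI bus ID string is hypothetical and error checking is omitted):
 *
 * \code
   CUdevice dev;
   cuDeviceGetByPCIBusId(&dev, "0000:3B:00.0");   // hypothetical bus ID
 * \endcode
 *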
* * \param dev - Returned device handle * * \param pciBusId - String in one of the following forms: * [domain]:[bus]:[device].[function] * [domain]:[bus]:[device] * [bus]:[device].[function] * where \p domain, \p bus, \p device, and \p function are all hexadecimal values * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGet, * ::cuDeviceGetAttribute, * ::cuDeviceGetPCIBusId, * ::cudaDeviceGetByPCIBusId */ CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); /** * \brief Returns a PCI Bus Id string for the device * * Returns an ASCII string identifying the device \p dev in the NULL-terminated * string pointed to by \p pciBusId. \p len specifies the maximum length of the * string that may be returned. * * \param pciBusId - Returned identifier string for the device in the following format * [domain]:[bus]:[device].[function] * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. * pciBusId should be large enough to store 13 characters including the NULL-terminator. * * \param len - Maximum length of string to store in \p name * * \param dev - Device to get identifier string for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuDeviceGet, * ::cuDeviceGetAttribute, * ::cuDeviceGetByPCIBusId, * ::cudaDeviceGetPCIBusId */ CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); /** * \brief Gets an interprocess handle for a previously allocated event * * Takes as input a previously allocated event. This event must have been * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING * flags set. This opaque handle may be copied into other processes and * opened with ::cuIpcOpenEventHandle to allow efficient hardware * synchronization between GPU work in different processes. * * After the event has been opened in the importing process, * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and * ::cuEventQuery may be used in either process. Performing operations * on the imported event after the exported event has been freed * with ::cuEventDestroy will result in undefined behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param pHandle - Pointer to a user allocated CUipcEventHandle * in which to return the opaque event handle * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and * ::CU_EVENT_DISABLE_TIMING flags. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuEventCreate, * ::cuEventDestroy, * ::cuEventSynchronize, * ::cuEventQuery, * ::cuStreamWaitEvent, * ::cuIpcOpenEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcOpenMemHandle, * ::cuIpcCloseMemHandle, * ::cudaIpcGetEventHandle */ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); /** * \brief Opens an interprocess event handle for use in the current process * * Opens an interprocess event handle exported from another process with * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. 
* This event must be freed with ::cuEventDestroy. * * Performing operations on the imported event after the exported event has * been freed with ::cuEventDestroy will result in undefined behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param phEvent - Returns the imported event * \param handle - Interprocess handle to open * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuEventCreate, * ::cuEventDestroy, * ::cuEventSynchronize, * ::cuEventQuery, * ::cuStreamWaitEvent, * ::cuIpcGetEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcOpenMemHandle, * ::cuIpcCloseMemHandle, * ::cudaIpcOpenEventHandle */ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); /** * \brief Gets an interprocess memory handle for an existing device memory * allocation * * Takes a pointer to the base of an existing device memory allocation created * with ::cuMemAlloc and exports it for use in another process. This is a * lightweight operation and may be called multiple times on an allocation * without adverse effects. * * If a region of memory is freed with ::cuMemFree and a subsequent call * to ::cuMemAlloc returns memory with the same device address, * ::cuIpcGetMemHandle will return a unique handle for the * new memory. * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return * the handle in. * \param dptr - Base pointer to previously allocated device memory * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuMemAlloc, * ::cuMemFree, * ::cuIpcGetEventHandle, * ::cuIpcOpenEventHandle, * ::cuIpcOpenMemHandle, * ::cuIpcCloseMemHandle, * ::cudaIpcGetMemHandle */ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); /** * \brief Opens an interprocess memory handle exported from another process * and returns a device pointer usable in the local process. * * Maps memory exported from another process with ::cuIpcGetMemHandle into * the current device address space. For contexts on different devices * ::cuIpcOpenMemHandle can attempt to enable peer access between the * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. * ::cuDeviceCanAccessPeer can determine if a mapping is possible. * * Contexts that may open ::CUipcMemHandles are restricted in the following way. * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened * by one ::CUcontext per ::CUdevice per other process. * * If the memory handle has already been opened by the current context, the * reference count on the handle is incremented by 1 and the existing device pointer * is returned. * * Memory returned from ::cuIpcOpenMemHandle must be freed with * ::cuIpcCloseMemHandle. * * Calling ::cuMemFree on an exported memory region before calling * ::cuIpcCloseMemHandle in the importing context will result in undefined * behavior. 
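 *
 * For illustration only (not part of the original documentation; how the
 * handle is transported between processes is unspecified here, sizes are
 * arbitrary, and error checking is omitted), a typical exporter/importer
 * pairing looks like:
 *
 * \code
   // Exporting process:
   CUdeviceptr dptr;
   CUipcMemHandle handle;
   cuMemAlloc(&dptr, 1 << 20);
   cuIpcGetMemHandle(&handle, dptr);
   // ... send 'handle' to another process (e.g. over a pipe or socket) ...

   // Importing process:
   CUdeviceptr mapped;
   cuIpcOpenMemHandle(&mapped, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
   // ... use 'mapped' ...
   cuIpcCloseMemHandle(mapped);
 * \endcode
 *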
* * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param pdptr - Returned device pointer * \param handle - ::CUipcMemHandle to open * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_TOO_MANY_PEERS, * ::CUDA_ERROR_INVALID_VALUE * * \note No guarantees are made about the address returned in \p *pdptr. * In particular, multiple processes may not receive the same address for the same \p handle. * * \sa * ::cuMemAlloc, * ::cuMemFree, * ::cuIpcGetEventHandle, * ::cuIpcOpenEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcCloseMemHandle, * ::cuCtxEnablePeerAccess, * ::cuDeviceCanAccessPeer, * ::cudaIpcOpenMemHandle */ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); /** * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle * * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. * When the reference count reaches 0, this API unmaps the memory. The original allocation * in the exporting process as well as imported mappings in other processes * will be unaffected. * * Any resources used to enable peer access will be freed if this is the * last mapping using them. * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \sa * ::cuMemAlloc, * ::cuMemFree, * ::cuIpcGetEventHandle, * ::cuIpcOpenEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcOpenMemHandle, * ::cudaIpcCloseMemHandle */ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); /** * \brief Registers an existing host memory range for use by CUDA * * Page-locks the memory range specified by \p p and \p bytesize and maps it * for the device(s) as specified by \p Flags. This memory range also is added * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed * directly by the device, it can be read or written with much higher bandwidth * than pageable memory that has not been registered. Page-locking excessive * amounts of memory may degrade system performance, since it reduces the amount * of memory available to the system for paging. As a result, this function is * best used sparingly to register staging areas for data exchange between * host and device. * * This function has limited support on Mac OS X. OS 10.7 or higher is required. * * The \p Flags parameter enables different options to be specified that * affect the allocation, as follows. * * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be * considered as pinned memory by all CUDA contexts, not just the one that * performed the allocation. * * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address * space. The device pointer to the memory may be obtained by calling * ::cuMemHostGetDevicePointer(). 
 *
 * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
 * I/O memory space, e.g. the PCI Express resource of a 3rd party device.
 *
 * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
 * that is considered read-only by the device. On platforms without
 * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
 * required in order to register memory mapped to the CPU as read-only. Support
 * for the use of this flag can be queried from the device attribute
 * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with
 * a current context associated with a device that does not have this attribute
 * set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
 *
 * All of these flags are orthogonal to one another: a developer may page-lock
 * memory that is portable or mapped with no restrictions.
 *
 * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
 * devices that do not support mapped pinned memory. The failure is deferred
 * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
 * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
 *
 * For devices that have a non-zero value for the device attribute
 * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
 * can also be accessed from the device using the host pointer \p p.
 * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
 * match the original host pointer \p p and depends on the devices visible to the
 * application. If all devices visible to the application have a non-zero value for the
 * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
 * will match the original pointer \p p. If any device visible to the application
 * has a zero value for the device attribute, the device pointer returned by
 * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
 * but it will be suitable for use on all devices provided Unified Virtual Addressing
 * is enabled. In such systems, it is valid to access the memory using either pointer
 * on devices that have a non-zero value for the device attribute. Note however that
 * such devices should access the memory using only one of the two pointers and not both.
 *
 * The memory page-locked by this function must be unregistered with
 * ::cuMemHostUnregister().
 *
 * \param p - Host pointer to memory to page-lock
 * \param bytesize - Size in bytes of the address range to page-lock
 * \param Flags - Flags for allocation request
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_OUT_OF_MEMORY,
 * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
 * ::CUDA_ERROR_NOT_PERMITTED,
 * ::CUDA_ERROR_NOT_SUPPORTED
 * \notefnerr
 *
 * \sa
 * ::cuMemHostUnregister,
 * ::cuMemHostGetFlags,
 * ::cuMemHostGetDevicePointer,
 * ::cudaHostRegister
 */
CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);

/**
 * \brief Unregisters a memory range that was registered with cuMemHostRegister.
 *
 * Unmaps the memory range whose base address is specified by \p p, and makes
 * it pageable again.
 *
 * The base address must be the same one specified to ::cuMemHostRegister().
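 *
 * As an illustrative sketch only (not part of the original documentation; the
 * buffer comes from \c malloc and therefore assumes \c <stdlib.h>, the size is
 * arbitrary, and error checking is omitted), a registration is typically
 * paired with a matching unregistration:
 *
 * \code
   void *buf = malloc(4096);                       // hypothetical host buffer
   CUdeviceptr devPtr;
   cuMemHostRegister(buf, 4096, CU_MEMHOSTREGISTER_DEVICEMAP);
   cuMemHostGetDevicePointer(&devPtr, buf, 0);
   // ... access the memory from the device via devPtr ...
   cuMemHostUnregister(buf);
   free(buf);
 * \endcode
 *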
 *
 * \param p - Host pointer to memory to unregister
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_OUT_OF_MEMORY,
 * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
 * \notefnerr
 *
 * \sa
 * ::cuMemHostRegister,
 * ::cudaHostUnregister
 */
CUresult CUDAAPI cuMemHostUnregister(void *p);

/**
 * \brief Copies memory
 *
 * Copies data between two pointers.
 * \p dst and \p src are base pointers of the destination and source, respectively.
 * \p ByteCount specifies the number of bytes to copy.
 * Note that this function infers the type of the transfer (host to host, host to
 * device, device to device, or device to host) from the pointer values. This
 * function is only allowed in contexts which support unified addressing.
 *
 * \param dst - Destination unified virtual address space pointer
 * \param src - Source unified virtual address space pointer
 * \param ByteCount - Size of memory copy in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 * \note_sync
 * \note_memcpy
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
 * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
 * ::cudaMemcpy,
 * ::cudaMemcpyToSymbol,
 * ::cudaMemcpyFromSymbol
 */
CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);

/**
 * \brief Copies device memory between two contexts
 *
 * Copies from device memory in one context to device memory in another
 * context. \p dstDevice is the base device pointer of the destination memory
 * and \p dstContext is the destination context. \p srcDevice is the base
 * device pointer of the source memory and \p srcContext is the source context.
 * \p ByteCount specifies the number of bytes to copy.
 *
 * \param dstDevice - Destination device pointer
 * \param dstContext - Destination context
 * \param srcDevice - Source device pointer
 * \param srcContext - Source context
 * \param ByteCount - Size of memory copy in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 * \note_sync
 *
 * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
 * ::cuMemcpy3DPeerAsync,
 * ::cudaMemcpyPeer
 */
CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);

/**
 * \brief Copies memory from Host to Device
 *
 * Copies from host memory to device memory. \p dstDevice and \p srcHost are
 * the base addresses of the destination and source, respectively. \p ByteCount
 * specifies the number of bytes to copy.
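 *
 * For illustration only (not part of the original documentation; sizes are
 * arbitrary and error checking is omitted), a host-to-device copy is commonly
 * paired with a device-to-host copy of the results:
 *
 * \code
   float hostData[256] = {0};
   CUdeviceptr devData;
   cuMemAlloc(&devData, sizeof(hostData));
   cuMemcpyHtoD(devData, hostData, sizeof(hostData));
   // ... launch work that reads and writes devData ...
   cuMemcpyDtoH(hostData, devData, sizeof(hostData));
   cuMemFree(devData);
 * \endcode
 *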
* * \param dstDevice - Destination device pointer * \param srcHost - Source host pointer * \param ByteCount - Size of memory copy in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMemcpy, * ::cudaMemcpyToSymbol */ CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); /** * \brief Copies memory from Device to Host * * Copies from device to host memory. \p dstHost and \p srcDevice specify the * base pointers of the destination and source, respectively. \p ByteCount * specifies the number of bytes to copy. * * \param dstHost - Destination host pointer * \param srcDevice - Source device pointer * \param ByteCount - Size of memory copy in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMemcpy, * ::cudaMemcpyFromSymbol */ CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); /** * \brief Copies memory from Device to Device * * Copies from device memory to device memory. \p dstDevice and \p srcDevice * are the base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. 
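 *
 * A minimal device-to-device copy, for illustration (both allocations below
 * are hypothetical and live in the current context):
 *
 * \code
   size_t bytes = 4096;
   CUdeviceptr srcBuf, dstBuf;
   cuMemAlloc(&srcBuf, bytes);
   cuMemAlloc(&dstBuf, bytes);
   // Copy 4 KB from one device allocation to another within the same context.
   cuMemcpyDtoD(dstBuf, srcBuf, bytes);
 * \endcode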
* * \param dstDevice - Destination device pointer * \param srcDevice - Source device pointer * \param ByteCount - Size of memory copy in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMemcpy, * ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol */ CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); /** * \brief Copies memory from Device to Array * * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset * specify the CUDA array handle and starting index of the destination data. * \p srcDevice specifies the base pointer of the source. \p ByteCount * specifies the number of bytes to copy. * * \param dstArray - Destination array * \param dstOffset - Offset in bytes of destination array * \param srcDevice - Source device pointer * \param ByteCount - Size of memory copy in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMemcpyToArray */ CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); /** * \brief Copies memory from Array to Device * * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the * base pointer of the destination and must be naturally aligned with the CUDA * array elements. \p srcArray and \p srcOffset specify the CUDA array handle * and the offset in bytes into the array where the copy is to begin. * \p ByteCount specifies the number of bytes to copy and must be evenly * divisible by the array element size. 
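 *
 * For illustration (assuming \c array is a hypothetical 1D CUDA array holding
 * 256 floats that was created earlier with ::cuArrayCreate()):
 *
 * \code
   CUdeviceptr devPtr;
   cuMemAlloc(&devPtr, 256 * sizeof(float));
   // Copy the whole array, starting at byte offset 0, into linear device memory.
   cuMemcpyAtoD(devPtr, array, 0, 256 * sizeof(float));
 * \endcode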
 *
 * \param dstDevice - Destination device pointer
 * \param srcArray - Source array
 * \param srcOffset - Offset in bytes of source array
 * \param ByteCount - Size of memory copy in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 * \note_sync
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
 * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
 * ::cudaMemcpyFromArray
 */
CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);

/**
 * \brief Copies memory from Host to Array
 *
 * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
 * specify the CUDA array handle and starting offset in bytes of the destination
 * data. \p srcHost specifies the base address of the source. \p ByteCount specifies
 * the number of bytes to copy.
 *
 * \param dstArray - Destination array
 * \param dstOffset - Offset in bytes of destination array
 * \param srcHost - Source host pointer
 * \param ByteCount - Size of memory copy in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 * \note_sync
 * \note_memcpy
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
 * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
 * ::cudaMemcpyToArray
 */
CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);

/**
 * \brief Copies memory from Array to Host
 *
 * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
 * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
 * array handle and starting offset in bytes of the source data.
 * \p ByteCount specifies the number of bytes to copy.
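 *
 * An illustrative round trip through a 1D CUDA array (the array handle
 * \c array is hypothetical and assumed to hold at least 256 floats):
 *
 * \code
   float data[256] = {0};
   // Upload host data into the array, then read it back into the same buffer.
   cuMemcpyHtoA(array, 0, data, sizeof(data));
   cuMemcpyAtoH(data, array, 0, sizeof(data));
 * \endcode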
 *
 * \param dstHost - Destination host pointer
 * \param srcArray - Source array
 * \param srcOffset - Offset in bytes of source array
 * \param ByteCount - Size of memory copy in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 * \note_sync
 * \note_memcpy
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
 * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
 * ::cudaMemcpyFromArray
 */
CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);

/**
 * \brief Copies memory from Array to Array
 *
 * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
 * specify the handles of the destination and source CUDA arrays for the copy,
 * respectively. \p dstOffset and \p srcOffset specify the destination and
 * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
 * bytes to be copied. The elements of the two CUDA arrays need not have the
 * same format, but they must have the same element size, and \p ByteCount must
 * be evenly divisible by that size.
 *
 * \param dstArray - Destination array
 * \param dstOffset - Offset in bytes of destination array
 * \param srcArray - Source array
 * \param srcOffset - Offset in bytes of source array
 * \param ByteCount - Size of memory copy in bytes
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 * \note_sync
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
 * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
 * ::cudaMemcpyArrayToArray
 */
CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);

/**
 * \brief Copies memory for 2D arrays
 *
 * Perform a 2D memory copy according to the parameters specified in \p pCopy.
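 *
 * A host-to-device 2D copy might be set up as in the following sketch
 * (illustrative only; the buffer names, widths, heights and pitches are
 * hypothetical, and the structure fields used are documented below):
 *
 * \code
   CUDA_MEMCPY2D cpy;
   memset(&cpy, 0, sizeof(cpy));
   cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
   cpy.srcHost       = hostImage;        // assumed host buffer
   cpy.srcPitch      = widthInBytes;     // tightly packed rows on the host
   cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
   cpy.dstDevice     = devImage;         // assumed allocation from cuMemAllocPitch()
   cpy.dstPitch      = devPitch;
   cpy.WidthInBytes  = widthInBytes;
   cpy.Height        = height;
   cuMemcpy2D(&cpy);
 * \endcode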
* The ::CUDA_MEMCPY2D structure is defined as: * * \code typedef struct CUDA_MEMCPY2D_st { unsigned int srcXInBytes, srcY; CUmemorytype srcMemoryType; const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; unsigned int srcPitch; unsigned int dstXInBytes, dstY; CUmemorytype dstMemoryType; void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; unsigned int dstPitch; unsigned int WidthInBytes; unsigned int Height; } CUDA_MEMCPY2D; * \endcode * where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * * \code typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_DEVICE = 0x02, CU_MEMORYTYPE_ARRAY = 0x03, CU_MEMORYTYPE_UNIFIED = 0x04 } CUmemorytype; * \endcode * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch * specify the (host) base address of the source data and the bytes per row to * apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch * specify the (device) base address of the source data and the bytes per row * to apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are * ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are * ignored. * * - ::srcXInBytes and ::srcY specify the base address of the source data for * the copy. * * \par * For host pointers, the starting address is * \code void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; * \endcode * * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - ::dstXInBytes and ::dstY specify the base address of the destination data * for the copy. * * \par * For host pointers, the base address is * \code void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; * \endcode * * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. 
* * - ::WidthInBytes and ::Height specify the width (in bytes) and height of * the 2D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * * \par * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies * (device to device, CUDA array to device, CUDA array to CUDA array), * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). * ::cuMemcpy2DUnaligned() does not have this restriction, but may run * significantly slower in the cases where ::cuMemcpy2D() would have returned * an error code. * * \param pCopy - Parameters for the memory copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMemcpy2D, * ::cudaMemcpy2DToArray, * ::cudaMemcpy2DFromArray */ CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); /** * \brief Copies memory for 2D arrays * * Perform a 2D memory copy according to the parameters specified in \p pCopy. * The ::CUDA_MEMCPY2D structure is defined as: * * \code typedef struct CUDA_MEMCPY2D_st { unsigned int srcXInBytes, srcY; CUmemorytype srcMemoryType; const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; unsigned int srcPitch; unsigned int dstXInBytes, dstY; CUmemorytype dstMemoryType; void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; unsigned int dstPitch; unsigned int WidthInBytes; unsigned int Height; } CUDA_MEMCPY2D; * \endcode * where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * * \code typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_DEVICE = 0x02, CU_MEMORYTYPE_ARRAY = 0x03, CU_MEMORYTYPE_UNIFIED = 0x04 } CUmemorytype; * \endcode * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch * specify the (host) base address of the source data and the bytes per row to * apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch * specify the (device) base address of the source data and the bytes per row * to apply. ::srcArray is ignored. 
* * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are * ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are * ignored. * * - ::srcXInBytes and ::srcY specify the base address of the source data for * the copy. * * \par * For host pointers, the starting address is * \code void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; * \endcode * * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - ::dstXInBytes and ::dstY specify the base address of the destination data * for the copy. * * \par * For host pointers, the base address is * \code void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; * \endcode * * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes and ::Height specify the width (in bytes) and height of * the 2D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * * \par * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies * (device to device, CUDA array to device, CUDA array to CUDA array), * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). * ::cuMemcpy2DUnaligned() does not have this restriction, but may run * significantly slower in the cases where ::cuMemcpy2D() would have returned * an error code. 
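 *
 * For illustration, an intra-device copy whose pitches did not come from
 * ::cuMemAllocPitch() (the descriptor \c cpy below is hypothetical) can fall
 * back to the unaligned variant:
 *
 * \code
   // cpy describes a device-to-device 2D copy with arbitrary, tightly packed pitches.
   if (cuMemcpy2D(&cpy) != CUDA_SUCCESS) {
       cuMemcpy2DUnaligned(&cpy);   // slower, but not restricted to cuMemAllocPitch() pitches
   }
 * \endcode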
* * \param pCopy - Parameters for the memory copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMemcpy2D, * ::cudaMemcpy2DToArray, * ::cudaMemcpy2DFromArray */ CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); /** * \brief Copies memory for 3D arrays * * Perform a 3D memory copy according to the parameters specified in * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: * * \code typedef struct CUDA_MEMCPY3D_st { unsigned int srcXInBytes, srcY, srcZ; unsigned int srcLOD; CUmemorytype srcMemoryType; const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; unsigned int srcPitch; // ignored when src is array unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 unsigned int dstXInBytes, dstY, dstZ; unsigned int dstLOD; CUmemorytype dstMemoryType; void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; unsigned int dstPitch; // ignored when dst is array unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 unsigned int WidthInBytes; unsigned int Height; unsigned int Depth; } CUDA_MEMCPY3D; * \endcode * where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * * \code typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_DEVICE = 0x02, CU_MEMORYTYPE_ARRAY = 0x03, CU_MEMORYTYPE_UNIFIED = 0x04 } CUmemorytype; * \endcode * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and * ::srcHeight specify the (host) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and * ::srcHeight specify the (device) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and * ::srcHeight are ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. 
* * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data, the bytes per row, * and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data, the bytes per * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and * ::dstHeight are ignored. * * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source * data for the copy. * * \par * For host pointers, the starting address is * \code void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; * \endcode * * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - dstXInBytes, ::dstY and ::dstZ specify the base address of the * destination data for the copy. * * \par * For host pointers, the base address is * \code void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; * \endcode * * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height * and depth of the 3D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * - If specified, ::srcHeight must be greater than or equal to ::Height + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. * * \par * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). * * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be * set to 0. * * \param pCopy - Parameters for the memory copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMemcpy3D */ CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); /** * \brief Copies memory between contexts * * Perform a 3D memory copy according to the parameters specified in * \p pCopy. 
See the definition of the ::CUDA_MEMCPY3D_PEER structure
 * for documentation of its parameters.
 *
 * \param pCopy - Parameters for the memory copy
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 * \note_sync
 *
 * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
 * ::cuMemcpy3DPeerAsync,
 * ::cudaMemcpy3DPeer
 */
CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);

/**
 * \brief Copies memory asynchronously
 *
 * Copies data between two pointers.
 * \p dst and \p src are base pointers of the destination and source, respectively.
 * \p ByteCount specifies the number of bytes to copy.
 * Note that this function infers the type of the transfer (host to host, host to
 * device, device to device, or device to host) from the pointer values. This
 * function is only allowed in contexts which support unified addressing.
 *
 * \param dst - Destination unified virtual address space pointer
 * \param src - Source unified virtual address space pointer
 * \param ByteCount - Size of memory copy in bytes
 * \param hStream - Stream identifier
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_HANDLE
 * \notefnerr
 * \note_async
 * \note_null_stream
 * \note_memcpy
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
 * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
 * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
 * ::cuMemsetD32, ::cuMemsetD32Async,
 * ::cudaMemcpyAsync,
 * ::cudaMemcpyToSymbolAsync,
 * ::cudaMemcpyFromSymbolAsync
 */
CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);

/**
 * \brief Copies device memory between two contexts asynchronously.
 *
 * Copies from device memory in one context to device memory in another
 * context. \p dstDevice is the base device pointer of the destination memory
 * and \p dstContext is the destination context. \p srcDevice is the base
 * device pointer of the source memory and \p srcContext is the source context.
 * \p ByteCount specifies the number of bytes to copy.
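 *
 * An illustrative sketch (the contexts \c ctxA and \c ctxB, both device
 * allocations, the byte count and the stream are hypothetical and assumed to
 * have been created beforehand):
 *
 * \code
   // Enqueue a copy of 'bytes' bytes from ctxB's allocation to ctxA's allocation.
   cuMemcpyPeerAsync(dstPtrInCtxA, ctxA, srcPtrInCtxB, ctxB, bytes, hStream);
   cuStreamSynchronize(hStream);
 * \endcode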
* * \param dstDevice - Destination device pointer * \param dstContext - Destination context * \param srcDevice - Source device pointer * \param srcContext - Source context * \param ByteCount - Size of memory copy in bytes * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, * ::cuMemcpy3DPeerAsync, * ::cudaMemcpyPeerAsync */ CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); /** * \brief Copies memory from Host to Device * * Copies from host memory to device memory. \p dstDevice and \p srcHost are * the base addresses of the destination and source, respectively. \p ByteCount * specifies the number of bytes to copy. * * \param dstDevice - Destination device pointer * \param srcHost - Source host pointer * \param ByteCount - Size of memory copy in bytes * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemcpyAsync, * ::cudaMemcpyToSymbolAsync */ CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); /** * \brief Copies memory from Device to Host * * Copies from device to host memory. \p dstHost and \p srcDevice specify the * base pointers of the destination and source, respectively. \p ByteCount * specifies the number of bytes to copy. 
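 *
 * For illustration (the device pointer, byte count and stream are
 * hypothetical), a download that overlaps with other work generally uses
 * page-locked host memory and an explicit stream:
 *
 * \code
   void *hostBuf;
   cuMemAllocHost(&hostBuf, bytes);            // page-locked host staging buffer
   cuMemcpyDtoHAsync(hostBuf, devPtr, bytes, hStream);
   // ... enqueue more work on other streams here ...
   cuStreamSynchronize(hStream);               // hostBuf is valid to read after this
 * \endcode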
* * \param dstHost - Destination host pointer * \param srcDevice - Source device pointer * \param ByteCount - Size of memory copy in bytes * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemcpyAsync, * ::cudaMemcpyFromSymbolAsync */ CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); /** * \brief Copies memory from Device to Device * * Copies from device memory to device memory. \p dstDevice and \p srcDevice * are the base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. * * \param dstDevice - Destination device pointer * \param srcDevice - Source device pointer * \param ByteCount - Size of memory copy in bytes * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemcpyAsync, * ::cudaMemcpyToSymbolAsync, * ::cudaMemcpyFromSymbolAsync */ CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); /** * \brief Copies memory from Host to Array * * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset * specify the CUDA array handle and starting offset in bytes of the * destination data. \p srcHost specifies the base address of the source. * \p ByteCount specifies the number of bytes to copy. 
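 *
 * A minimal illustrative use (the CUDA array \c array, the page-locked host
 * buffer \c pinned, the byte count and the stream are assumed to exist already):
 *
 * \code
   // Asynchronously fill the first 'bytes' bytes of the array from pinned host memory.
   cuMemcpyHtoAAsync(array, 0, pinned, bytes, hStream);
   cuStreamSynchronize(hStream);
 * \endcode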
* * \param dstArray - Destination array * \param dstOffset - Offset in bytes of destination array * \param srcHost - Source host pointer * \param ByteCount - Size of memory copy in bytes * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemcpyToArrayAsync */ CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); /** * \brief Copies memory from Array to Host * * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA * array handle and starting offset in bytes of the source data. * \p ByteCount specifies the number of bytes to copy. * * \param dstHost - Destination pointer * \param srcArray - Source array * \param srcOffset - Offset in bytes of source array * \param ByteCount - Size of memory copy in bytes * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemcpyFromArrayAsync */ CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); /** * \brief Copies memory for 2D arrays * * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
* The ::CUDA_MEMCPY2D structure is defined as: * * \code typedef struct CUDA_MEMCPY2D_st { unsigned int srcXInBytes, srcY; CUmemorytype srcMemoryType; const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; unsigned int srcPitch; unsigned int dstXInBytes, dstY; CUmemorytype dstMemoryType; void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; unsigned int dstPitch; unsigned int WidthInBytes; unsigned int Height; } CUDA_MEMCPY2D; * \endcode * where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * * \code typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_DEVICE = 0x02, CU_MEMORYTYPE_ARRAY = 0x03, CU_MEMORYTYPE_UNIFIED = 0x04 } CUmemorytype; * \endcode * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch * specify the (host) base address of the source data and the bytes per row to * apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch * specify the (device) base address of the source data and the bytes per row * to apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are * ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are * ignored. * * - ::srcXInBytes and ::srcY specify the base address of the source data for * the copy. * * \par * For host pointers, the starting address is * \code void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; * \endcode * * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - ::dstXInBytes and ::dstY specify the base address of the destination data * for the copy. * * \par * For host pointers, the base address is * \code void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; * \endcode * * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. 
 *
 * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
 * the 2D copy being performed.
 * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
 * ::srcXInBytes, and ::dstPitch must be greater than or equal to
 * ::WidthInBytes + ::dstXInBytes.
 *
 * \par
 * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
 * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
 * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
 * (device to device, CUDA array to device, CUDA array to CUDA array),
 * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
 *
 * \param pCopy - Parameters for the memory copy
 * \param hStream - Stream identifier
 *
 * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_HANDLE
 * \notefnerr
 * \note_async
 * \note_null_stream
 *
 * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
 * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
 * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
 * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
 * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
 * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
 * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
 * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
 * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
 * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
 * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
 * ::cuMemsetD32, ::cuMemsetD32Async,
 * ::cudaMemcpy2DAsync,
 * ::cudaMemcpy2DToArrayAsync,
 * ::cudaMemcpy2DFromArrayAsync
 */
CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);

/**
 * \brief Copies memory for 3D arrays
 *
 * Perform a 3D memory copy according to the parameters specified in
 * \p pCopy.
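 *
 * As an illustrative sketch (extents, pitches, buffer names and the stream
 * are hypothetical; the structure fields used are documented below), a
 * host-to-device copy of a tightly packed volume can be queued as:
 *
 * \code
   CUDA_MEMCPY3D cpy;
   memset(&cpy, 0, sizeof(cpy));
   cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
   cpy.srcHost       = hostVolume;
   cpy.srcPitch      = widthInBytes;
   cpy.srcHeight     = height;
   cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
   cpy.dstDevice     = devVolume;
   cpy.dstPitch      = widthInBytes;
   cpy.dstHeight     = height;
   cpy.WidthInBytes  = widthInBytes;
   cpy.Height        = height;
   cpy.Depth         = depth;
   cuMemcpy3DAsync(&cpy, hStream);
 * \endcode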
The ::CUDA_MEMCPY3D structure is defined as: * * \code typedef struct CUDA_MEMCPY3D_st { unsigned int srcXInBytes, srcY, srcZ; unsigned int srcLOD; CUmemorytype srcMemoryType; const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; unsigned int srcPitch; // ignored when src is array unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 unsigned int dstXInBytes, dstY, dstZ; unsigned int dstLOD; CUmemorytype dstMemoryType; void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; unsigned int dstPitch; // ignored when dst is array unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 unsigned int WidthInBytes; unsigned int Height; unsigned int Depth; } CUDA_MEMCPY3D; * \endcode * where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * * \code typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_DEVICE = 0x02, CU_MEMORYTYPE_ARRAY = 0x03, CU_MEMORYTYPE_UNIFIED = 0x04 } CUmemorytype; * \endcode * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and * ::srcHeight specify the (host) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and * ::srcHeight specify the (device) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and * ::srcHeight are ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data, the bytes per row, * and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data, the bytes per * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and * ::dstHeight are ignored. * * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source * data for the copy. * * \par * For host pointers, the starting address is * \code void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; * \endcode * * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. 
* * - dstXInBytes, ::dstY and ::dstZ specify the base address of the * destination data for the copy. * * \par * For host pointers, the base address is * \code void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); * \endcode * * \par * For device pointers, the starting address is * \code CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; * \endcode * * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height * and depth of the 3D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * - If specified, ::srcHeight must be greater than or equal to ::Height + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. * * \par * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). * * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be * set to 0. * * \param pCopy - Parameters for the memory copy * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemcpy3DAsync */ CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); /** * \brief Copies memory between contexts asynchronously. * * Perform a 3D memory copy according to the parameters specified in * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure * for documentation of its parameters. * * \param pCopy - Parameters for the memory copy * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, * ::cuMemcpy3DPeerAsync, * ::cudaMemcpy3DPeerAsync */ CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); /** * \brief Initializes device memory * * Sets the memory range of \p N 8-bit values to the specified value * \p uc. 
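 *
 * For illustration (the allocation below is hypothetical), zero-filling a
 * buffer byte by byte:
 *
 * \code
   CUdeviceptr devPtr;
   cuMemAlloc(&devPtr, 4096);
   // Set all 4096 bytes to zero.
   cuMemsetD8(devPtr, 0, 4096);
 * \endcode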
* * \param dstDevice - Destination device pointer * \param uc - Value to set * \param N - Number of elements * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset */ CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); /** * \brief Initializes device memory * * Sets the memory range of \p N 16-bit values to the specified value * \p us. The \p dstDevice pointer must be two byte aligned. * * \param dstDevice - Destination device pointer * \param us - Value to set * \param N - Number of elements * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset */ CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); /** * \brief Initializes device memory * * Sets the memory range of \p N 32-bit values to the specified value * \p ui. The \p dstDevice pointer must be four byte aligned. 
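 *
 * As an illustrative example (the allocation \c devPtr and the element count
 * \c n are hypothetical), a buffer of \c n floats can be filled with the value
 * 1.0f by passing its IEEE-754 bit pattern:
 *
 * \code
   // 0x3f800000 is the 32-bit pattern of the float value 1.0f.
   cuMemsetD32(devPtr, 0x3f800000U, n);
 * \endcode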
* * \param dstDevice - Destination device pointer * \param ui - Value to set * \param N - Number of elements * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32Async, * ::cudaMemset */ CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); /** * \brief Initializes device memory * * Sets the 2D memory range of \p Width 8-bit values to the specified value * \p uc. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param uc - Value to set * \param Width - Width of row * \param Height - Number of rows * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset2D */ CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); /** * \brief Initializes device memory * * Sets the 2D memory range of \p Width 16-bit values to the specified value * \p us. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be two byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). 
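 *
 * An illustrative pitched fill (\c Width and \c Height are hypothetical
 * element and row counts):
 *
 * \code
   CUdeviceptr devPtr;
   size_t pitch;
   // Request a pitched allocation of Width x Height 16-bit elements
   // (4 is one of the element-size granularities the allocator accepts).
   cuMemAllocPitch(&devPtr, &pitch, Width * sizeof(unsigned short), Height, 4);
   // Fill every element with 0xFFFF, stepping one pitch per row.
   cuMemsetD2D16(devPtr, pitch, 0xFFFF, Width, Height);
 * \endcode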
* * \param dstDevice - Destination device pointer * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param us - Value to set * \param Width - Width of row * \param Height - Number of rows * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset2D */ CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); /** * \brief Initializes device memory * * Sets the 2D memory range of \p Width 32-bit values to the specified value * \p ui. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be four byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param ui - Value to set * \param Width - Width of row * \param Height - Number of rows * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset2D */ CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); /** * \brief Sets device memory * * Sets the memory range of \p N 8-bit values to the specified value * \p uc. 
* * \param dstDevice - Destination device pointer * \param uc - Value to set * \param N - Number of elements * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemsetAsync */ CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); /** * \brief Sets device memory * * Sets the memory range of \p N 16-bit values to the specified value * \p us. The \p dstDevice pointer must be two byte aligned. * * \param dstDevice - Destination device pointer * \param us - Value to set * \param N - Number of elements * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemsetAsync */ CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); /** * \brief Sets device memory * * Sets the memory range of \p N 32-bit values to the specified value * \p ui. The \p dstDevice pointer must be four byte aligned. 
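*
* As a sketch of stream-ordered use (the names d_data and numInts are assumed
* to come from an earlier ::cuMemAlloc; error checking is omitted):
* \code
    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
    cuMemsetD32Async(d_data, 0, numInts, stream);     // enqueued on the stream, returns immediately
    // ... enqueue work that reads d_data on the same stream ...
    cuStreamSynchronize(stream);
    cuStreamDestroy(stream);
* \endcode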
* * \param dstDevice - Destination device pointer * \param ui - Value to set * \param N - Number of elements * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, * ::cudaMemsetAsync */ CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); /** * \brief Sets device memory * * Sets the 2D memory range of \p Width 8-bit values to the specified value * \p uc. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param uc - Value to set * \param Width - Width of row * \param Height - Number of rows * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset2DAsync */ CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); /** * \brief Sets device memory * * Sets the 2D memory range of \p Width 16-bit values to the specified value * \p us. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be two byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). 
* * \param dstDevice - Destination device pointer * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param us - Value to set * \param Width - Width of row * \param Height - Number of rows * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset2DAsync */ CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); /** * \brief Sets device memory * * Sets the 2D memory range of \p Width 32-bit values to the specified value * \p ui. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be four byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param ui - Value to set * \param Width - Width of row * \param Height - Number of rows * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async, * ::cudaMemset2DAsync */ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); /** * \brief Creates a 1D or 2D CUDA array * * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
* The ::CUDA_ARRAY_DESCRIPTOR is defined as: * * \code typedef struct { unsigned int Width; unsigned int Height; CUarray_format Format; unsigned int NumChannels; } CUDA_ARRAY_DESCRIPTOR; * \endcode * where: * * - \p Width and \p Height are the width and height of the CUDA array (in * elements); the CUDA array is one-dimensional if height is 0, two-dimensional * otherwise; * - ::Format specifies the format of the elements; ::CUarray_format is * defined as: * \code typedef enum CUarray_format_enum { CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, CU_AD_FORMAT_SIGNED_INT8 = 0x08, CU_AD_FORMAT_SIGNED_INT16 = 0x09, CU_AD_FORMAT_SIGNED_INT32 = 0x0a, CU_AD_FORMAT_HALF = 0x10, CU_AD_FORMAT_FLOAT = 0x20 } CUarray_format; * \endcode * - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * * Here are examples of CUDA array descriptions: * * Description for a CUDA array of 2048 floats: * \code CUDA_ARRAY_DESCRIPTOR desc; desc.Format = CU_AD_FORMAT_FLOAT; desc.NumChannels = 1; desc.Width = 2048; desc.Height = 1; * \endcode * * Description for a 64 x 64 CUDA array of floats: * \code CUDA_ARRAY_DESCRIPTOR desc; desc.Format = CU_AD_FORMAT_FLOAT; desc.NumChannels = 1; desc.Width = 64; desc.Height = 64; * \endcode * * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit * float16's: * \code CUDA_ARRAY_DESCRIPTOR desc; desc.Format = CU_AD_FORMAT_HALF; desc.NumChannels = 4; desc.Width = width; desc.Height = height; * \endcode * * Description for a \p width x \p height CUDA array of 16-bit elements, each * of which is two 8-bit unsigned chars: * \code CUDA_ARRAY_DESCRIPTOR desc; desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; desc.NumChannels = 2; desc.Width = width; desc.Height = height; * \endcode * * \param pHandle - Returned array * \param pAllocateArray - Array descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMallocArray */ CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); /** * \brief Get a 1D or 2D CUDA array descriptor * * Returns in \p *pArrayDescriptor a descriptor containing information on the * format and dimensions of the CUDA array \p hArray. It is useful for * subroutines that have been passed a CUDA array, but need to know the CUDA * array parameters for validation or other purposes.
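*
* As a small sketch (assuming hArray is an existing CUDA array; error checking
* omitted), the descriptor can be queried and inspected like this:
* \code
    CUDA_ARRAY_DESCRIPTOR query;
    cuArrayGetDescriptor(&query, hArray);
    // query.Format and query.NumChannels describe the element layout;
    // query.Width and query.Height give the array extents in elements.
* \endcode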
* * \param pArrayDescriptor - Returned array descriptor * \param hArray - Array to get descriptor of * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaArrayGetInfo */ CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); /** * \brief Returns the layout properties of a sparse CUDA array * * Returns the layout properties of a sparse CUDA array in \p sparseProperties * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE * ::CUDA_ERROR_INVALID_VALUE will be returned. * * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero. * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero. * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. * * \return * ::CUDA_SUCCESS * ::CUDA_ERROR_INVALID_VALUE * * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES * \param[in] array - CUDA array to get the sparse properties of * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync */ CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); /** * \brief Returns the layout properties of a sparse CUDA mipmapped array * * Returns the sparse array layout properties in \p sparseProperties * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE * ::CUDA_ERROR_INVALID_VALUE will be returned. * * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth * is less than that of the tile. * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. 
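*
* As an illustrative sketch (assuming hMipmappedArray was created with the
* ::CUDA_ARRAY3D_SPARSE flag; error checking omitted):
* \code
    CUDA_ARRAY_SPARSE_PROPERTIES props;
    cuMipmappedArrayGetSparseProperties(&props, hMipmappedArray);
    // props.tileExtent gives the tile dimensions; props.miptailSize the mip tail size.
    if (props.flags & CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL) {
        // A single mip tail is shared by all layers.
    }
* \endcode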
* * \return * ::CUDA_SUCCESS * ::CUDA_ERROR_INVALID_VALUE * * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync */ CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); /** * \brief Returns the memory requirements of a CUDA array * * Returns the memory requirements of a CUDA array in \p memoryRequirements * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING * ::CUDA_ERROR_INVALID_VALUE will be returned. * * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size * represents the total size of the CUDA array. * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment * represents the alignment necessary for mapping the CUDA array. * * \return * ::CUDA_SUCCESS * ::CUDA_ERROR_INVALID_VALUE * * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS * \param[in] array - CUDA array to get the memory requirements of * \param[in] device - Device to get the memory requirements for * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync */ CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device); /** * \brief Returns the memory requirements of a CUDA mipmapped array * * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING * ::CUDA_ERROR_INVALID_VALUE will be returned. * * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size * represents the total size of the CUDA mipmapped array. * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment * represents the alignment necessary for mapping the CUDA mipmapped * array. * * \return * ::CUDA_SUCCESS * ::CUDA_ERROR_INVALID_VALUE * * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of * \param[in] device - Device to get the memory requirements for * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync */ CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device); /** * \brief Gets a CUDA array plane from a CUDA array * * Returns in \p pPlaneArray a CUDA array that represents a single format plane * of the CUDA array \p hArray. * * If \p planeIdx is greater than the maximum number of planes in this array or if the array does * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned. * * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. 
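*
* As a short sketch (assuming hArray is an NV12 CUDA array; error checking
* omitted), the two planes can be retrieved as follows:
* \code
    CUarray lumaPlane, chromaPlane;
    cuArrayGetPlane(&lumaPlane, hArray, 0);    // full-size, 1-channel 8-bit luma plane
    cuArrayGetPlane(&chromaPlane, hArray, 1);  // half-size, 2-channel 8-bit chroma plane
* \endcode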
* * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx * \param hArray - Multiplanar CUDA array * \param planeIdx - Plane index * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa * ::cuArrayCreate, * ::cudaGetArrayPlane */ CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); /** * \brief Destroys a CUDA array * * Destroys the CUDA array \p hArray. * * \param hArray - Array to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ARRAY_IS_MAPPED, * ::CUDA_ERROR_CONTEXT_IS_DESTROYED * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaFreeArray */ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); /** * \brief Creates a 3D CUDA array * * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: * * \code typedef struct { unsigned int Width; unsigned int Height; unsigned int Depth; CUarray_format Format; unsigned int NumChannels; unsigned int Flags; } CUDA_ARRAY3D_DESCRIPTOR; * \endcode * where: * * - \p Width, \p Height, and \p Depth are the width, height, and depth of the * CUDA array (in elements); the following types of CUDA arrays can be allocated: * - A 1D array is allocated if \p Height and \p Depth extents are both zero. * - A 2D array is allocated if only \p Depth extent is zero. * - A 3D array is allocated if all three extents are non-zero. * - A 1D layered CUDA array is allocated if only \p Height is zero and the * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number * of layers is determined by the depth extent. * - A 2D layered CUDA array is allocated if all three extents are non-zero and * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number * of layers is determined by the depth extent. * - A cubemap CUDA array is allocated if all three extents are non-zero and the * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, * where the six layers represent the six faces of a cube. The order of the six * layers in memory is the same as that listed in ::CUarray_cubemap_face. * - A cubemap layered CUDA array is allocated if all three extents are non-zero, * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. 
* A cubemap layered CUDA array is a special type of 2D layered CUDA array that * consists of a collection of cubemaps. The first six layers represent the first * cubemap, the next six layers form the second cubemap, and so on. * * - ::Format specifies the format of the elements; ::CUarray_format is * defined as: * \code typedef enum CUarray_format_enum { CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, CU_AD_FORMAT_SIGNED_INT8 = 0x08, CU_AD_FORMAT_SIGNED_INT16 = 0x09, CU_AD_FORMAT_SIGNED_INT32 = 0x0a, CU_AD_FORMAT_HALF = 0x10, CU_AD_FORMAT_FLOAT = 0x20 } CUarray_format; * \endcode * * - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * * - ::Flags may be set to * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, * \p Depth specifies the number of layers, not the depth of a 3D array. * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array * to a surface reference. * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, * then \p Depth must be a multiple of six. * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. * Texture gather can only be performed on 2D CUDA arrays. * * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. * * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. * * * * * * * * * * * * * * * * * * * * * * * * * * *
* <table>
* <tr><td><b>CUDA array type</b></td>
* <td><b>Valid extents that must always be met<br>
* {(width range in elements), (height range), (depth range)}</b></td>
* <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
* {(width range in elements), (height range), (depth range)}</b></td></tr>
* <tr><td>1D</td>
* <td>{ (1,TEXTURE1D_WIDTH), 0, 0 }</td>
* <td>{ (1,SURFACE1D_WIDTH), 0, 0 }</td></tr>
* <tr><td>2D</td>
* <td>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</td>
* <td>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</td></tr>
* <tr><td>3D</td>
* <td>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
* <br>OR<br>
* { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }</td>
* <td>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }</td></tr>
* <tr><td>1D Layered</td>
* <td>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }</td>
* <td>{ (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }</td></tr>
* <tr><td>2D Layered</td>
* <td>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }</td>
* <td>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }</td></tr>
* <tr><td>Cubemap</td>
* <td>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</td>
* <td>{ (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }</td></tr>
* <tr><td>Cubemap Layered</td>
* <td>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</td>
* <td>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }</td></tr>
* </table>
* * Here are examples of CUDA array descriptions: * * Description for a CUDA array of 2048 floats: * \code CUDA_ARRAY3D_DESCRIPTOR desc; desc.Format = CU_AD_FORMAT_FLOAT; desc.NumChannels = 1; desc.Width = 2048; desc.Height = 0; desc.Depth = 0; * \endcode * * Description for a 64 x 64 CUDA array of floats: * \code CUDA_ARRAY3D_DESCRIPTOR desc; desc.Format = CU_AD_FORMAT_FLOAT; desc.NumChannels = 1; desc.Width = 64; desc.Height = 64; desc.Depth = 0; * \endcode * * Description for a \p width x \p height x \p depth CUDA array of 64-bit, * 4x16-bit float16's: * \code CUDA_ARRAY3D_DESCRIPTOR desc; desc.Format = CU_AD_FORMAT_HALF; desc.NumChannels = 4; desc.Width = width; desc.Height = height; desc.Depth = depth; * \endcode * * \param pHandle - Returned array * \param pAllocateArray - 3D array descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaMalloc3DArray */ CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); /** * \brief Get a 3D CUDA array descriptor * * Returns in \p *pArrayDescriptor a descriptor containing information on the * format and dimensions of the CUDA array \p hArray. It is useful for * subroutines that have been passed a CUDA array, but need to know the CUDA * array parameters for validation or other purposes. * * This function may be called on 1D and 2D arrays, in which case the \p Height * and/or \p Depth members of the descriptor struct will be set to 0. 
* * \param pArrayDescriptor - Returned 3D array descriptor * \param hArray - 3D array to get descriptor of * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_CONTEXT_IS_DESTROYED * \notefnerr * * \sa ::cuArray3DCreate, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cudaArrayGetInfo */ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); /** * \brief Creates a CUDA mipmapped array * * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. * * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: * * \code typedef struct { unsigned int Width; unsigned int Height; unsigned int Depth; CUarray_format Format; unsigned int NumChannels; unsigned int Flags; } CUDA_ARRAY3D_DESCRIPTOR; * \endcode * where: * * - \p Width, \p Height, and \p Depth are the width, height, and depth of the * CUDA array (in elements); the following types of CUDA arrays can be allocated: * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. * - A 2D mipmapped array is allocated if only \p Depth extent is zero. * - A 3D mipmapped array is allocated if all three extents are non-zero. * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number * of layers is determined by the depth extent. * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number * of layers is determined by the depth extent. * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, * where the six layers represent the six faces of a cube. The order of the six * layers in memory is the same as that listed in ::CUarray_cubemap_face. * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. * A cubemap layered CUDA array is a special type of 2D layered CUDA array that * consists of a collection of cubemaps. The first six layers represent the first * cubemap, the next six layers form the second cubemap, and so on. 
* * - ::Format specifies the format of the elements; ::CUarray_format is * defined as: * \code typedef enum CUarray_format_enum { CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, CU_AD_FORMAT_SIGNED_INT8 = 0x08, CU_AD_FORMAT_SIGNED_INT16 = 0x09, CU_AD_FORMAT_SIGNED_INT32 = 0x0a, CU_AD_FORMAT_HALF = 0x10, CU_AD_FORMAT_FLOAT = 0x20 } CUarray_format; * \endcode * * - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * * - ::Flags may be set to * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, * \p Depth specifies the number of layers, not the depth of a 3D array. * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to * bind a mipmap level of the CUDA mipmapped array to a surface reference. * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, * then \p Depth must be a multiple of six. * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. * Texture gather can only be performed on 2D CUDA mipmapped arrays. * * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. * * * * * * * * * * * * * * * * * * * * * * * * * * *
* <table>
* <tr><td><b>CUDA array type</b></td>
* <td><b>Valid extents that must always be met<br>
* {(width range in elements), (height range), (depth range)}</b></td>
* <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
* {(width range in elements), (height range), (depth range)}</b></td></tr>
* <tr><td>1D</td>
* <td>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</td>
* <td>{ (1,SURFACE1D_WIDTH), 0, 0 }</td></tr>
* <tr><td>2D</td>
* <td>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</td>
* <td>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</td></tr>
* <tr><td>3D</td>
* <td>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
* <br>OR<br>
* { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }</td>
* <td>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }</td></tr>
* <tr><td>1D Layered</td>
* <td>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }</td>
* <td>{ (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }</td></tr>
* <tr><td>2D Layered</td>
* <td>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }</td>
* <td>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }</td></tr>
* <tr><td>Cubemap</td>
* <td>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</td>
* <td>{ (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }</td></tr>
* <tr><td>Cubemap Layered</td>
* <td>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</td>
* <td>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }</td></tr>
* </table>
* * * \param pHandle - Returned mipmapped array * \param pMipmappedArrayDesc - mipmapped array descriptor * \param numMipmapLevels - Number of mipmap levels * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa * ::cuMipmappedArrayDestroy, * ::cuMipmappedArrayGetLevel, * ::cuArrayCreate, * ::cudaMallocMipmappedArray */ CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); /** * \brief Gets a mipmap level of a CUDA mipmapped array * * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level * of the CUDA mipmapped array \p hMipmappedArray. * * If \p level is greater than the maximum number of levels in this mipmapped array, * ::CUDA_ERROR_INVALID_VALUE is returned. * * \param pLevelArray - Returned mipmap level CUDA array * \param hMipmappedArray - CUDA mipmapped array * \param level - Mipmap level * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa * ::cuMipmappedArrayCreate, * ::cuMipmappedArrayDestroy, * ::cuArrayCreate, * ::cudaGetMipmappedArrayLevel */ CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); /** * \brief Destroys a CUDA mipmapped array * * Destroys the CUDA mipmapped array \p hMipmappedArray. * * \param hMipmappedArray - Mipmapped array to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ARRAY_IS_MAPPED, * ::CUDA_ERROR_CONTEXT_IS_DESTROYED * \notefnerr * * \sa * ::cuMipmappedArrayCreate, * ::cuMipmappedArrayGetLevel, * ::cuArrayCreate, * ::cudaFreeMipmappedArray */ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); /** @} */ /* END CUDA_MEM */ /** * \defgroup CUDA_VA Virtual Memory Management * * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the virtual memory management functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Allocate an address range reservation. * * Reserves a virtual address range based on the given parameters, giving * the starting address of the range in \p ptr. This API requires a system that * supports UVA. The size and address parameters must be a multiple of the * host page size and the alignment must be a power of two or zero for default * alignment. 
* * \param[out] ptr - Resulting pointer to start of virtual address range allocated * \param[in] size - Size of the reserved virtual address range requested * \param[in] alignment - Alignment of the reserved virtual address range requested * \param[in] addr - Fixed starting address range requested * \param[in] flags - Currently unused, must be zero * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemAddressFree */ CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); /** * \brief Free an address range reservation. * * Frees a virtual address range reserved by ::cuMemAddressReserve. The size * must match what was given to ::cuMemAddressReserve, and the ptr given must * match what was returned from ::cuMemAddressReserve. * * \param[in] ptr - Starting address of the virtual address range to free * \param[in] size - Size of the virtual address region to free * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemAddressReserve */ CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); /** * \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties * * This creates a memory allocation on the target device specified through the * \p prop structure. The created allocation will not have any device or host * mappings. The generic memory \p handle for the allocation can be * mapped to the address space of the calling process via ::cuMemMap. This handle * cannot be transmitted directly to other processes (see * ::cuMemExportToShareableHandle). On Windows, the caller must also pass * an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which * limits or allows access to this handle for a recipient process (see * ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this * allocation must be a multiple of the value given via * ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM * flag. * If ::CUmemAllocationProp::allocFlags::usage contains the ::CU_MEM_CREATE_USAGE_TILE_POOL flag then * the memory allocation is intended only to be used as a backing tile pool for sparse CUDA arrays * and sparse CUDA mipmapped arrays * (see ::cuMemMapArrayAsync). * * \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. * \param[in] size - Size of the allocation requested * \param[in] prop - Properties of the allocation to create. * \param[in] flags - Flags for future use; must be zero now. * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle */ CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); /** * \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. * * Frees the memory that was allocated on a device through cuMemCreate.
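*
* As a compact sketch of the create/map/release lifecycle that ::cuMemRelease
* participates in (device ordinal 0, a single-granularity size, and no error
* checking are assumed):
* \code
    CUmemAllocationProp prop = {0};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;                          // assumed device ordinal

    size_t granularity = 0;
    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    size_t size = granularity;                     // size must be a multiple of the granularity

    CUdeviceptr ptr;
    CUmemGenericAllocationHandle handle;
    cuMemAddressReserve(&ptr, size, 0, 0, 0);      // reserve a VA range
    cuMemCreate(&handle, size, &prop, 0);          // create the physical allocation
    cuMemMap(ptr, size, 0, handle, 0);             // map it into the reserved range

    CUmemAccessDesc access = {0};
    access.location = prop.location;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(ptr, size, &access, 1);         // make the mapping accessible

    // ... use the memory at ptr ...

    cuMemUnmap(ptr, size);
    cuMemRelease(handle);
    cuMemAddressFree(ptr, size);
* \endcode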
* * The memory allocation will be freed when all outstanding mappings to the memory * are unmapped and when all outstanding references to the handle (including its * shareable counterparts) are also released. The generic memory handle can be * freed when there are still outstanding mappings made with this handle. Each * time a recipient process imports a shareable handle, it needs to pair it with * ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle, * the behavior is undefined. * * \param[in] handle Value of handle which was returned previously by cuMemCreate. * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuMemCreate */ CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); /** * \brief Maps an allocation handle to a reserved virtual address range. * * Maps \p size bytes of the memory represented by \p handle, starting from byte \p offset, * to the address range [\p ptr, \p ptr + \p size]. This range must be an * address reservation previously reserved with ::cuMemAddressReserve, and * \p offset + \p size must be less than the size of the memory allocation. * \p ptr, \p size, and \p offset must each be a multiple of the value given via * ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. * * Please note that calling ::cuMemMap does not make the address accessible; * the caller needs to update accessibility of a contiguous mapped VA * range by calling ::cuMemSetAccess. * * Once a recipient process obtains a shareable memory handle * from ::cuMemImportFromShareableHandle, the process must * use ::cuMemMap to map the memory into its address ranges before * setting accessibility with ::cuMemSetAccess. * * ::cuMemMap can only create mappings on VA range reservations * that are not currently mapped. * * \param[in] ptr - Address where memory will be mapped. * \param[in] size - Size of the memory mapping. * \param[in] offset - Offset into the memory represented by * - \p handle from which to start mapping * - Note: currently must be zero. * \param[in] handle - Handle to a shareable memory * \param[in] flags - Flags for future use; must be zero now. * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle */ CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); /** * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays * * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count.
* The structure ::CUarrayMapInfo is defined as follow: \code typedef struct CUarrayMapInfo_st { CUresourcetype resourceType; union { CUmipmappedArray mipmap; CUarray array; } resource; CUarraySparseSubresourceType subresourceType; union { struct { unsigned int level; unsigned int layer; unsigned int offsetX; unsigned int offsetY; unsigned int offsetZ; unsigned int extentWidth; unsigned int extentHeight; unsigned int extentDepth; } sparseLevel; struct { unsigned int layer; unsigned long long offset; unsigned long long size; } miptail; } subresource; CUmemOperationType memOperationType; CUmemHandleType memHandleType; union { CUmemGenericAllocationHandle memHandle; } memHandle; unsigned long long offset; unsigned int deviceBitMask; unsigned int flags; unsigned int reserved[2]; } CUarrayMapInfo; \endcode * * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. * * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. * ::CUarraySparseSubresourceType_enum is defined as: \code typedef enum CUarraySparseSubresourceType_enum { CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 } CUarraySparseSubresourceType; \endcode * * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. * * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. * These offsets and extents must be aligned to the corresponding tile dimension. * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, * must be zero. 
* For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, * must be zero. * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties * * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. * Both, mip tail offset and mip tail size must be aligned to the tile size. * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. * Otherwise, must be zero. * * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING * flag set the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored. * * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: \code typedef enum CUmemOperationType_enum { CU_MEM_OPERATION_TYPE_MAP = 1, CU_MEM_OPERATION_TYPE_UNMAP = 2 } CUmemOperationType; \endcode * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. * * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation * is performed. ::CUarrayMapInfo::memHandle must be NULL. * * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. * * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * * \param[in] mapInfoList - List of ::CUarrayMapInfo * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations * * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties */ CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); /** * \brief Unmap the backing memory of a given address range. 
* * The range must be the entire contiguous address range that was mapped to. In * other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped * by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed * if there are no existing mappings and there are no unreleased memory handles. * * When ::cuMemUnmap returns successfully, the address range is converted to an * address reservation and can be used for future calls to ::cuMemMap. Any new * mapping to this virtual address will need to have access granted through * ::cuMemSetAccess, as all mappings start with no accessibility setup. * * \param[in] ptr - Starting address for the virtual address range to unmap * \param[in] size - Size of the virtual address range to unmap * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * \note_sync * * \sa ::cuMemCreate, ::cuMemAddressReserve */ CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); /** * \brief Set the access flags for each location specified in \p desc for the given virtual address range * * Given the virtual address range via \p ptr and \p size, and the locations * in the array given by \p desc and \p count, set the access flags for the * target locations. The range must be a fully mapped address range * containing all allocations created by ::cuMemMap / ::cuMemCreate. * * \param[in] ptr - Starting address for the virtual address range * \param[in] size - Length of the virtual address range * \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the * - mapping for each location specified * \param[in] count - Number of ::CUmemAccessDesc in \p desc * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * \note_sync * * \sa ::cuMemSetAccess, ::cuMemCreate, ::cuMemMap */ CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); /** * \brief Get the access \p flags set for the given \p location and \p ptr * * \param[out] flags - Flags set for this location * \param[in] location - Location for which to check the flags * \param[in] ptr - Address for which to check the access flags * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemSetAccess */ CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); /** * \brief Exports an allocation to a requested shareable handle type * * Given a CUDA memory handle, create a shareable memory * allocation handle that can be used to share the memory with other * processes. The recipient process can convert the shareable handle back into a * CUDA memory handle using ::cuMemImportFromShareableHandle and map * it with ::cuMemMap. The implementation of what this handle is and how it * can be transferred is defined by the requested handle type in \p handleType. * * Once all shareable handles are closed and the allocation is released, the allocated * memory referenced will be released back to the OS and uses of the CUDA handle afterward * will lead to undefined behavior. * * This API can also be used in conjunction with other APIs (e.g.
Vulkan, OpenGL) * that support importing memory from the shareable type. * * \param[out] shareableHandle - Pointer to the location in which to store the requested handle type * \param[in] handle - CUDA handle for the memory allocation * \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) * \param[in] flags - Reserved, must be zero * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemImportFromShareableHandle */ CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); /** * \brief Imports an allocation from a requested shareable handle type. * * If the current process cannot support the memory described by this shareable * handle, this API will error as CUDA_ERROR_NOT_SUPPORTED. * * \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.) * created on devices under an SLI group may not be supported, and thus this API will * return CUDA_ERROR_NOT_SUPPORTED. * There is no guarantee that the contents of \p handle will be the same CUDA memory handle * for the same given OS shareable handle, or the same underlying allocation. * * \param[out] handle - CUDA Memory handle for the memory allocation. * \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. * \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease */ CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); /** * \brief Calculates either the minimal or recommended granularity * * Calculates either the minimal or recommended granularity * for a given allocation specification and returns it in granularity. This * granularity can be used as a multiple for alignment, size, or address mapping. * * \param[out] granularity Returned granularity.
* \param[in] prop Property for which to determine the granularity for * \param[in] option Determines which granularity to return * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemCreate, ::cuMemMap */ CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); /** * \brief Retrieve the contents of the property structure defining properties for this handle * * \param[out] prop - Pointer to a properties structure which will hold the information about this handle * \param[in] handle - Handle which to perform the query on * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemCreate, ::cuMemImportFromShareableHandle */ CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); /** * \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. * * The handle is guaranteed to be the same handle value used to map the memory. If the address * requested is not mapped, the function will fail. The returned handle must be released with * corresponding number of calls to ::cuMemRelease. * * \note The address \p addr, can be any address in a range previously mapped * by ::cuMemMap, and not necessarily the start address. * * \param[out] handle CUDA Memory handle for the backing memory allocation. * \param[in] addr Memory address to query, that has been mapped previously. * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap */ CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); /** @} */ /* END CUDA_VA */ /** * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator * * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. * Functions for controlling the behavior of the underlying allocator. * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the stream ordered memory allocator exposed by the * low-level CUDA driver application programming interface. * * @{ * * \section CUDA_MALLOC_ASYNC_overview overview * * The asynchronous allocator allows the user to allocate and free in stream order. * All asynchronous accesses of the allocation must happen between * the stream executions of the allocation and the free. If the memory is accessed * outside of the promised stream order, a use before allocation / use after free error * will cause undefined behavior. * * The allocator is free to reallocate the memory as long as it can guarantee * that compliant memory accesses will not overlap temporally. * The allocator may refer to internal stream ordering as well as inter-stream dependencies * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. 
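 *
 * As an informal sketch (not part of the official documentation) of the basic
 * allocate/use/free pattern, assuming 'stream' is an existing CUstream and
 * error checking is omitted:
 \code
    CUdeviceptr dptr;
    size_t bytesize = 1 << 20;                    // example size

    cuMemAllocAsync(&dptr, bytesize, stream);     // allocation ordered in 'stream'
    // ... launch work in 'stream' (or in a stream that depends on it) that uses 'dptr' ...
    cuMemFreeAsync(dptr, stream);                 // free ordered after that work
    cuStreamSynchronize(stream);                  // optional: make the free observable on the host
 \endcode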
* * \section CUDA_MALLOC_ASYNC_support Supported Platforms * * Whether or not a device supports the integrated stream ordered memory allocator * may be queried by calling ::cuDeviceGetAttribute() with the device attribute * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED */ /** * \brief Frees memory with stream ordered semantics * * Inserts a free operation into \p hStream. * The allocation must not be accessed after stream execution reaches the free. * After this API returns, accessing the memory from any subsequent work launched on the GPU * or querying its pointer attributes results in undefined behavior. * * \note During stream capture, this function results in the creation of a free node and * must therefore be passed the address of a graph allocation. * * \param dptr - memory to free * \param hStream - The stream establishing the stream ordering contract. * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), * ::CUDA_ERROR_NOT_SUPPORTED */ CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); /** * \brief Allocates memory with stream ordered semantics * * Inserts an allocation operation into \p hStream. * A pointer to the allocated memory is returned immediately in *dptr. * The allocation must not be accessed until the allocation operation completes. * The allocation comes from the memory pool current to the stream's device. * * \note The default memory pool of a device contains device memory from that device. * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation * operation completes before work submitted in a separate stream runs. * \note During stream capture, this function results in the creation of an allocation node. In this case, * the allocation is owned by the graph instead of the memory pool. The memory pool's properties * are used to set the node's creation parameters. * * \param[out] dptr - Returned device pointer * \param[in] bytesize - Number of bytes to allocate * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute */ CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); /** * \brief Tries to release memory back to the OS * * Releases memory back to the OS until the pool contains fewer than minBytesToKeep * reserved bytes, or there is no more memory that the allocator can safely release. * The allocator cannot release OS allocations that back outstanding asynchronous allocations. * The OS allocations may happen at different granularity from the user allocations. * * \note: Allocations that have not been freed count as outstanding. * \note: Allocations that have been asynchronously freed but whose completion has * not been observed on the host (e.g. by a synchronize) can count as outstanding.
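 *
 * A minimal sketch (assuming 'dev' is an existing CUdevice, 'stream' is a
 * stream on that device, and error checking is omitted) that releases as much
 * of the default pool's reserved memory as possible:
 \code
    CUmemoryPool pool;
    cuDeviceGetDefaultMemPool(&pool, dev);
    cuStreamSynchronize(stream);      // make earlier asynchronous frees observable
    cuMemPoolTrimTo(pool, 0);         // keep no reserved bytes beyond what is still in use
 \endcode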
* * \param[in] pool - The memory pool to trim * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have * at least minBytesToKeep bytes reserved after the operation. * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, * ::cuDeviceGetMemPool, ::cuMemPoolCreate */ CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); /** * \brief Sets attributes of a memory pool * * Supported attributes are: * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) * Amount of reserved memory in bytes to hold onto before trying * to release memory back to the OS. When more than the release * threshold bytes of memory are held by the memory pool, the * allocator will try to release memory back to the OS on the * next call to stream, event or context synchronize. (default 0) * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) * Allow ::cuMemAllocAsync to use memory asynchronously freed * in another stream as long as a stream ordering dependency * of the allocating stream on the free action exists. * Cuda events and null stream interactions can create the required * stream ordered dependencies. (default enabled) * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) * Allow reuse of already completed frees when there is no dependency * between the free and allocation. (default enabled) * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) * Allow ::cuMemAllocAsync to insert new stream dependencies * in order to establish the stream ordering required to reuse * a piece of memory released by ::cuMemFreeAsync (default enabled). * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) * Reset the high watermark that tracks the amount of backing memory that was * allocated for the memory pool. It is illegal to set this attribute to a non-zero value. * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) * Reset the high watermark that tracks the amount of used memory that was * allocated for the memory pool. * * \param[in] pool - The memory pool to modify * \param[in] attr - The attribute to modify * \param[in] value - Pointer to the value to assign * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, * ::cuDeviceGetMemPool, ::cuMemPoolCreate */ CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); /** * \brief Gets attributes of a memory pool * * Supported attributes are: * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) * Amount of reserved memory in bytes to hold onto before trying * to release memory back to the OS. When more than the release * threshold bytes of memory are held by the memory pool, the * allocator will try to release memory back to the OS on the * next call to stream, event or context synchronize. (default 0) * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) * Allow ::cuMemAllocAsync to use memory asynchronously freed * in another stream as long as a stream ordering dependency * of the allocating stream on the free action exists. * Cuda events and null stream interactions can create the required * stream ordered dependencies. 
(default enabled) * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) * Allow reuse of already completed frees when there is no dependency * between the free and allocation. (default enabled) * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) * Allow ::cuMemAllocAsync to insert new stream dependencies * in order to establish the stream ordering required to reuse * a piece of memory released by ::cuMemFreeAsync (default enabled). * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) * Amount of backing memory currently allocated for the mempool * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) * High watermark of backing memory allocated for the mempool since the * last time it was reset. * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) * Amount of memory from the pool that is currently in use by the application. * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) * High watermark of the amount of memory from the pool that was in use by the application. * * \param[in] pool - The memory pool to get attributes of * \param[in] attr - The attribute to get * \param[out] value - Retrieved value * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, * ::cuDeviceGetMemPool, ::cuMemPoolCreate */ CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); /** * \brief Controls visibility of pools between devices * * \param[in] pool - The pool being modified * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu. * \param[in] count - Number of descriptors in the map array. * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, * ::cuDeviceGetMemPool, ::cuMemPoolCreate */ CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); /** * \brief Returns the accessibility of a pool from a device * * Returns the accessibility of the pool's memory from the specified location. * * \param[out] flags - the accessibility of the pool from the specified location * \param[in] memPool - the pool being queried * \param[in] location - the location accessing the pool * * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, * ::cuDeviceGetMemPool, ::cuMemPoolCreate */ CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); /** * \brief Creates a memory pool * * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines * the properties of the pool such as the backing device and IPC capabilities. * * By default, the pool's memory will be accessible from the device it is allocated on. * * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. 
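 *
 * A minimal sketch (device ordinal 0 is an assumption, error checking omitted)
 * that creates an explicit pool and makes it the current pool used by
 * ::cuMemAllocAsync on that device:
 \code
    CUdevice dev;
    cuDeviceGet(&dev, 0);                           // assumed device ordinal 0

    CUmemPoolProps props = {0};
    props.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id   = dev;
    props.handleTypes   = CU_MEM_HANDLE_TYPE_NONE;  // no IPC support needed here

    CUmemoryPool pool;
    cuMemPoolCreate(&pool, &props);
    cuDeviceSetMemPool(dev, pool);   // subsequent cuMemAllocAsync on 'dev' draws from 'pool'
 \endcode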
* * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NOT_SUPPORTED * * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle */ CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); /** * \brief Destroys the specified memory pool * * If any pointers obtained from this pool haven't been freed or * the pool has free operations that haven't completed * when ::cuMemPoolDestroy is invoked, the function will return immediately and the * resources associated with the pool will be released automatically * once there are no more outstanding allocations. * * Destroying the current mempool of a device sets the default mempool of * that device as the current mempool for that device. * * \note A device's default memory pool cannot be destroyed. * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate */ CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); /** * \brief Allocates memory from a specified pool with stream ordered semantics. * * Inserts an allocation operation into \p hStream. * A pointer to the allocated memory is returned immediately in *dptr. * The allocation must not be accessed until the allocation operation completes. * The allocation comes from the specified memory pool. * * \note * - The specified memory pool may be from a device different than that of the specified \p hStream. * * - Basic stream ordering allows future work submitted into the same stream to use the allocation. * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation * operation completes before work submitted in a separate stream runs. * * \note During stream capture, this function results in the creation of an allocation node. In this case, * the allocation is owned by the graph instead of the memory pool. The memory pool's properties * are used to set the node's creation parameters. * * \param[out] dptr - Returned device pointer * \param[in] bytesize - Number of bytes to allocate * \param[in] pool - The pool to allocate from * \param[in] hStream - The stream establishing the stream ordering semantic * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, * ::cuMemPoolSetAttribute */ CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); /** * \brief Exports a memory pool to the requested handle type. * * Given an IPC capable mempool, create an OS handle to share the pool with another process. * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. * The implementation of what the shareable handle is and how it can be transferred is defined by the requested * handle type. * * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE.
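 *
 * A minimal Linux-oriented sketch (error checking omitted) that creates an
 * IPC-capable pool and exports it as a POSIX file descriptor; how the
 * descriptor is then transferred to the other process (for example over a
 * UNIX domain socket) is outside the scope of this API:
 \code
    CUmemPoolProps props = {0};
    props.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id   = 0;                        // assumed device ordinal
    props.handleTypes   = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

    CUmemoryPool pool;
    cuMemPoolCreate(&pool, &props);

    int fd = -1;
    cuMemPoolExportToShareableHandle(&fd, pool,
                                     CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
    // send 'fd' to the importing process
 \endcode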
* * \param[out] handle_out - Returned OS handle * \param[in] pool - pool to export * \param[in] handleType - the type of handle to create * \param[in] flags - must be 0 * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute */ CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); /** * \brief imports a memory pool from a shared handle. * * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. * * \note Imported memory pools do not support creating new allocations. * As such imported memory pools may not be used in cuDeviceSetMemPool * or ::cuMemAllocFromPoolAsync calls. * * \param[out] pool_out - Returned memory pool * \param[in] handle - OS handle of the pool to open * \param[in] handleType - The type of handle being imported * \param[in] flags - must be 0 * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer */ CUresult CUDAAPI cuMemPoolImportFromShareableHandle( CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags); /** * \brief Export data to share a memory pool allocation between processes. * * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. * The recipient process can import the allocation with the ::cuMemPoolImportPointer api. * The data is not a handle and may be shared through any IPC mechanism. * * \param[out] shareData_out - Returned export data * \param[in] ptr - pointer to memory being exported * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer */ CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); /** * \brief Import a memory pool allocation from another process. * * Returns in \p ptr_out a pointer to the imported memory. * The imported memory must not be accessed before the allocation operation completes * in the exporting process. The imported memory must be freed from all importing processes before * being freed in the exporting process. The pointer may be freed with cuMemFree * or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed * on the importing process before the free operation on the exporting process. * * \note The cuMemFreeAsync api may be used in the exporting process before * the cuMemFreeAsync operation completes in its stream as long as the * cuMemFreeAsync in the exporting process specifies a stream with * a stream dependency on the importing process's cuMemFreeAsync. 
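 *
 * A minimal sketch of the importing side (assuming 'fd', the exported pool
 * handle, and 'shareData', produced by ::cuMemPoolExportPointer in the
 * exporting process, were received over IPC; error checking omitted):
 \code
    CUmemoryPool pool;
    cuMemPoolImportFromShareableHandle(&pool, (void *)(uintptr_t)fd,
                                       CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);

    CUdeviceptr ptr;
    cuMemPoolImportPointer(&ptr, pool, &shareData);
    // ... use 'ptr' once the exporting process's allocation is known to be complete ...
    cuMemFree(ptr);   // free here before the exporting process frees its copy
 \endcode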
* * \param[out] ptr_out - pointer to imported memory * \param[in] pool - pool from which to import * \param[in] shareData - data specifying the memory to import * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer */ CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); /** @} */ /* END CUDA_MALLOC_ASYNC */ /** * \defgroup CUDA_UNIFIED Unified Addressing * * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the unified addressing functions of the * low-level CUDA driver application programming interface. * * @{ * * \section CUDA_UNIFIED_overview Overview * * CUDA devices can share a unified address space with the host. * For these devices there is no distinction between a device * pointer and a host pointer -- the same pointer value may be * used to access memory from the host program and from a kernel * running on the device (with exceptions enumerated below). * * \section CUDA_UNIFIED_support Supported Platforms * * Whether or not a device supports unified addressing may be * queried by calling ::cuDeviceGetAttribute() with the device * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. * * Unified addressing is automatically enabled in 64-bit processes * * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values * * It is possible to look up information about the memory which backs a * pointer value. For instance, one may want to know if a pointer points * to host or device memory. As another example, in the case of device * memory, one may want to know on which CUDA device the memory * resides. These properties may be queried using the function * ::cuPointerGetAttribute() * * Since pointers are unique, it is not necessary to specify information * about the pointers specified to the various copy functions in the * CUDA API. The function ::cuMemcpy() may be used to perform a copy * between two pointers, ignoring whether they point to host or device * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() * unnecessary for devices supporting unified addressing). For * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be * used to specify that the CUDA driver should infer the location of the * pointer from its value. * * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory * * All host memory allocated in all contexts using ::cuMemAllocHost() and * ::cuMemHostAlloc() is always directly accessible from all contexts on * all devices that support unified addressing. This is the case regardless * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. * * The pointer value through which allocated host memory may be accessed * in kernels on all devices that support unified addressing is the same * as the pointer value through which that memory is accessed on the host, * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device * pointer for these allocations. * * Note that this is not the case for memory allocated using the flag * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. 
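 *
 * As an informal illustration of the pointer lookup described above (not part
 * of the official documentation; 'ptr' is an arbitrary pointer and error
 * checking beyond the attribute query itself is omitted):
 \code
    unsigned int memType = 0;
    CUresult res = cuPointerGetAttribute(&memType,
                                         CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                         (CUdeviceptr)(uintptr_t)ptr);
    if (res == CUDA_SUCCESS && memType == CU_MEMORYTYPE_DEVICE) {
        // device memory; the owning device can be found via CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
    } else if (res == CUDA_SUCCESS && memType == CU_MEMORYTYPE_HOST) {
        // host memory known to CUDA (allocated by or registered with the driver API)
    } else {
        // pointer not known to CUDA, or unified addressing not in effect
    }
 \endcode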
* * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory * * Upon enabling direct access from a context that supports unified addressing * to another peer context that supports unified addressing using * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible * by the current context. The device pointer value through * which any peer memory may be accessed in the current context * is the same pointer value through which that memory may be * accessed in the peer context. * * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing * * Not all memory may be accessed on devices through the same pointer * value through which they are accessed on the host. These exceptions * are host memory registered using ::cuMemHostRegister() and host memory * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these * exceptions, there exists a distinct host and device address for the * memory. The device address is guaranteed to not overlap any valid host * pointer range and is guaranteed to have the same value across all * contexts that support unified addressing. * * This device address may be queried using ::cuMemHostGetDevicePointer() * when a context using unified addressing is current. Either the host * or the unified device pointer value may be used to refer to this memory * through ::cuMemcpy() and similar functions using the * ::CU_MEMORYTYPE_UNIFIED memory type. * */ /** * \brief Returns information about a pointer * * The supported attributes are: * * - ::CU_POINTER_ATTRIBUTE_CONTEXT: * * Returns in \p *data the ::CUcontext in which \p ptr was allocated or * registered. * The type of \p data must be ::CUcontext *. * * If \p ptr was not allocated by, mapped by, or registered with * a ::CUcontext which uses unified virtual addressing then * ::CUDA_ERROR_INVALID_VALUE is returned. * * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: * * Returns in \p *data the physical memory type of the memory that * \p ptr addresses as a ::CUmemorytype enumerated value. * The type of \p data must be unsigned int. * * If \p ptr addresses device memory then \p *data is set to * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the * memory resides is the ::CUdevice of the ::CUcontext returned by the * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. * * If \p ptr addresses host memory then \p *data is set to * ::CU_MEMORYTYPE_HOST. * * If \p ptr was not allocated by, mapped by, or registered with * a ::CUcontext which uses unified virtual addressing then * ::CUDA_ERROR_INVALID_VALUE is returned. * * If the current ::CUcontext does not support unified virtual * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. * * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: * * Returns in \p *data the device pointer value through which * \p ptr may be accessed by kernels running in the current * ::CUcontext. * The type of \p data must be CUdeviceptr *. * * If there exists no device pointer value through which * kernels running in the current ::CUcontext may access * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. * * If there is no current ::CUcontext then * ::CUDA_ERROR_INVALID_CONTEXT is returned. * * Except in the exceptional disjoint addressing cases discussed * below, the value returned in \p *data will equal the input * value \p ptr. * * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: * * Returns in \p *data the host pointer value through which * \p ptr may be accessed by by the host program. 
* The type of \p data must be void **. * If there exists no host pointer value through which * the host program may directly access \p ptr then * ::CUDA_ERROR_INVALID_VALUE is returned. * * Except in the exceptional disjoint addressing cases discussed * below, the value returned in \p *data will equal the input * value \p ptr. * * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: * * Returns in \p *data two tokens for use with the nv-p2p.h Linux * kernel interface. \p data must be a struct of type * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. * * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). * Note that p2pToken and vaSpaceToken are only valid for the * lifetime of the source allocation. A subsequent allocation at * the same address may return completely different tokens. * Querying this attribute has a side effect of setting the attribute * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that * \p ptr points to. * * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: * * A boolean attribute which when set, ensures that synchronous memory operations * initiated on the region of memory that \p ptr points to will always synchronize. * See further documentation in the section titled "API synchronization behavior" * to learn more about cases when synchronous memory operations can * exhibit asynchronous behavior. * * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: * * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. * \p data must point to an unsigned long long. * * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. * Every memory allocation from any of the CUDA memory allocation APIs will * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs * from previous freed allocations. IDs are only unique within a single process. * * * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: * * Returns in \p *data a boolean that indicates whether the pointer points to * managed memory or not. * * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. * * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: * * Returns in \p *data an integer representing a device ordinal of a device against * which the memory was allocated or registered. * * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: * * Returns in \p *data a boolean that indicates if this pointer maps to * an allocation that is suitable for ::cudaIpcGetMemHandle. * * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: * * Returns in \p *data the starting address for the allocation referenced * by the device pointer \p ptr. Note that this is not necessarily the * address of the mapped region, but the address of the mappable address * range \p ptr references (e.g. from ::cuMemAddressReserve). * * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: * * Returns in \p *data the size for the allocation referenced by the device * pointer \p ptr. Note that this is not necessarily the size of the mapped * region, but the size of the mappable address range \p ptr references * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped * region, see ::cuMemGetAddressRange * * - ::CU_POINTER_ATTRIBUTE_MAPPED: * * Returns in \p *data a boolean that indicates if this pointer is in a * valid address range that is mapped to a backing allocation. * * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: * * Returns a bitmask of the allowed handle types for an allocation that may * be passed to ::cuMemExportToShareableHandle. 
* * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: * * Returns in \p *data the handle to the mempool that the allocation was obtained from. * * \par * * Note that for most allocations in the unified virtual address space * the host and device pointer for accessing the allocation will be the * same. The exceptions to this are * - user memory registered using ::cuMemHostRegister * - host memory allocated using ::cuMemHostAlloc with the * ::CU_MEMHOSTALLOC_WRITECOMBINED flag * For these types of allocation there will exist separate, disjoint host * and device addresses for accessing the allocation. In particular * - The host address will correspond to an invalid unmapped device address * (which will result in an exception if accessed from the device) * - The device address will correspond to an invalid unmapped host address * (which will result in an exception if accessed from the host). * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host * and device addresses from either address. * * \param data - Returned pointer attribute value * \param attribute - Pointer attribute to query * \param ptr - Pointer * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuPointerSetAttribute, * ::cuMemAlloc, * ::cuMemFree, * ::cuMemAllocHost, * ::cuMemFreeHost, * ::cuMemHostAlloc, * ::cuMemHostRegister, * ::cuMemHostUnregister, * ::cudaPointerGetAttributes */ CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); /** * \brief Prefetches memory to the specified destination device * * Prefetches memory to the specified destination device. \p devPtr is the * base device pointer of the memory to be prefetched and \p dstDevice is the * destination device. \p count specifies the number of bytes to copy. \p hStream * is the stream in which the operation is enqueued. The memory range must refer * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. * * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS * must be non-zero. Additionally, \p hStream must be associated with a device that has a * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. * * The start address and end address of the memory range will be rounded down and rounded up * respectively to be aligned to CPU page size before the prefetch operation is enqueued * in the stream. * * If no physical memory has been allocated for this region, then this memory region * will be populated and mapped on the destination device. If there's insufficient * memory to prefetch the desired region, the Unified Memory driver may evict pages from other * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. * * By default, any mappings to the previous location of the migrated pages are removed and * mappings for the new location are only setup on \p dstDevice. 
The exact behavior however * also depends on the settings applied to this memory range via ::cuMemAdvise as described * below: * * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, * then that subset will create a read-only copy of the pages on \p dstDevice. * * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the * preferred location of any pages in the memory range. * * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, * then mappings to those pages from all the appropriate processors are updated to * refer to the new location if establishing such a mapping is possible. Otherwise, * those mappings are cleared. * * Note that this API is not required for functionality and only serves to improve performance * by allowing the application to migrate data to a suitable location before it is accessed. * Memory accesses to this range are always coherent and are allowed even when the data is * actively being migrated. * * Note that this function is asynchronous with respect to the host and all work * on other devices. * * \param devPtr - Pointer to be prefetched * \param count - Size in bytes * \param dstDevice - Destination device to prefetch to * \param hStream - Stream to enqueue prefetch operation * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, * ::cudaMemPrefetchAsync */ CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); /** * \brief Advise about the usage of a given memory range * * Advise the Unified Memory subsystem about the usage pattern for the memory range * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory * range will be rounded down and rounded up respectively to be aligned to CPU page size before the * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable * memory provided it represents a valid, host-accessible region of memory and all additional constraints * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable * memory range results in an error being returned. * * The \p advice parameter can take the following values: * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read * from and only occasionally written to. Any read accesses from any processor to this region will create a * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync * is called on this region, it will create a read-only copy of the data on the destination processor. * If any processor writes to this region, all copies of the corresponding page will be invalidated * except for the one where the write occurred. The \p device argument is ignored for this advice. * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. 
* Also, if a context is created on a device that does not have the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until * all such contexts are destroyed. * If the memory region refers to valid system-allocated pageable memory, then the accessing device must * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice * will not create a read-only copy when that device accesses this memory region. * * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated * copies of the data will be collapsed into a single copy. The location for the collapsed * copy will be the preferred location if the page has a preferred location and one of the read-duplicated * copies was resident at that location. Otherwise, the location chosen is arbitrary. * * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location * does not cause data to migrate to that location immediately. Instead, it guides the migration policy * when a fault occurs on that memory region. If the data is already in its preferred location and the * faulting processor can establish a mapping without requiring the data to be migrated, then * data migration will be avoided. On the other hand, if the data is not in its preferred location * or if a direct mapping cannot be established, then it will be migrated to the processor accessing * it. It is important to note that setting the preferred location does not prevent data prefetching * done using ::cuMemPrefetchAsync. * Having a preferred location can override the page thrash detection and resolution logic in the Unified * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the * policies associated with that advice will override the policies of this advice, unless read accesses from * \p device will not result in a read-only copy being created on that device as outlined in description for * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, * then this call has no effect. Note however that this behavior may change in the future. 
* * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION * and changes the preferred location to none. * * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. * This advice does not cause data migration and has no impact on the location of the data per se. Instead, * it causes the data to always be mapped in the specified processor's page tables, as long as the * location of the data permits a mapping to be established. If the data gets migrated for any reason, * the mappings are updated accordingly. * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data * over to the other GPUs is not as important because the accesses are infrequent and the overhead of * migration may be too high. But preventing faults can still help improve performance, and so having * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the * page in host memory. * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the * policies associated with that advice will override the policies of this advice. Additionally, if the * preferred location of this memory region or any subset of it is also \p device, then the policies * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, * then this call has no effect. * * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, * then this call has no effect. 
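 *
 * A minimal sketch (device ordinal 0 and 'stream' are assumptions, error
 * checking omitted) that marks a managed buffer as read-mostly, requests a
 * mapping on the device, and prefetches it there:
 \code
    CUdeviceptr managed;
    size_t bytes = 1 << 20;
    cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL);

    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuMemAdvise(managed, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);   // device ignored for this advice
    cuMemAdvise(managed, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, dev);
    cuMemPrefetchAsync(managed, bytes, dev, stream);
 \endcode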
* * \param devPtr - Pointer to memory to set the advice for * \param count - Size in bytes of the memory range * \param advice - Advice to be applied for the specified memory range * \param device - Device to apply the advice for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, * ::cudaMemAdvise */ CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); /** * \brief Query an attribute of a given memory range * * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via * __managed__ variables. * * The \p attribute parameter can take the following values: * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given * memory range have read-duplication enabled, or 0 otherwise. * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID * if either all the pages don't have the same preferred location or some of the pages don't have a * preferred location at all. Note that the actual location of the pages in the memory range at the time of * the query may be different from the preferred location. * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. * If any device does not have that advice set for the entire memory range, that device will not be included. * If \p data is larger than the number of devices that have that advice set for that memory range, * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have * that advice set, then only as many devices will be returned as can fit in the array. There is no * guarantee on which specific devices will be returned, however. * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not * prefetched to the same location, CU_DEVICE_INVALID will be returned. 
Note that this simply returns the * last location that the application requested to prefetch the memory range to. It gives no indication as to * whether the prefetch operation to that location has completed or even begun. * * \param data - A pointer to a memory location where the result * of the attribute query will be written to. * \param dataSize - The size of \p data * \param attribute - The attribute to query * \param devPtr - Start of the range to query * \param count - Size of the range to query * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, * ::cuMemAdvise, * ::cudaMemRangeGetAttribute */ CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); /** * \brief Query attributes of a given memory range. * * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. * The results of the query will be stored in \p data. * * The list of supported attributes is given below. Please refer to ::cuMemRangeGetAttribute for * attribute descriptions and restrictions. * * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION * * \param data - A two-dimensional array containing pointers to memory * locations where the result of each attribute query will be written to. * \param dataSizes - Array containing the sizes of each result * \param attributes - An array of attributes to query * (numAttributes and the number of attributes in this array should match) * \param numAttributes - Number of attributes to query * \param devPtr - Start of the range to query * \param count - Size of the range to query * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, * ::cuMemPrefetchAsync, * ::cudaMemRangeGetAttributes */ CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); /** * \brief Set attributes on a previously allocated memory region * * The supported attributes are: * * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: * * A boolean attribute that can either be set (1) or unset (0). When set, * the region of memory that \p ptr points to is guaranteed to always synchronize * memory operations that are synchronous. If there are some previously initiated * synchronous memory operations that are pending when this attribute is set, the * function does not return until those memory operations are complete. * See further documentation in the section titled "API synchronization behavior" * to learn more about cases when synchronous memory operations can * exhibit asynchronous behavior. * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set.
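 *
 * A minimal sketch (error checking omitted) that enables synchronous memory
 * operation semantics on a device allocation, for example before handing the
 * buffer to a third-party DMA path:
 \code
    CUdeviceptr dptr;
    cuMemAlloc(&dptr, 1 << 20);

    unsigned int value = 1;   // 1 = set, 0 = unset
    cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr);
 \endcode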
* * \param value - Pointer to memory containing the value to be set * \param attribute - Pointer attribute to set * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuPointerGetAttribute, * ::cuPointerGetAttributes, * ::cuMemAlloc, * ::cuMemFree, * ::cuMemAllocHost, * ::cuMemFreeHost, * ::cuMemHostAlloc, * ::cuMemHostRegister, * ::cuMemHostUnregister */ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); /** * \brief Returns information about a pointer. * * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): * * - ::CU_POINTER_ATTRIBUTE_CONTEXT * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE * - ::CU_POINTER_ATTRIBUTE_MAPPED * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE * * \param numAttributes - Number of attributes to query * \param attributes - An array of attributes to query * (numAttributes and the number of attributes in this array should match) * \param data - A two-dimensional array containing pointers to memory * locations where the result of each attribute query will be written to. * \param ptr - Pointer to query * * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values * and CUDA_SUCCESS is returned. * * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuPointerGetAttribute, * ::cuPointerSetAttribute, * ::cudaPointerGetAttributes */ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); /** @} */ /* END CUDA_UNIFIED */ /** * \defgroup CUDA_STREAM Stream Management * * ___MANBRIEF___ stream management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the stream management functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Create a stream * * Creates a stream and returns a handle in \p phStream. The \p Flags argument * determines behaviors of the stream. * * Valid values for \p Flags are: * - ::CU_STREAM_DEFAULT: Default stream creation flag. * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created * stream may run concurrently with work in stream 0 (the NULL stream), and that * the created stream should perform no implicit synchronization with stream 0. 
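 *
 * A minimal sketch (error checking omitted) of creating, using and destroying
 * a non-blocking stream:
 \code
    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
    // ... enqueue kernel launches and asynchronous copies into 'stream' ...
    cuStreamSynchronize(stream);
    cuStreamDestroy(stream);
 \endcode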
* * \param phStream - Returned newly created stream * \param Flags - Parameters for stream creation * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuStreamDestroy, * ::cuStreamCreateWithPriority, * ::cuStreamGetPriority, * ::cuStreamGetFlags, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback, * ::cudaStreamCreate, * ::cudaStreamCreateWithFlags */ CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); /** * \brief Create a stream with the given priority * * Creates a stream with the specified priority and returns a handle in \p phStream. * This API alters the scheduler priority of work in the stream. Work in a higher * priority stream may preempt work already executing in a low priority stream. * * \p priority follows a convention where lower numbers represent higher priorities. * '0' represents default priority. The range of meaningful numerical priorities can * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, * it will automatically be clamped to the lowest or the highest number in the range. * * \param phStream - Returned newly created stream * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of * valid flags * \param priority - Stream priority. Lower numbers represent higher priorities. * See ::cuCtxGetStreamPriorityRange for more information about * meaningful stream priorities that can be passed. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \note Stream priorities are supported only on GPUs * with compute capability 3.5 or higher. * * \note In the current implementation, only compute kernels launched in * priority streams are affected by the stream's priority. Stream priorities have * no effect on host-to-device and device-to-host memory operations. * * \sa ::cuStreamDestroy, * ::cuStreamCreate, * ::cuStreamGetPriority, * ::cuCtxGetStreamPriorityRange, * ::cuStreamGetFlags, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback, * ::cudaStreamCreateWithPriority */ CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); /** * \brief Query the priority of a given stream * * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority * and return the priority in \p priority. Note that if the stream was created with a * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, * this function returns the clamped priority. * See ::cuStreamCreateWithPriority for details about priority clamping. 
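 *
 * A minimal sketch (error checking omitted) that creates a stream at the
 * highest allowed priority and reads the possibly clamped value back:
 \code
    int leastPriority, greatestPriority;
    cuCtxGetStreamPriorityRange(&leastPriority, &greatestPriority);

    CUstream stream;
    cuStreamCreateWithPriority(&stream, CU_STREAM_NON_BLOCKING, greatestPriority);

    int priority;
    cuStreamGetPriority(stream, &priority);   // returns the clamped priority
 \endcode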
* * \param hStream - Handle to the stream to be queried * \param priority - Pointer to a signed integer in which the stream's priority is returned * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuStreamDestroy, * ::cuStreamCreate, * ::cuStreamCreateWithPriority, * ::cuCtxGetStreamPriorityRange, * ::cuStreamGetFlags, * ::cudaStreamGetPriority */ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); /** * \brief Query the flags of a given stream * * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority * and return the flags in \p flags. * * \param hStream - Handle to the stream to be queried * \param flags - Pointer to an unsigned integer in which the stream's flags are returned * The value returned in \p flags is a logical 'OR' of all flags that * were used while creating this stream. See ::cuStreamCreate for the list * of valid flags * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuStreamDestroy, * ::cuStreamCreate, * ::cuStreamGetPriority, * ::cudaStreamGetFlags */ CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); /** * \brief Query the context associated with a stream * * Returns the CUDA context that the stream is associated with. * * The stream handle \p hStream can refer to any of the following: *
 * - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
 *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
 *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
 *   The returned context is the context that was active in the calling thread
 *   when the stream was created. Passing an invalid handle results in undefined
 *   behavior.
 * - special streams such as the NULL stream, ::CU_STREAM_LEGACY and
 *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also
 *   accepted, namely NULL, ::cudaStreamLegacy and ::cudaStreamPerThread.
 *   Specifying any of the special handles will return the context current to
 *   the calling thread. If no context is current to the calling thread,
 *   ::CUDA_ERROR_INVALID_CONTEXT is returned.
* * \param hStream - Handle to the stream to be queried * \param pctx - Returned context associated with the stream * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * \notefnerr * * \sa ::cuStreamDestroy, * ::cuStreamCreateWithPriority, * ::cuStreamGetPriority, * ::cuStreamGetFlags, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback, * ::cudaStreamCreate, * ::cudaStreamCreateWithFlags */ CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); /** * \brief Make a compute stream wait on an event * * Makes all future work submitted to \p hStream wait for all work captured in * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. * The synchronization will be performed efficiently on the device when applicable. * \p hEvent may be from a different context or device than \p hStream. * * flags include: * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag. * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external * event node when performing stream capture. This flag is invalid outside * of stream capture. * * \param hStream - Stream to wait * \param hEvent - Event to wait on (may not be NULL) * \param Flags - See ::CUevent_capture_flags * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuEventRecord, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback, * ::cuStreamDestroy, * ::cudaStreamWaitEvent */ CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); /** * \brief Add a callback to a compute stream * * \note This function is slated for eventual deprecation and removal. If * you do not require the callback to execute in case of a device error, * consider using ::cuLaunchHostFunc. Additionally, this function is not * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike * ::cuLaunchHostFunc. * * Adds a callback to be called on the host after all currently enqueued * items in the stream have completed. For each * cuStreamAddCallback call, the callback will be executed exactly once. * The callback will block later work in the stream until it is finished. * * The callback may be passed ::CUDA_SUCCESS or an error code. In the event * of a device error, all subsequently executed callbacks will receive an * appropriate ::CUresult. * * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any * synchronization that may depend on outstanding device work or other callbacks * that are not mandated to run earlier. Callbacks without a mandated order * (in independent streams) execute in undefined order and may be serialized. * * For the purposes of Unified Memory, callback execution makes a number of * guarantees: *
 * - The callback stream is considered idle for the duration of the
 *   callback. Thus, for example, a callback may always use memory attached
 *   to the callback stream.
 * - The start of execution of a callback has the same effect as
 *   synchronizing an event recorded in the same stream immediately prior to
 *   the callback. It thus synchronizes streams which have been "joined"
 *   prior to the callback.
 * - Adding device work to any stream does not have the effect of making
 *   the stream active until all preceding host functions and stream callbacks
 *   have executed. Thus, for example, a callback might use global attached
 *   memory even if work has been added to another stream, if the work has
 *   been ordered behind the callback with an event.
 * - Completion of a callback does not cause a stream to become active except
 *   as described above. The callback stream will remain idle if no device
 *   work follows the callback, and will remain idle across consecutive
 *   callbacks without device work in between. Thus, for example, stream
 *   synchronization can be done by signaling from a callback at the end of
 *   the stream.
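 *
 * Illustrative usage (editorial sketch; \p hStream is a previously created
 * stream, error checking omitted):
 *
 * \code
   static void CUDA_CB hostCallback(CUstream stream, CUresult status, void *userData) {
       // Must not call any CUDA API; status is CUDA_SUCCESS or a prior device error.
       *(int *)userData = (status == CUDA_SUCCESS);
   }

   // ... on the host, after enqueuing work into hStream:
   int ok = 0;
   cuStreamAddCallback(hStream, hostCallback, &ok, 0); // flags must be 0
   cuStreamSynchronize(hStream); // the callback has run exactly once by this point
 * \endcode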
* * \param hStream - Stream to add callback to * \param callback - The function to call once preceding stream operations are complete * \param userData - User specified data to be passed to the callback function * \param flags - Reserved for future use, must be 0 * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_SUPPORTED * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamWaitEvent, * ::cuStreamDestroy, * ::cuMemAllocManaged, * ::cuStreamAttachMemAsync, * ::cuStreamLaunchHostFunc, * ::cudaStreamAddCallback */ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); /** * \brief Begins graph capture on a stream * * Begin graph capture on \p hStream. When a stream is in capture mode, all operations * pushed into the stream will not be executed, but will instead be captured into * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which * it was initiated, and it may only be initiated if the stream is not already in capture * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. * * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be * called on this stream from the same thread. * * \param hStream - Stream in which to initiate capture * \param mode - Controls the interaction of this capture sequence with other API * calls that are potentially unsafe. For more details see * ::cuThreadExchangeStreamCaptureMode. * * \note Kernels captured using this API must not use texture and surface references. * Reading or writing through any texture or surface reference is undefined * behavior. This restriction does not apply to texture and surface objects. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::cuStreamCreate, * ::cuStreamIsCapturing, * ::cuStreamEndCapture, * ::cuThreadExchangeStreamCaptureMode */ CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); /** * \brief Swaps the stream capture interaction mode for a thread * * Sets the calling thread's stream capture interaction mode to the value contained * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To * facilitate deterministic behavior across function or module boundaries, callers * are encouraged to use this API in a push-pop fashion: \code CUstreamCaptureMode mode = desiredMode; cuThreadExchangeStreamCaptureMode(&mode); ... cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode * \endcode * * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is * not enqueued asynchronously to a stream, and is not observed by stream capture. * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture * depended on the allocation being replayed whenever the graph is launched, the * captured graph would be invalid. * * Therefore, stream capture places restrictions on API calls that can be made within * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. 
This * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. * * A thread's mode is one of the following: * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has * an ongoing capture sequence that was not initiated with * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, * this thread is prohibited from potentially unsafe API calls. * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited * from potentially unsafe API calls. Concurrent capture sequences in other threads * are ignored. * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially * unsafe API calls. Note that the thread is still prohibited from API calls which * necessarily conflict with stream capture, for example, attempting ::cuEventQuery * on an event that was last recorded inside a capture sequence. * * \param mode - Pointer to mode value to swap with the current mode * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::cuStreamBeginCapture */ CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); /** * \brief Ends capture on a stream, returning the captured graph * * End capture on \p hStream, returning the captured graph via \p phGraph. * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. * If capture was invalidated, due to a violation of the rules of stream capture, then * a NULL graph will be returned. * * If the \p mode argument to ::cuStreamBeginCapture was not * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as * ::cuStreamBeginCapture. * * \param hStream - Stream to query * \param phGraph - The captured graph * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD * \notefnerr * * \sa * ::cuStreamCreate, * ::cuStreamBeginCapture, * ::cuStreamIsCapturing */ CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); /** * \brief Returns a stream's capture status * * Return the capture status of \p hStream via \p captureStatus. After a successful * call, \p *captureStatus will contain one of the following: * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error * has invalidated the capture sequence. The capture sequence must be terminated * with ::cuStreamEndCapture on the stream where it was initiated in order to * continue using \p hStream. * * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while * a blocking stream in the same context is capturing, it will return * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified * after the call. The blocking stream capture is not invalidated. * * When a blocking stream is capturing, the legacy stream is in an * unusable state until the blocking stream capture is terminated. The legacy * stream is not supported for stream capture, but attempted use would have an * implicit dependency on the capturing stream(s). 
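 *
 * Illustrative capture sequence (editorial sketch; \p hStream must not be the
 * legacy NULL stream, error checking omitted):
 *
 * \code
   CUgraph graph = NULL;
   CUstreamCaptureStatus captureStatus;

   cuStreamBeginCapture(hStream, CU_STREAM_CAPTURE_MODE_GLOBAL);
   // ... enqueue the work to be captured into hStream ...
   cuStreamIsCapturing(hStream, &captureStatus); // CU_STREAM_CAPTURE_STATUS_ACTIVE
   cuStreamEndCapture(hStream, &graph);          // graph now holds the captured work
   // ... instantiate and launch the graph as needed ...
   cuGraphDestroy(graph);
 * \endcode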
* * \param hStream - Stream to query * \param captureStatus - Returns the stream's capture status * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT * \notefnerr * * \sa * ::cuStreamCreate, * ::cuStreamBeginCapture, * ::cuStreamEndCapture */ CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); /** * \brief Query capture status of a stream * * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will * supplant this version in 12.0, which is retained for minor version compatibility. * * Query the capture status of a stream and and get an id for * the capture sequence, which is unique over the lifetime of the process. * * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. * * A valid id is returned only if both of the following are true: * - the call returns CUDA_SUCCESS * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT * \notefnerr * * \sa * ::cuStreamGetCaptureInfo_v2, * ::cuStreamBeginCapture, * ::cuStreamIsCapturing */ CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); /** * \brief Query a stream's capture state (11.3+) * * Query stream state related to stream capture. * * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. * * Valid data (other than capture status) is returned only if both of the following are true: * - the call returns CUDA_SUCCESS * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE * * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the * previous version in 12.0. Developers requiring compatibility across minor versions to * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback * path. * * \param hStream - The stream to query * \param captureStatus_out - Location to return the capture status of the stream; required * \param id_out - Optional location to return an id for the capture sequence, which is * unique over the lifetime of the process * \param graph_out - Optional location to return the graph being captured into. All * operations other than destroy and node removal are permitted on the graph * while the capture sequence is in progress. This API does not transfer * ownership of the graph, which is transferred or destroyed at * ::cuStreamEndCapture. Note that the graph handle may be invalidated before * end of capture for certain errors. Nodes that are or become * unreachable from the original stream at ::cuStreamEndCapture due to direct * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. * \param dependencies_out - Optional location to store a pointer to an array of nodes. * The next node to be captured in the stream will depend on this set of nodes, * absent operations such as event wait which modify this set. The array pointer * is valid until the next API call which operates on the stream or until end of * capture. The node handles may be copied out and are valid until they or the * graph is destroyed. 
The driver-owned array may also be passed directly to * APIs that operate on the graph (not the stream) without copying. * \param numDependencies_out - Optional location to store the size of the array * returned in dependencies_out. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT * \note_graph_thread_safety * \notefnerr * * \sa * ::cuStreamGetCaptureInfo, * ::cuStreamBeginCapture, * ::cuStreamIsCapturing, * ::cuStreamUpdateCaptureDependencies */ CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); /** * \brief Update the set of dependencies in a capturing stream (11.3+) * * Modifies the dependency set of a capturing stream. The dependency set is the set * of nodes that the next captured node in the stream will depend on. * * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to * the API is added to the existing set or replaces it. A flags value of 0 defaults * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. * * Nodes that are removed from the dependency set via this API do not result in * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at * ::cuStreamEndCapture. * * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. * * This API is new in CUDA 11.3. Developers requiring compatibility across minor * versions to CUDA 11.0 should not use this API or provide a fallback. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_ILLEGAL_STATE * * \sa * ::cuStreamBeginCapture, * ::cuStreamGetCaptureInfo, * ::cuStreamGetCaptureInfo_v2 */ CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); /** * \brief Attach memory to a stream asynchronously * * Enqueues an operation in \p hStream to specify stream association of * \p length bytes of memory starting from \p dptr. This function is a * stream-ordered operation, meaning that it is dependent on, and will * only take effect when, previous work in stream has completed. Any * previous association is automatically replaced. * * \p dptr must point to one of the following types of memories: * - managed memory declared using the __managed__ keyword or allocated with * ::cuMemAllocManaged. * - a valid host-accessible region of system-allocated pageable memory. This * type of memory may only be specified if the device associated with the * stream reports a non-zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. * * For managed allocations, \p length must be either zero or the entire * allocation's size. Both indicate that the entire allocation's stream * association is being changed. Currently, it is not possible to change stream * association for a portion of a managed allocation. * * For pageable host allocations, \p length must be non-zero. * * The stream association is specified using \p flags which must be * one of ::CUmemAttach_flags. * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed * by any stream on any device. * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee * that it won't access the memory on the device from any stream on a device that * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. 
* If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, * the program makes a guarantee that it will only access the memory on the device * from \p hStream. It is illegal to attach singly to the NULL stream, because the * NULL stream is a virtual global stream and not a specific stream. An error will * be returned in this case. * * When memory is associated with a single stream, the Unified Memory system will * allow CPU access to this memory region so long as all operations in \p hStream * have completed, regardless of whether other streams are active. In effect, * this constrains exclusive ownership of the managed memory region by * an active GPU to per-stream activity instead of whole-GPU activity. * * Accessing memory on the device from streams that are not associated with * it will produce undefined results. No error checking is performed by the * Unified Memory system to ensure that kernels launched into other streams * do not access this region. * * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync * via events, synchronization or other means to ensure legal access to memory * at all times. Data visibility and coherency will be changed appropriately * for all kernels which follow a stream-association change. * * If \p hStream is destroyed while data is associated with it, the association is * removed and the association reverts to the default visibility of the allocation * as specified at ::cuMemAllocManaged. For __managed__ variables, the default * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an * asynchronous operation, and as a result, the change to default association won't * happen until all work in the stream has completed. * * \param hStream - Stream in which to enqueue the attach operation * \param dptr - Pointer to memory (must be a pointer to managed memory or * to a valid host-accessible region of system-allocated * pageable memory) * \param length - Length of memory * \param flags - Must be one of ::CUmemAttach_flags * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_SUPPORTED * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamWaitEvent, * ::cuStreamDestroy, * ::cuMemAllocManaged, * ::cudaStreamAttachMemAsync */ CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); /** * \brief Determine status of a compute stream * * Returns ::CUDA_SUCCESS if all operations in the stream specified by * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. * * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS * is equivalent to having called ::cuStreamSynchronize(). 
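 *
 * Illustrative usage (editorial sketch; \p hStream is a previously created
 * stream, error checking omitted): attaching a managed allocation to a single
 * stream and polling the stream for completion:
 *
 * \code
   size_t bytes = 1 << 20; // example size
   CUdeviceptr dptr;
   cuMemAllocManaged(&dptr, bytes, CU_MEM_ATTACH_GLOBAL);

   // Restrict the allocation to hStream; length 0 means the whole allocation.
   cuStreamAttachMemAsync(hStream, dptr, 0, CU_MEM_ATTACH_SINGLE);
   // ... launch kernels into hStream that use dptr ...
   while (cuStreamQuery(hStream) == CUDA_ERROR_NOT_READY) {
       // do other host work; CUDA_SUCCESS here is equivalent to cuStreamSynchronize()
   }
 * \endcode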
* * \param hStream - Stream to query status of * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_READY * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamWaitEvent, * ::cuStreamDestroy, * ::cuStreamSynchronize, * ::cuStreamAddCallback, * ::cudaStreamQuery */ CUresult CUDAAPI cuStreamQuery(CUstream hStream); /** * \brief Wait until a stream's tasks are completed * * Waits until the device has completed all operations in the stream specified * by \p hStream. If the context was created with the * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the * stream is finished with all of its tasks. * * \param hStream - Stream to wait for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamDestroy, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamAddCallback, * ::cudaStreamSynchronize */ CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); /** * \brief Destroys a stream * * Destroys the stream specified by \p hStream. * * In case the device is still doing work in the stream \p hStream * when ::cuStreamDestroy() is called, the function will return immediately * and the resources associated with \p hStream will be released automatically * once the device has completed all work in \p hStream. * * \param hStream - Stream to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback, * ::cudaStreamDestroy */ CUresult CUDAAPI cuStreamDestroy(CUstream hStream); /** * \brief Copies attributes from source stream to destination stream. * * Copies attributes from source stream \p src to destination stream \p dst. * Both streams must have the same context. * * \param[out] dst Destination stream * \param[in] src Source stream * For list of attributes see ::CUstreamAttrID * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::CUaccessPolicyWindow */ CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); /** * \brief Queries stream attribute. * * Queries attribute \p attr from \p hStream and stores it in corresponding * member of \p value_out. * * \param[in] hStream * \param[in] attr * \param[out] value_out * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa * ::CUaccessPolicyWindow */ CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value_out); /** * \brief Sets stream attribute. * * Sets attribute \p attr on \p hStream from corresponding attribute of * \p value. The updated attribute will be applied to subsequent work * submitted to the stream. It will not affect previously submitted work. 
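 *
 * Illustrative usage (editorial sketch; the attribute ID and the
 * ::CUaccessPolicyWindow field names below are assumptions based on the
 * stream-attribute types referenced in this section, and \p dptr / \p numBytes
 * stand for an existing device allocation):
 *
 * \code
   CUstreamAttrValue value = {0};
   value.accessPolicyWindow.base_ptr  = (void *)dptr;
   value.accessPolicyWindow.num_bytes = numBytes;
   value.accessPolicyWindow.hitRatio  = 0.6f;
   value.accessPolicyWindow.hitProp   = CU_ACCESS_PROPERTY_PERSISTING;
   value.accessPolicyWindow.missProp  = CU_ACCESS_PROPERTY_STREAMING;
   // Applies only to work submitted to hStream after this call.
   cuStreamSetAttribute(hStream, CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, &value);
 * \endcode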
* * \param[out] hStream * \param[in] attr * \param[in] value * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa * ::CUaccessPolicyWindow */ CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *value); /** @} */ /* END CUDA_STREAM */ /** * \defgroup CUDA_EVENT Event Management * * ___MANBRIEF___ event management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the event management functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Creates an event * * Creates an event *phEvent for the current context with the flags specified via * \p Flags. Valid flags include: * - ::CU_EVENT_DEFAULT: Default event creation flag. * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on * an event created with this flag will block until the event has actually * been recorded. * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need * to record timing data. Events created with this flag specified and * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must * be specified along with ::CU_EVENT_DISABLE_TIMING. * * \param phEvent - Returns newly created event * \param Flags - Event creation flags * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa * ::cuEventRecord, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuEventDestroy, * ::cuEventElapsedTime, * ::cudaEventCreate, * ::cudaEventCreateWithFlags */ CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); /** * \brief Records an event * * Captures in \p hEvent the contents of \p hStream at the time of this call. * \p hEvent and \p hStream must be from the same context. * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then * examine or wait for completion of the work that was captured. Uses of * \p hStream after this call do not modify \p hEvent. See note on default * stream behavior for what is captured in the default case. * * ::cuEventRecord() can be called multiple times on the same event and * will overwrite the previously captured state. Other APIs such as * ::cuStreamWaitEvent() use the most recently captured state at the time * of the API call, and are not affected by later calls to * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an * event represents an empty set of work, so for example ::cuEventQuery() * would return ::CUDA_SUCCESS. 
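 *
 * Illustrative usage (editorial sketch; \p streamA and \p streamB are
 * previously created streams, error checking omitted): using an event to make
 * one stream wait for work already submitted to another:
 *
 * \code
   CUevent event;
   cuEventCreate(&event, CU_EVENT_DISABLE_TIMING); // timing not needed for ordering
   // ... enqueue producer work into streamA ...
   cuEventRecord(event, streamA);                  // capture streamA's contents
   cuStreamWaitEvent(streamB, event, CU_EVENT_WAIT_DEFAULT);
   // ... work enqueued into streamB from here on waits for the recorded work ...
   cuEventDestroy(event);
 * \endcode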
* * \param hEvent - Event to record * \param hStream - Stream to record event for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \note_null_stream * \notefnerr * * \sa ::cuEventCreate, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuStreamWaitEvent, * ::cuEventDestroy, * ::cuEventElapsedTime, * ::cudaEventRecord, * ::cuEventRecordWithFlags */ CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); /** * \brief Records an event * * Captures in \p hEvent the contents of \p hStream at the time of this call. * \p hEvent and \p hStream must be from the same context. * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then * examine or wait for completion of the work that was captured. Uses of * \p hStream after this call do not modify \p hEvent. See note on default * stream behavior for what is captured in the default case. * * ::cuEventRecordWithFlags() can be called multiple times on the same event and * will overwrite the previously captured state. Other APIs such as * ::cuStreamWaitEvent() use the most recently captured state at the time * of the API call, and are not affected by later calls to * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an * event represents an empty set of work, so for example ::cuEventQuery() * would return ::CUDA_SUCCESS. * * flags include: * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag. * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external * event node when performing stream capture. This flag is invalid outside * of stream capture. * * \param hEvent - Event to record * \param hStream - Stream to record event for * \param flags - See ::CUevent_capture_flags * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \note_null_stream * \notefnerr * * \sa ::cuEventCreate, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuStreamWaitEvent, * ::cuEventDestroy, * ::cuEventElapsedTime, * ::cuEventRecord, * ::cudaEventRecord */ CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); /** * \brief Queries an event's status * * Queries the status of all work currently captured by \p hEvent. See * ::cuEventRecord() for details on what is captured by an event. * * Returns ::CUDA_SUCCESS if all captured work has been completed, or * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. * * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS * is equivalent to having called ::cuEventSynchronize(). * * \param hEvent - Event to query * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_READY * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventSynchronize, * ::cuEventDestroy, * ::cuEventElapsedTime, * ::cudaEventQuery */ CUresult CUDAAPI cuEventQuery(CUevent hEvent); /** * \brief Waits for an event to complete * * Waits until the completion of all work currently captured in \p hEvent. * See ::cuEventRecord() for details on what is captured by an event. * * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC * flag will cause the calling CPU thread to block until the event has * been completed by the device. 
If the ::CU_EVENT_BLOCKING_SYNC flag has * not been set, then the CPU thread will busy-wait until the event has * been completed by the device. * * \param hEvent - Event to wait for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventQuery, * ::cuEventDestroy, * ::cuEventElapsedTime, * ::cudaEventSynchronize */ CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); /** * \brief Destroys an event * * Destroys the event specified by \p hEvent. * * An event may be destroyed before it is complete (i.e., while * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the * call does not block on completion of the event, and any associated * resources will automatically be released asynchronously at completion. * * \param hEvent - Event to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuEventElapsedTime, * ::cudaEventDestroy */ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); /** * \brief Computes the elapsed time between two events * * Computes the elapsed time between two events (in milliseconds with a * resolution of around 0.5 microseconds). * * If either event was last recorded in a non-NULL stream, the resulting time * may be greater than expected (even if both used the same stream handle). This * happens because the ::cuEventRecord() operation takes place asynchronously * and there is no guarantee that the measured latency is actually just between * the two events. Any number of other different stream operations could execute * in between the two measured events, thus altering the timing in a significant * way. * * If ::cuEventRecord() has not been called on either event then * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called * on both events but one or both of them has not yet been completed (that is, * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return * ::CUDA_ERROR_INVALID_HANDLE. * * \param pMilliseconds - Time between \p hStart and \p hEnd in ms * \param hStart - Starting event * \param hEnd - Ending event * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_READY * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuEventDestroy, * ::cudaEventElapsedTime */ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); /** @} */ /* END CUDA_EVENT */ /** * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability * * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the external resource interoperability functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Imports an external memory object * * Imports an externally allocated memory object and returns * a handle to that in \p extMem_out. 
* * The properties of the handle being imported must be described in * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure * is defined as follows: * * \code typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { CUexternalMemoryHandleType type; union { int fd; struct { void *handle; const void *name; } win32; const void *nvSciBufObject; } handle; unsigned long long size; unsigned int flags; } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; * \endcode * * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type * of handle being imported. ::CUexternalMemoryHandleType is * defined as: * * \code typedef enum CUexternalMemoryHandleType_enum { CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 } CUexternalMemoryHandleType; * \endcode * * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid * file descriptor referencing a memory object. Ownership of * the file descriptor is transferred to the CUDA driver when the * handle is imported successfully. Performing any operations on the * file descriptor after it is imported results in undefined behavior. * * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that * references a memory object. Ownership of this handle is * not transferred to CUDA after the import operation, so the * application must release the handle using the appropriate system * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * is not NULL, then it must point to a NULL-terminated array of * UTF-16 characters that refers to a memory object. * * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must * be non-NULL and * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * must be NULL. The handle specified must be a globally shared KMT * handle. This handle does not hold a reference to the underlying * object, and thus will be invalid when all references to the * memory object are destroyed. * * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that * is returned by ID3D12Device::CreateSharedHandle when referring to a * ID3D12Heap object. This handle holds a reference to the underlying * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * is not NULL, then it must point to a NULL-terminated array of * UTF-16 characters that refers to a ID3D12Heap object. 
* * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that * is returned by ID3D12Device::CreateSharedHandle when referring to a * ID3D12Resource object. This handle holds a reference to the * underlying object. If * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * is not NULL, then it must point to a NULL-terminated array of * UTF-16 characters that refers to a ID3D12Resource object. * * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must * represent a valid shared NT handle that is returned by * IDXGIResource1::CreateSharedHandle when referring to a * ID3D11Resource object. If * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * is not NULL, then it must point to a NULL-terminated array of * UTF-16 characters that refers to a ID3D11Resource object. * * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must * represent a valid shared KMT handle that is returned by * IDXGIResource::GetSharedHandle when referring to a * ID3D11Resource object and * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * must be NULL. * * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL * and reference a valid NvSciBuf object. * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync * as appropriate barriers to maintain coherence between CUDA and the other drivers. * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC * for memory synchronization. * * * The size of the memory object must be specified in * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. * * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the * resource is a dedicated resource. The definition of what a * dedicated resource is outside the scope of this extension. * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type * is one of the following: * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT * * \param extMem_out - Returned handle to an external memory object * \param memHandleDesc - Memory import handle descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges * as well as appropriate Vulkan pipeline barriers to maintain coherence between * CPU and GPU. For more information on these APIs, please refer to "Synchronization * and Cache Control" chapter from Vulkan specification. 
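 *
 * Illustrative usage (editorial sketch; \p fd and \p sizeInBytes stand for
 * values obtained from the exporting API, error checking omitted): importing a
 * memory object shared as an opaque POSIX file descriptor and mapping a buffer
 * from it with ::cuExternalMemoryGetMappedBuffer (documented below):
 *
 * \code
   CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc = {0};
   memDesc.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
   memDesc.handle.fd = fd;        // ownership of fd transfers to the driver on success
   memDesc.size      = sizeInBytes;

   CUexternalMemory extMem;
   cuImportExternalMemory(&extMem, &memDesc);

   CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc = {0};
   bufDesc.offset = 0;
   bufDesc.size   = sizeInBytes;  // flags must be zero

   CUdeviceptr dptr;
   cuExternalMemoryGetMappedBuffer(&dptr, extMem, &bufDesc);
   // ... use dptr ...
   cuMemFree(dptr);               // mapped buffers are freed with cuMemFree
   cuDestroyExternalMemory(extMem);
 * \endcode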
* * \sa ::cuDestroyExternalMemory, * ::cuExternalMemoryGetMappedBuffer, * ::cuExternalMemoryGetMappedMipmappedArray */ CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); /** * \brief Maps a buffer onto an imported memory object * * Maps a buffer onto an imported memory object and returns a device * pointer in \p devPtr. * * The properties of the buffer being mapped must be described in * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is * defined as follows: * * \code typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { unsigned long long offset; unsigned long long size; unsigned int flags; } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; * \endcode * * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in * the memory object where the buffer's base address is. * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. * * The offset and size have to be suitably aligned to match the * requirements of the external API. Mapping two buffers whose ranges * overlap may or may not result in the same virtual address being * returned for the overlapped portion. In such cases, the application * must ensure that all accesses to that region from the GPU are * volatile. Otherwise writes made via one address are not guaranteed * to be visible via the other address, even if they're issued by the * same thread. It is recommended that applications map the combined * range instead of mapping separate buffers and then apply the * appropriate offsets to the returned pointer to derive the * individual buffers. * * The returned pointer \p devPtr must be freed using ::cuMemFree. * * \param devPtr - Returned device pointer to buffer * \param extMem - Handle to external memory object * \param bufferDesc - Buffer descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuImportExternalMemory, * ::cuDestroyExternalMemory, * ::cuExternalMemoryGetMappedMipmappedArray */ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); /** * \brief Maps a CUDA mipmapped array onto an external memory object * * Maps a CUDA mipmapped array onto an external object and returns a * handle to it in \p mipmap. * * The properties of the CUDA mipmapped array being mapped must be * described in \p mipmapDesc. The structure * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: * * \code typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { unsigned long long offset; CUDA_ARRAY3D_DESCRIPTOR arrayDesc; unsigned int numLevels; } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; * \endcode * * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the * offset in the memory object where the base level of the mipmap * chain is. * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes * the format, dimensions and type of the base level of the mipmap * chain. For further details on these parameters, please refer to the * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped * array is bound as a color target in the graphics API, then the flag * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. 
* ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies * the total number of levels in the mipmap chain. * * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. * * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. * * \param mipmap - Returned CUDA mipmapped array * \param extMem - Handle to external memory object * \param mipmapDesc - CUDA array descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuImportExternalMemory, * ::cuDestroyExternalMemory, * ::cuExternalMemoryGetMappedBuffer */ CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); /** * \brief Destroys an external memory object. * * Destroys the specified external memory object. Any existing buffers * and CUDA mipmapped arrays mapped onto this object must no longer be * used and must be explicitly freed using ::cuMemFree and * ::cuMipmappedArrayDestroy respectively. * * \param extMem - External memory object to be destroyed * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuImportExternalMemory, * ::cuExternalMemoryGetMappedBuffer, * ::cuExternalMemoryGetMappedMipmappedArray */ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); /** * \brief Imports an external semaphore * * Imports an externally allocated synchronization object and returns * a handle to that in \p extSem_out. * * The properties of the handle being imported must be described in * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is * defined as follows: * * \code typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { CUexternalSemaphoreHandleType type; union { int fd; struct { void *handle; const void *name; } win32; const void* NvSciSyncObj; } handle; unsigned int flags; } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; * \endcode * * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of * handle being imported. ::CUexternalSemaphoreHandleType is defined * as: * * \code typedef enum CUexternalSemaphoreHandleType_enum { CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 } CUexternalSemaphoreHandleType; * \endcode * * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid * file descriptor referencing a synchronization object. Ownership of * the file descriptor is transferred to the CUDA driver when the * handle is imported successfully. Performing any operations on the * file descriptor after it is imported results in undefined behavior. 
* * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be * NULL. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that * references a synchronization object. Ownership of this handle is * not transferred to CUDA after the import operation, so the * application must release the handle using the appropriate system * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name * is not NULL, then it must name a valid synchronization object. * * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must * be non-NULL and * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name * must be NULL. The handle specified must be a globally shared KMT * handle. This handle does not hold a reference to the underlying * object, and thus will be invalid when all references to the * synchronization object are destroyed. * * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be * NULL. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that * is returned by ID3D12Device::CreateSharedHandle when referring to a * ID3D12Fence object. This handle holds a reference to the underlying * object. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name * is not NULL, then it must name a valid synchronization object that * refers to a valid ID3D12Fence object. * * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * represents a valid shared NT handle that is returned by * ID3D11Fence::CreateSharedHandle. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name * is not NULL, then it must name a valid synchronization object that * refers to a valid ID3D11Fence object. * * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj * represents a valid NvSciSyncObj. * * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * represents a valid shared NT handle that * is returned by IDXGIResource1::CreateSharedHandle when referring to * a IDXGIKeyedMutex object. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name * is not NULL, then it must name a valid synchronization object that * refers to a valid IDXGIKeyedMutex object. * * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * represents a valid shared KMT handle that * is returned by IDXGIResource::GetSharedHandle when referring to * a IDXGIKeyedMutex object and * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. 
* * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid * file descriptor referencing a synchronization object. Ownership of * the file descriptor is transferred to the CUDA driver when the * handle is imported successfully. Performing any operations on the * file descriptor after it is imported results in undefined behavior. * * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be * NULL. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that * references a synchronization object. Ownership of this handle is * not transferred to CUDA after the import operation, so the * application must release the handle using the appropriate system * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name * is not NULL, then it must name a valid synchronization object. * * \param extSem_out - Returned handle to an external semaphore * \param semHandleDesc - Semaphore import handle descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuDestroyExternalSemaphore, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); /** * \brief Signals a set of external semaphore objects * * Enqueues a signal operation on a set of externally allocated * semaphore object in the specified stream. The operations will be * executed when all prior operations in the stream complete. * * The exact semantics of signaling a semaphore depends on the type of * the object. * * If the semaphore object is any one of the following types: * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT * then signaling the semaphore will set it to the signaled state. * * If the semaphore object is any one of the following types: * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 * then the semaphore will be set to the value specified in * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. * * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence * to a value that can be used by subsequent waiters of the same NvSciSync object * to order operations with those currently submitted in \p stream. Such an update * will overwrite previous contents of * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, * signaling such an external semaphore object causes appropriate memory synchronization * operations to be performed over all external memory objects that are imported as * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses * made by other importers of the same set of NvSciBuf memory object(s) are coherent. 
* These operations can be skipped by specifying the flag * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a * performance optimization when data coherency is not required. But specifying this * flag in scenarios where data coherency is required results in undefined behavior. * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return * CUDA_ERROR_NOT_SUPPORTED. * * If the semaphore object is any one of the following types: * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT * then the keyed mutex will be released with the key specified in * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key. * * \param extSemArray - Set of external semaphores to be signaled * \param paramsArray - Array of semaphore parameters * \param numExtSems - Number of semaphores to signal * \param stream - Stream to enqueue the signal operations in * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuImportExternalSemaphore, * ::cuDestroyExternalSemaphore, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); /** * \brief Waits on a set of external semaphore objects * * Enqueues a wait operation on a set of externally allocated * semaphore object in the specified stream. The operations will be * executed when all prior operations in the stream complete. * * The exact semantics of waiting on a semaphore depends on the type * of the object. * * If the semaphore object is any one of the following types: * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT * then waiting on the semaphore will wait until the semaphore reaches * the signaled state. The semaphore will then be reset to the * unsignaled state. Therefore for every signal operation, there can * only be one wait operation. * * If the semaphore object is any one of the following types: * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 * then waiting on the semaphore will wait until the value of the * semaphore is greater than or equal to * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. * * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC * then, waiting on the semaphore will wait until the * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the * signaler of the NvSciSyncObj that was associated with this semaphore object. * By default, waiting on such an external semaphore object causes appropriate * memory synchronization operations to be performed over all external memory objects * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that * any subsequent accesses made by other importers of the same set of NvSciBuf memory * object(s) are coherent. 
These operations can be skipped by specifying the flag * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a * performance optimization when data coherency is not required. But specifying this * flag in scenarios where data coherency is required results in undefined behavior. * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return * CUDA_ERROR_NOT_SUPPORTED. * * If the semaphore object is any one of the following types: * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT * then the keyed mutex will be acquired when it is released with the key * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key * or until the timeout specified by * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs * has lapsed. The timeout interval can either be a finite value * specified in milliseconds or an infinite value. In case an infinite * value is specified the timeout never elapses. The windows INFINITE * macro must be used to specify infinite timeout. * * \param extSemArray - External semaphores to be waited on * \param paramsArray - Array of semaphore parameters * \param numExtSems - Number of semaphores to wait on * \param stream - Stream to enqueue the wait operations in * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_TIMEOUT * \notefnerr * * \sa ::cuImportExternalSemaphore, * ::cuDestroyExternalSemaphore, * ::cuSignalExternalSemaphoresAsync */ CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); /** * \brief Destroys an external semaphore * * Destroys an external semaphore object and releases any references * to the underlying resource. Any outstanding signals or waits must * have completed before the semaphore is destroyed. * * \param extSem - External semaphore to be destroyed * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuImportExternalSemaphore, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); /** @} */ /* END CUDA_EXTRES_INTEROP */ /** * \defgroup CUDA_MEMOP Stream memory operations * * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the stream memory operations of the low-level CUDA * driver application programming interface. * * The whole set of operations is disabled by default. Users are required * to explicitly enable them, e.g. on Linux by passing the kernel module * parameter shown below: * modprobe nvidia NVreg_EnableStreamMemOPs=1 * There is currently no way to enable these operations on other operating * systems. * * Users can programmatically query whether the device supports these * operations with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. * * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. 
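 *
 * For example, a minimal support check (illustrative sketch; \p dev is assumed
 * to be a valid ::CUdevice and error checking is omitted) might look like:
 * \code
   int memOpsSupported = 0;
   cuDeviceGetAttribute(&memOpsSupported,
                        CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
   if (!memOpsSupported) {
       // Fall back to events or other synchronization primitives.
   }
 * \endcode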
* * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. * * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform * hardware features and can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. * * Note that all memory pointers passed as parameters to these operations * are device pointers. Where necessary a device pointer should be * obtained, for example with ::cuMemHostGetDevicePointer(). * * None of the operations accepts pointers to managed memory buffers * (::cuMemAllocManaged). * * @{ */ /** * \brief Wait on a memory location * * Enqueues a synchronization of the stream on the given memory location. Work * ordered after the operation will block until the given condition on the * memory is satisfied. By default, the condition is to wait for * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. * Other condition types can be specified via \p flags. * * If the memory was registered via ::cuMemHostRegister(), the device pointer * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot * be used with managed memory (::cuMemAllocManaged). * * Support for this can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. * * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. * * \param stream The stream to synchronize on the memory location. * \param addr The memory location to wait on. * \param value The value to compare with the memory location. * \param flags See ::CUstreamWaitValue_flags. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWaitValue64, * ::cuStreamWriteValue32, * ::cuStreamWriteValue64, * ::cuStreamBatchMemOp, * ::cuMemHostRegister, * ::cuStreamWaitEvent */ CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); /** * \brief Wait on a memory location * * Enqueues a synchronization of the stream on the given memory location. Work * ordered after the operation will block until the given condition on the * memory is satisfied. By default, the condition is to wait for * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. * Other condition types can be specified via \p flags. * * If the memory was registered via ::cuMemHostRegister(), the device pointer * should be obtained with ::cuMemHostGetDevicePointer(). * * Support for this can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. * * \param stream The stream to synchronize on the memory location. * \param addr The memory location to wait on. * \param value The value to compare with the memory location. * \param flags See ::CUstreamWaitValue_flags. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWaitValue32, * ::cuStreamWriteValue32, * ::cuStreamWriteValue64, * ::cuStreamBatchMemOp, * ::cuMemHostRegister, * ::cuStreamWaitEvent */ CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); /** * \brief Write a value to memory * * Write a value to memory. 
Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER * flag is passed, the write is preceded by a system-wide memory fence, * equivalent to a __threadfence_system() but scoped to the stream * rather than a CUDA thread. * * If the memory was registered via ::cuMemHostRegister(), the device pointer * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot * be used with managed memory (::cuMemAllocManaged). * * Support for this can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. * * \param stream The stream to do the write in. * \param addr The device address to write to. * \param value The value to write. * \param flags See ::CUstreamWriteValue_flags. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWriteValue64, * ::cuStreamWaitValue32, * ::cuStreamWaitValue64, * ::cuStreamBatchMemOp, * ::cuMemHostRegister, * ::cuEventRecord */ CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); /** * \brief Write a value to memory * * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER * flag is passed, the write is preceded by a system-wide memory fence, * equivalent to a __threadfence_system() but scoped to the stream * rather than a CUDA thread. * * If the memory was registered via ::cuMemHostRegister(), the device pointer * should be obtained with ::cuMemHostGetDevicePointer(). * * Support for this can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. * * \param stream The stream to do the write in. * \param addr The device address to write to. * \param value The value to write. * \param flags See ::CUstreamWriteValue_flags. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWriteValue32, * ::cuStreamWaitValue32, * ::cuStreamWaitValue64, * ::cuStreamBatchMemOp, * ::cuMemHostRegister, * ::cuEventRecord */ CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); /** * \brief Batch operations to synchronize the stream via memory operations * * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). * Batching operations may avoid some performance overhead in both the API call * and the device execution versus adding them to the stream in separate API * calls. The operations are enqueued in the order they appear in the array. * * See ::CUstreamBatchMemOpType for the full set of supported operations, and * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), * and ::cuStreamWriteValue64() for details of specific operations. * * Basic support for this can be queried with ::cuDeviceGetAttribute() and * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details * on querying support for specific operations. * * \param stream The stream to enqueue the operations in. * \param count The number of operations in the array. Must be less than 256. * \param paramArray The types and parameters of the individual operations. * \param flags Reserved for future expansion; must be 0. 
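 *
 * A minimal sketch of batching one wait and one write (illustrative only;
 * \p devAddr is a placeholder device pointer, \p stream is assumed valid, and
 * error checking is omitted):
 * \code
   CUstreamBatchMemOpParams ops[2];
   memset(ops, 0, sizeof(ops));
   ops[0].operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;    // wait until *devAddr >= 1
   ops[0].waitValue.address = devAddr;
   ops[0].waitValue.value = 1;
   ops[0].waitValue.flags = CU_STREAM_WAIT_VALUE_GEQ;
   ops[1].operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;   // then write 2 to *devAddr
   ops[1].writeValue.address = devAddr;
   ops[1].writeValue.value = 2;
   cuStreamBatchMemOp(stream, 2, ops, 0);
 * \endcode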
* * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWaitValue32, * ::cuStreamWaitValue64, * ::cuStreamWriteValue32, * ::cuStreamWriteValue64, * ::cuMemHostRegister */ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); /** @} */ /* END CUDA_MEMOP */ /** * \defgroup CUDA_EXEC Execution Control * * ___MANBRIEF___ execution control functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the execution control functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Returns information about a function * * Returns in \p *pi the integer value of the attribute \p attrib on the kernel * given by \p hfunc. The supported attributes are: * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads * per block, beyond which a launch of the function would fail. This number * depends on both the function and the device on which the function is * currently loaded. * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of * statically-allocated shared memory per block required by this function. * This does not include dynamically-allocated shared memory requested by * the user at runtime. * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated * constant memory required by this function. * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory * used by each thread of this function. * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread * of this function. * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for * which the function was compiled. This value is the major PTX version * 10 * + the minor PTX version, so a PTX version 1.3 function would return the * value 13. Note that this may return the undefined value of 0 for cubins * compiled prior to CUDA 3.0. * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for * which the function was compiled. This value is the major binary * version * 10 + the minor binary version, so a binary version 1.3 function * would return the value 13. Note that this will return a value of 10 for * legacy cubins that do not have a properly-encoded binary architecture * version. * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the function has * been compiled with the user-specified option "-Xptxas --dlcm=ca" set. * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of * dynamically-allocated shared memory. * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 * cache split ratio in percent of total shared memory. 
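 *
 * For instance, a minimal query of two of these attributes (illustrative
 * sketch; \p hfunc is assumed to be a valid ::CUfunction and error checking is
 * omitted) might look like:
 * \code
   int maxThreads = 0, staticSmem = 0;
   cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
   cuFuncGetAttribute(&staticSmem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hfunc);
   // maxThreads bounds the block size; staticSmem is the static shared memory per block.
 * \endcode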
* * \param pi - Returned attribute value * \param attrib - Attribute requested * \param hfunc - Function to query attribute of * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuLaunchKernel, * ::cudaFuncGetAttributes, * ::cudaFuncSetAttribute */ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); /** * \brief Sets information about a function * * This call sets the value of a specified attribute \p attrib on the kernel given * by \p hfunc to an integer value specified by \p value. * This function returns CUDA_SUCCESS if the new value of the attribute could be * successfully set. If the set fails, this call will return an error. * Not all attributes can have values set. Attempting to set a value on a read-only * attribute will result in an error (CUDA_ERROR_INVALID_VALUE). * * Supported attributes for the cuFuncSetAttribute call are: * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of * dynamically-allocated shared memory. The value should contain the requested * maximum size of dynamically-allocated shared memory. The sum of this value and * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. * The maximal size of requestable dynamic shared memory may differ by GPU * architecture. * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 * cache and shared memory use the same hardware resources, this sets the shared memory * carveout preference, in percent of the total shared memory. * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. * This is only a hint, and the driver can choose a different ratio if required to execute the function. * * \param hfunc - Function to set the attribute for * \param attrib - Attribute to set * \param value - The value to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuLaunchKernel, * ::cudaFuncGetAttributes, * ::cudaFuncSetAttribute */ CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); /** * \brief Sets the preferred cache configuration for a device function * * On devices where the L1 cache and shared memory use the same hardware * resources, this sets through \p config the preferred cache configuration for * the device function \p hfunc. This is only a preference. The driver will use * the requested configuration if possible, but it is free to choose a different * configuration if required to execute \p hfunc. Any context-wide preference * set via ::cuCtxSetCacheConfig() will be overridden by this per-function * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In * that case, the current context-wide setting will be used. * * This setting does nothing on devices where the size of the L1 cache and * shared memory are fixed. * * Launching a kernel with a different preference than the most recent * preference setting may insert a device-side synchronization point. 
* * * The supported cache configurations are: * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory * * \param hfunc - Kernel to configure cache for * \param config - Requested cache configuration * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncGetAttribute, * ::cuLaunchKernel, * ::cudaFuncSetCacheConfig */ CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); /** * \brief Sets the shared memory configuration for a device function. * * On devices with configurable shared memory banks, this function will * force all subsequent launches of the specified device function to have * the given shared memory bank size configuration. On any given launch of the * function, the shared memory configuration of the device will be temporarily * changed if needed to suit the function's preferred configuration. Changes in * shared memory configuration between subsequent launches of functions, * may introduce a device side synchronization point. * * Any per-function setting of shared memory bank size set via * ::cuFuncSetSharedMemConfig will override the context wide setting set with * ::cuCtxSetSharedMemConfig. * * Changing the shared memory bank size will not increase shared memory usage * or affect occupancy of kernels, but may have major effects on performance. * Larger bank sizes will allow for greater potential bandwidth to shared memory, * but will change what kinds of accesses to shared memory will result in bank * conflicts. * * This function will do nothing on devices with fixed shared memory bank size. * * The supported bank configurations are: * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory * configuration when launching this function. * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to * be natively four bytes when launching this function. * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to * be natively eight bytes when launching this function. * * \param hfunc - kernel to be given a shared memory config * \param config - requested shared memory configuration * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuCtxGetSharedMemConfig, * ::cuCtxSetSharedMemConfig, * ::cuFuncGetAttribute, * ::cuLaunchKernel, * ::cudaFuncSetSharedMemConfig */ CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); /** * \brief Returns a module handle * * Returns in \p *hmod the handle of the module that function \p hfunc * is located in. The lifetime of the module corresponds to the lifetime of * the context it was loaded in or until the module is explicitly unloaded. * * The CUDA runtime manages its own modules loaded into the primary context. * If the handle returned by this API refers to a module loaded by the CUDA runtime, * calling ::cuModuleUnload() on that module will result in undefined behavior. 
* * \param hmod - Returned module handle * \param hfunc - Function to retrieve module for * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * */ CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); /** * \brief Launches a CUDA function * * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x * \p blockDimZ threads. * * \p sharedMemBytes sets the amount of dynamic shared memory that will be * available to each thread block. * * Kernel parameters to \p f can be specified in one of two ways: * * 1) Kernel parameters can be specified via \p kernelParams. If \p f * has N parameters, then \p kernelParams needs to be an array of N * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] * must point to a region of memory from which the actual kernel * parameter will be copied. The number of kernel parameters and their * offsets and sizes do not need to be specified as that information is * retrieved directly from the kernel's image. * * 2) Kernel parameters can also be packaged by the application into * a single buffer that is passed in via the \p extra parameter. * This places the burden on the application of knowing each kernel * parameter's size and alignment/padding within the buffer. Here is * an example of using the \p extra parameter in this manner: * \code size_t argBufferSize; char argBuffer[256]; // populate argBuffer and argBufferSize void *config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, CU_LAUNCH_PARAM_END }; status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); * \endcode * * The \p extra parameter exists to allow ::cuLaunchKernel to take * additional less commonly used arguments. \p extra specifies a list of * names of extra settings and their corresponding values. Each extra * setting name is immediately followed by the corresponding value. The * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. * * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra * array; * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next * value in \p extra will be a pointer to a buffer containing all * the kernel parameters for launching kernel \p f; * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next * value in \p extra will be a pointer to a size_t containing the * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; * * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel * parameters are specified with both \p kernelParams and \p extra * (i.e. both \p kernelParams and \p extra are non-NULL). * * Calling ::cuLaunchKernel() invalidates the persistent function state * set through the following deprecated APIs: * ::cuFuncSetBlockShape(), * ::cuFuncSetSharedSize(), * ::cuParamSetSize(), * ::cuParamSeti(), * ::cuParamSetf(), * ::cuParamSetv(). * * Note that to use ::cuLaunchKernel(), the kernel \p f must either have * been compiled with toolchain version 3.2 or later so that it will * contain kernel parameter information, or have no kernel parameters. * If either of these conditions is not met, then ::cuLaunchKernel() will * return ::CUDA_ERROR_INVALID_IMAGE. 
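 *
 * For the more common \p kernelParams path, a minimal sketch (illustrative
 * only; it assumes the kernel takes a ::CUdeviceptr and an int, that \p f and
 * \p hStream are valid, and error checking is omitted) is:
 * \code
   CUdeviceptr dX;       // device buffer, e.g. obtained from cuMemAlloc()
   int n = 1024;
   void *params[] = { &dX, &n };
   cuLaunchKernel(f,
                  (n + 255) / 256, 1, 1,   // grid dimensions
                  256, 1, 1,               // block dimensions
                  0, hStream,              // dynamic shared memory, stream
                  params, NULL);           // kernelParams, no extra
 * \endcode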
* * \param f - Kernel to launch * \param gridDimX - Width of grid in blocks * \param gridDimY - Height of grid in blocks * \param gridDimZ - Depth of grid in blocks * \param blockDimX - X dimension of each thread block * \param blockDimY - Y dimension of each thread block * \param blockDimZ - Z dimension of each thread block * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes * \param hStream - Stream identifier * \param kernelParams - Array of pointers to kernel parameters * \param extra - Extra options * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \note_null_stream * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute, * ::cudaLaunchKernel */ CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); /** * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute * * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x * \p blockDimZ threads. * * \p sharedMemBytes sets the amount of dynamic shared memory that will be * available to each thread block. * * The device on which this kernel is invoked must have a non-zero value for * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. * * The total number of blocks launched cannot exceed the maximum number of blocks per * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. * * The kernel cannot make use of CUDA dynamic parallelism. * * Kernel parameters must be specified via \p kernelParams. If \p f * has N parameters, then \p kernelParams needs to be an array of N * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] * must point to a region of memory from which the actual kernel * parameter will be copied. The number of kernel parameters and their * offsets and sizes do not need to be specified as that information is * retrieved directly from the kernel's image. * * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is * the same as function state set through ::cuLaunchKernel API * * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous * block shape, shared size and parameter info associated with \p f * is overwritten. * * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have * been compiled with toolchain version 3.2 or later so that it will * contain kernel parameter information, or have no kernel parameters. * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will * return ::CUDA_ERROR_INVALID_IMAGE. 
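 *
 * A minimal sketch of sizing a cooperative launch against the occupancy limit
 * (illustrative only; \p f, \p hStream and \p kernelParams are assumed valid,
 * \p dev is assumed to be the target ::CUdevice, and error checking is
 * omitted):
 * \code
   int numSMs = 0, blocksPerSM = 0;
   cuDeviceGetAttribute(&numSMs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
   cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, f, 256, 0);
   // Launch no more blocks than can be resident on the device simultaneously.
   cuLaunchCooperativeKernel(f, numSMs * blocksPerSM, 1, 1,
                             256, 1, 1, 0, hStream, kernelParams);
 * \endcode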
* * \param f - Kernel to launch * \param gridDimX - Width of grid in blocks * \param gridDimY - Height of grid in blocks * \param gridDimZ - Depth of grid in blocks * \param blockDimX - X dimension of each thread block * \param blockDimY - Y dimension of each thread block * \param blockDimZ - Z dimension of each thread block * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes * \param hStream - Stream identifier * \param kernelParams - Array of pointers to kernel parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \note_null_stream * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute, * ::cuLaunchCooperativeKernelMultiDevice, * ::cudaLaunchCooperativeKernel */ CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); /** * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute * * \deprecated This function is deprecated as of CUDA 11.3. * * Invokes kernels as specified in the \p launchParamsList array where each element * of the array specifies all the parameters required to perform a single kernel launch. * These kernels can cooperate and synchronize as they execute. The size of the array is * specified by \p numDevices. * * No two kernels can be launched on the same device. All the devices targeted by this * multi-device launch must be identical. All devices must have a non-zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. * * All kernels launched must be identical with respect to the compiled code. Note that * any __device__, __constant__ or __managed__ variables present in the module that owns * the kernel launched on each device, are independently instantiated on every device. * It is the application's responsiblity to ensure these variables are initialized and * used appropriately. * * The size of the grids as specified in blocks, the size of the blocks themselves * and the amount of shared memory used by each thread block must also match across * all launched kernels. * * The streams used to launch these kernels must have been created via either ::cuStreamCreate * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD * cannot be used. * * The total number of blocks launched per kernel cannot exceed the maximum number of blocks * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the * total number of blocks launched per device has to match across all devices, the maximum * number of blocks that can be launched per device will be limited by the device with the * least number of multiprocessors. 
* * The kernels cannot make use of CUDA dynamic parallelism. * * The ::CUDA_LAUNCH_PARAMS structure is defined as: * \code typedef struct CUDA_LAUNCH_PARAMS_st { CUfunction function; unsigned int gridDimX; unsigned int gridDimY; unsigned int gridDimZ; unsigned int blockDimX; unsigned int blockDimY; unsigned int blockDimZ; unsigned int sharedMemBytes; CUstream hStream; void **kernelParams; } CUDA_LAUNCH_PARAMS; * \endcode * where: * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must * be identical with respect to the compiled code. * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across * all kernels launched. * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across * all kernels launched. * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across * all kernels launched. * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across * all kernels launched. * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across * all kernels launched. * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across * all kernels launched. * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. * This must match across all kernels launched. * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes * do not need to be specified as that information is retrieved directly from the kernel's image. * * By default, the kernel won't begin execution on any GPU until all prior work in all the specified * streams has completed. This behavior can be overridden by specifying the flag * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel * will only wait for prior work in the stream corresponding to that GPU to complete before it begins * execution. * * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, * any subsequent work pushed in any of the specified streams will only wait for the kernel launched * on the GPU corresponding to that stream to complete before it begins execution. * * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is * the same as function state set through ::cuLaunchKernel API when called individually for each * element in \p launchParamsList. 
* * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function * in \p launchParamsList is overwritten. * * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have * been compiled with toolchain version 3.2 or later so that it will * contain kernel parameter information, or have no kernel parameters. * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will * return ::CUDA_ERROR_INVALID_IMAGE. * * \param launchParamsList - List of launch parameters, one per device * \param numDevices - Size of the \p launchParamsList array * \param flags - Flags to control launch behavior * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \note_null_stream * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute, * ::cuLaunchCooperativeKernel, * ::cudaLaunchCooperativeKernelMultiDevice */ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); /** * \brief Enqueues a host function call in a stream * * Enqueues a host function to run in a stream. The function will be called * after currently enqueued work and will block work added after it. * * The host function must not make any CUDA API calls. Attempting to use a * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. * The host function must not perform any synchronization that may depend on * outstanding CUDA work not mandated to run earlier. Host functions without a * mandated order (such as in independent streams) execute in undefined order * and may be serialized. * * For the purposes of Unified Memory, execution makes a number of guarantees: *
 * - The stream is considered idle for the duration of the function's
 *   execution. Thus, for example, the function may always use memory attached
 *   to the stream it was enqueued in.
 * - The start of execution of the function has the same effect as
 *   synchronizing an event recorded in the same stream immediately prior to
 *   the function. It thus synchronizes streams which have been "joined"
 *   prior to the function.
 * - Adding device work to any stream does not have the effect of making
 *   the stream active until all preceding host functions and stream callbacks
 *   have executed. Thus, for example, a function might use global attached
 *   memory even if work has been added to another stream, if the work has been
 *   ordered behind the function call with an event.
 * - Completion of the function does not cause a stream to become active except
 *   as described above. The stream will remain idle if no device work follows
 *   the function, and will remain idle across consecutive host functions or
 *   stream callbacks without device work in between. Thus, for example, stream
 *   synchronization can be done by signaling from a host function at the end
 *   of the stream.
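 *
 * A minimal sketch of enqueueing a host function (illustrative only; the
 * callback \p myHostFn and the user data \p appState are placeholders, and
 * error checking is omitted):
 * \code
   void CUDA_CB myHostFn(void *userData) {
       // Must not call into the CUDA API; e.g. signal an application-level event here.
   }

   // ... later, on some stream hStream:
   cuLaunchHostFunc(hStream, myHostFn, &appState);
 * \endcode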
* * Note that, in contrast to ::cuStreamAddCallback, the function will not be * called in the event of an error in the CUDA context. * * \param hStream - Stream to enqueue function call in * \param fn - The function to call once preceding stream operations are complete * \param userData - User-specified data to be passed to the function * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_SUPPORTED * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamWaitEvent, * ::cuStreamDestroy, * ::cuMemAllocManaged, * ::cuStreamAttachMemAsync, * ::cuStreamAddCallback */ CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); /** @} */ /* END CUDA_EXEC */ /** * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] * * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the deprecated execution control functions of the * low-level CUDA driver application programming interface. * * @{ */ /** * \brief Sets the block-dimensions for the function * * \deprecated * * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are * created when the kernel given by \p hfunc is launched. * * \param hfunc - Kernel to specify dimensions of * \param x - X dimension * \param y - Y dimension * \param z - Z dimension * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetSharedSize, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSeti, * ::cuParamSetf, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); /** * \brief Sets the dynamic shared-memory size for the function * * \deprecated * * Sets through \p bytes the amount of dynamic shared memory that will be * available to each thread block when the kernel given by \p hfunc is launched. * * \param hfunc - Kernel to specify dynamic shared-memory size for * \param bytes - Dynamic shared-memory size per thread in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSeti, * ::cuParamSetf, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); /** * \brief Sets the parameter size for the function * * \deprecated * * Sets through \p numbytes the total size in bytes needed by the function * parameters of the kernel corresponding to \p hfunc. 
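 *
 * For historical context, the deprecated parameter-setting sequence that this
 * family of APIs belongs to looked roughly like the following sketch
 * (illustrative only; \p hfunc, \p dPtr and \p n are placeholders, real code
 * must also respect each parameter's alignment when advancing \p offset, and
 * error checking is omitted):
 * \code
   int offset = 0;
   cuFuncSetBlockShape(hfunc, 256, 1, 1);
   cuFuncSetSharedSize(hfunc, 0);
   cuParamSetv(hfunc, offset, &dPtr, sizeof(dPtr));  offset += sizeof(dPtr);
   cuParamSeti(hfunc, offset, n);                    offset += sizeof(int);
   cuParamSetSize(hfunc, offset);
   cuLaunchGrid(hfunc, (n + 255) / 256, 1);
 * \endcode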
* * \param hfunc - Kernel to set parameter size for * \param numbytes - Size of parameter list in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); /** * \brief Adds an integer parameter to the function's argument list * * \deprecated * * Sets an integer parameter that will be specified the next time the * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. * * \param hfunc - Kernel to add parameter to * \param offset - Offset to add parameter to argument list * \param value - Value of parameter * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); /** * \brief Adds a floating-point parameter to the function's argument list * * \deprecated * * Sets a floating-point parameter that will be specified the next time the * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. * * \param hfunc - Kernel to add parameter to * \param offset - Offset to add parameter to argument list * \param value - Value of parameter * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); /** * \brief Adds arbitrary data to the function's argument list * * \deprecated * * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr * into the parameter space of the kernel corresponding to \p hfunc. \p offset * is a byte offset. * * \param hfunc - Kernel to add data to * \param offset - Offset to add data to argument list * \param ptr - Pointer to arbitrary data * \param numbytes - Size of data to copy in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); /** * \brief Launches a CUDA function * * \deprecated * * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block * contains the number of threads specified by a previous call to * ::cuFuncSetBlockShape(). 
* * The block shape, dynamic shared memory size, and parameter information * must be set using * ::cuFuncSetBlockShape(), * ::cuFuncSetSharedSize(), * ::cuParamSetSize(), * ::cuParamSeti(), * ::cuParamSetf(), and * ::cuParamSetv() * prior to calling this function. * * Launching a function via ::cuLaunchKernel() invalidates the function's * block shape, dynamic shared memory size, and parameter information. After * launching via cuLaunchKernel, this state must be re-initialized prior to * calling this function. Failure to do so results in undefined behavior. * * \param f - Kernel to launch * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); /** * \brief Launches a CUDA function * * \deprecated * * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of * blocks. Each block contains the number of threads specified by a previous * call to ::cuFuncSetBlockShape(). * * The block shape, dynamic shared memory size, and parameter information * must be set using * ::cuFuncSetBlockShape(), * ::cuFuncSetSharedSize(), * ::cuParamSetSize(), * ::cuParamSeti(), * ::cuParamSetf(), and * ::cuParamSetv() * prior to calling this function. * * Launching a function via ::cuLaunchKernel() invalidates the function's * block shape, dynamic shared memory size, and parameter information. After * launching via cuLaunchKernel, this state must be re-initialized prior to * calling this function. Failure to do so results in undefined behavior. * * \param f - Kernel to launch * \param grid_width - Width of grid in blocks * \param grid_height - Height of grid in blocks * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); /** * \brief Launches a CUDA function * * \deprecated * * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of * blocks. Each block contains the number of threads specified by a previous * call to ::cuFuncSetBlockShape(). * * The block shape, dynamic shared memory size, and parameter information * must be set using * ::cuFuncSetBlockShape(), * ::cuFuncSetSharedSize(), * ::cuParamSetSize(), * ::cuParamSeti(), * ::cuParamSetf(), and * ::cuParamSetv() * prior to calling this function. * * Launching a function via ::cuLaunchKernel() invalidates the function's * block shape, dynamic shared memory size, and parameter information. 
After * launching via cuLaunchKernel, this state must be re-initialized prior to * calling this function. Failure to do so results in undefined behavior. * * \param f - Kernel to launch * \param grid_width - Width of grid in blocks * \param grid_height - Height of grid in blocks * \param hStream - Stream identifier * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by * growing the per-thread stack as needed per launch and not shrinking it afterwards. * * \note_null_stream * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchKernel */ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); /** * \brief Adds a texture-reference to the function's argument list * * \deprecated * * Makes the CUDA array or linear memory bound to the texture reference * \p hTexRef available to a device program as a texture. In this version of * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. * * \param hfunc - Kernel to add texture-reference to * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) * \param hTexRef - Texture-reference to add to argument list * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr */ __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); /** @} */ /* END CUDA_EXEC_DEPRECATED */ /** * \defgroup CUDA_GRAPH Graph Management * * ___MANBRIEF___ graph management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the graph management functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Creates a graph * * Creates an empty graph, which is returned via \p phGraph. * * \param phGraph - Returns newly created graph * \param flags - Graph creation flags, must be 0 * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddHostNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode, * ::cuGraphInstantiate, * ::cuGraphDestroy, * ::cuGraphGetNodes, * ::cuGraphGetRootNodes, * ::cuGraphGetEdges, * ::cuGraphClone */ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); /** * \brief Creates a kernel execution node and adds it to a graph * * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies and arguments specified in \p nodeParams. 
* It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * * The CUDA_KERNEL_NODE_PARAMS structure is defined as: * * \code * typedef struct CUDA_KERNEL_NODE_PARAMS_st { * CUfunction func; * unsigned int gridDimX; * unsigned int gridDimY; * unsigned int gridDimZ; * unsigned int blockDimX; * unsigned int blockDimY; * unsigned int blockDimZ; * unsigned int sharedMemBytes; * void **kernelParams; * void **extra; * } CUDA_KERNEL_NODE_PARAMS; * \endcode * * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. * * \p sharedMemBytes sets the amount of dynamic shared memory that will be * available to each thread block. * * Kernel parameters to \p func can be specified in one of two ways: * * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need * to be specified as that information is retrieved directly from the kernel's image. * * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single * buffer that is passed in via \p extra. This places the burden on the application of knowing each * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists * to allow this function to take additional less commonly used arguments. \p extra specifies * a list of names of extra settings and their corresponding values. Each extra setting name is * immediately followed by the corresponding value. The list must be terminated with either NULL or * CU_LAUNCH_PARAM_END. * * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra * array; * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next * value in \p extra will be a pointer to a buffer * containing all the kernel parameters for launching kernel * \p func; * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next * value in \p extra will be a pointer to a size_t * containing the size of the buffer specified with * ::CU_LAUNCH_PARAM_BUFFER_POINTER; * * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. * * The \p kernelParams or \p extra array, as well as the argument values it points to, * are copied during this call. * * \note Kernels launched using graphs must not use texture and surface references. Reading or * writing through any texture or surface reference is undefined behavior. * This restriction does not apply to texture and surface objects. 
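 *
 * A minimal sketch of adding a root kernel node to a new graph (illustrative
 * only; \p f, \p dX and \p n are placeholders and error checking is omitted):
 * \code
   CUgraph graph;
   CUgraphNode kernelNode;
   CUDA_KERNEL_NODE_PARAMS nodeParams;
   void *params[] = { &dX, &n };

   memset(&nodeParams, 0, sizeof(nodeParams));
   nodeParams.func = f;
   nodeParams.gridDimX = 32;   nodeParams.gridDimY = 1;  nodeParams.gridDimZ = 1;
   nodeParams.blockDimX = 256; nodeParams.blockDimY = 1; nodeParams.blockDimZ = 1;
   nodeParams.sharedMemBytes = 0;
   nodeParams.kernelParams = params;
   nodeParams.extra = NULL;

   cuGraphCreate(&graph, 0);
   cuGraphAddKernelNode(&kernelNode, graph, NULL, 0, &nodeParams);  // no dependencies: root node
 * \endcode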
* * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param nodeParams - Parameters for the GPU execution node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchKernel, * ::cuLaunchCooperativeKernel, * ::cuGraphKernelNodeGetParams, * ::cuGraphKernelNodeSetParams, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddHostNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); /** * \brief Returns a kernel node's parameters * * Returns the parameters of kernel node \p hNode in \p nodeParams. * The \p kernelParams or \p extra array returned in \p nodeParams, * as well as the argument values it points to, are owned by the node. * This memory remains valid until the node is destroyed or its * parameters are modified, and should not be modified * directly. Use ::cuGraphKernelNodeSetParams to update the * parameters of this node. * * The params will contain either \p kernelParams or \p extra, * according to which of these was most recently set on the node. * * \param hNode - Node to get the parameters for * \param nodeParams - Pointer to return the parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchKernel, * ::cuGraphAddKernelNode, * ::cuGraphKernelNodeSetParams */ CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); /** * \brief Sets a kernel node's parameters * * Sets the parameters of kernel node \p hNode to \p nodeParams. * * \param hNode - Node to set the parameters for * \param nodeParams - Parameters to copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchKernel, * ::cuGraphAddKernelNode, * ::cuGraphKernelNodeGetParams */ CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); /** * \brief Creates a memcpy node and adds it to a graph * * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * * When the graph is launched, the node will perform the memcpy described by \p copyParams. * See ::cuMemcpy3D() for a description of the structure and its restrictions. * * Memcpy nodes have some additional restrictions with regards to managed memory, if the * system contains at least one device which has a zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed * for those operand(s). 
The managed memory will be treated as residing on either the * host or the device, depending on which memory type is specified. * * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param copyParams - Parameters for the memory copy * \param ctx - Context on which to run the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuMemcpy3D, * ::cuGraphMemcpyNodeGetParams, * ::cuGraphMemcpyNodeSetParams, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddHostNode, * ::cuGraphAddMemsetNode */ CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); /** * \brief Returns a memcpy node's parameters * * Returns the parameters of memcpy node \p hNode in \p nodeParams. * * \param hNode - Node to get the parameters for * \param nodeParams - Pointer to return the parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuMemcpy3D, * ::cuGraphAddMemcpyNode, * ::cuGraphMemcpyNodeSetParams */ CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); /** * \brief Sets a memcpy node's parameters * * Sets the parameters of memcpy node \p hNode to \p nodeParams. * * \param hNode - Node to set the parameters for * \param nodeParams - Parameters to copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuMemcpy3D, * ::cuGraphAddMemcpyNode, * ::cuGraphMemcpyNodeGetParams */ CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); /** * \brief Creates a memset node and adds it to a graph * * Creates a new memset node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * * The element size must be 1, 2, or 4 bytes. * When the graph is launched, the node will perform the memset described by \p memsetParams. 
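 *
 * \note A minimal sketch, assuming \p graph and \p ctx already exist and
 * \p dst points to a device buffer of \p numWords 32-bit elements; the helper
 * name and parameters are illustrative only.
 * \code
 *     static CUresult addZeroFillNode(CUgraph graph, CUcontext ctx,
 *                                     CUdeviceptr dst, size_t numWords,
 *                                     CUgraphNode *nodeOut)
 *     {
 *         CUDA_MEMSET_NODE_PARAMS p = {0};
 *         p.dst         = dst;
 *         p.value       = 0;          // pattern written into each element
 *         p.elementSize = 4;          // must be 1, 2 or 4
 *         p.width       = numWords;   // elements per row
 *         p.height      = 1;          // single row, so pitch is not needed
 *         return cuGraphAddMemsetNode(nodeOut, graph, NULL, 0, &p, ctx);
 *     }
 * \endcode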
* * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param memsetParams - Parameters for the memory set * \param ctx - Context on which to run the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_CONTEXT * \note_graph_thread_safety * \notefnerr * * \sa * ::cuMemsetD2D32, * ::cuGraphMemsetNodeGetParams, * ::cuGraphMemsetNodeSetParams, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddHostNode, * ::cuGraphAddMemcpyNode */ CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); /** * \brief Returns a memset node's parameters * * Returns the parameters of memset node \p hNode in \p nodeParams. * * \param hNode - Node to get the parameters for * \param nodeParams - Pointer to return the parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuMemsetD2D32, * ::cuGraphAddMemsetNode, * ::cuGraphMemsetNodeSetParams */ CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); /** * \brief Sets a memset node's parameters * * Sets the parameters of memset node \p hNode to \p nodeParams. * * \param hNode - Node to set the parameters for * \param nodeParams - Parameters to copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuMemsetD2D32, * ::cuGraphAddMemsetNode, * ::cuGraphMemsetNodeGetParams */ CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); /** * \brief Creates a host execution node and adds it to a graph * * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies and arguments specified in \p nodeParams. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * * When the graph is launched, the node will invoke the specified CPU function. * Host nodes are not supported under MPS with pre-Volta GPUs. 
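 *
 * \note A minimal sketch of a host node that bumps a CPU-side counter; the
 * callback and helper names are illustrative, and the callback must not make
 * CUDA API calls.
 * \code
 *     static void CUDA_CB bumpCounter(void *userData)
 *     {
 *         ++*(int *)userData;                  // runs on a CPU thread
 *     }
 *
 *     static CUresult addCounterNode(CUgraph graph, const CUgraphNode *deps,
 *                                    size_t numDeps, int *counter,
 *                                    CUgraphNode *nodeOut)
 *     {
 *         CUDA_HOST_NODE_PARAMS p;
 *         p.fn       = bumpCounter;
 *         p.userData = counter;   // must remain valid across graph launches
 *         return cuGraphAddHostNode(nodeOut, graph, deps, numDeps, &p);
 *     }
 * \endcode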
* * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param nodeParams - Parameters for the host node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchHostFunc, * ::cuGraphHostNodeGetParams, * ::cuGraphHostNodeSetParams, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); /** * \brief Returns a host node's parameters * * Returns the parameters of host node \p hNode in \p nodeParams. * * \param hNode - Node to get the parameters for * \param nodeParams - Pointer to return the parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchHostFunc, * ::cuGraphAddHostNode, * ::cuGraphHostNodeSetParams */ CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); /** * \brief Sets a host node's parameters * * Sets the parameters of host node \p hNode to \p nodeParams. * * \param hNode - Node to set the parameters for * \param nodeParams - Parameters to copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchHostFunc, * ::cuGraphAddHostNode, * ::cuGraphHostNodeGetParams */ CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); /** * \brief Creates a child graph node and adds it to a graph * * Creates a new node which executes an embedded graph, and adds it to \p hGraph with * \p numDependencies dependencies specified via \p dependencies. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * * If \p hGraph contains allocation or free nodes, this call will return an error. * * The node executes an embedded child graph. The child graph is cloned in this call. 
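 *
 * \note A minimal sketch; because the child graph is cloned, the original
 * child handle may be destroyed once the node has been added (the helper name
 * is illustrative).
 * \code
 *     static CUresult embedEmptyChild(CUgraph parent, CUgraphNode *nodeOut)
 *     {
 *         CUgraph child = NULL;
 *         CUgraphNode tmp;
 *         CUresult rc = cuGraphCreate(&child, 0);
 *         if (rc != CUDA_SUCCESS) return rc;
 *         rc = cuGraphAddEmptyNode(&tmp, child, NULL, 0);
 *         if (rc == CUDA_SUCCESS)
 *             rc = cuGraphAddChildGraphNode(nodeOut, parent, NULL, 0, child);
 *         cuGraphDestroy(child);   // the node keeps its own clone of 'child'
 *         return rc;
 *     }
 * \endcode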
* * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param childGraph - The graph to clone into this node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphChildGraphNodeGetGraph, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddHostNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode, * ::cuGraphClone */ CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); /** * \brief Gets a handle to the embedded graph of a child graph node * * Gets a handle to the embedded graph in a child graph node. This call * does not clone the graph. Changes to the graph will be reflected in * the node, and the node retains ownership of the graph. * * Allocation and free nodes cannot be added to the returned graph. * Attempting to do so will return an error. * * \param hNode - Node to get the embedded graph for * \param phGraph - Location to store a handle to the graph * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddChildGraphNode, * ::cuGraphNodeFindInClone */ CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); /** * \brief Creates an empty node and adds it to a graph * * Creates a new node which performs no operation, and adds it to \p hGraph with * \p numDependencies dependencies specified via \p dependencies. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * * An empty node performs no operation during execution, but can be used for * transitive ordering. For example, a phased execution graph with 2 groups of n * nodes with a barrier between them can be represented using an empty node and * 2*n dependency edges, rather than no empty node and n^2 dependency edges. * * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddKernelNode, * ::cuGraphAddHostNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); /** * \brief Creates an event record node and adds it to a graph * * Creates a new event record node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies and event specified in \p event. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. 
* * Each launch of the graph will record \p event to capture execution of the * node's dependencies. * * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param event - Event for the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventWaitNode, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode, */ CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); /** * \brief Returns the event associated with an event record node * * Returns the event of event record node \p hNode in \p event_out. * * \param hNode - Node to get the event for * \param event_out - Pointer to return the event * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventRecordNode, * ::cuGraphEventRecordNodeSetEvent, * ::cuGraphEventWaitNodeGetEvent, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent */ CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); /** * \brief Sets an event record node's event * * Sets the event of event record node \p hNode to \p event. * * \param hNode - Node to set the event for * \param event - Event to use * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventRecordNode, * ::cuGraphEventRecordNodeGetEvent, * ::cuGraphEventWaitNodeSetEvent, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent */ CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); /** * \brief Creates an event wait node and adds it to a graph * * Creates a new event wait node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies and event specified in \p event. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * * The graph node will wait for all work captured in \p event. See ::cuEventRecord() * for details on what is captured by an event. \p event may be from a different context * or device than the launch stream. 
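 *
 * \note A minimal sketch, assuming \p ev was created with ::cuEventCreate and
 * is recorded elsewhere (e.g. with ::cuEventRecord) before each launch of the
 * graph; the helper name is illustrative.
 * \code
 *     static CUresult addWaitOnExternalEvent(CUgraph graph, CUevent ev,
 *                                            const CUgraphNode *deps,
 *                                            size_t numDeps,
 *                                            CUgraphNode *nodeOut)
 *     {
 *         return cuGraphAddEventWaitNode(nodeOut, graph, deps, numDeps, ev);
 *     }
 * \endcode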
* * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param event - Event for the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventRecordNode, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode, */ CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); /** * \brief Returns the event associated with an event wait node * * Returns the event of event wait node \p hNode in \p event_out. * * \param hNode - Node to get the event for * \param event_out - Pointer to return the event * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventWaitNode, * ::cuGraphEventWaitNodeSetEvent, * ::cuGraphEventRecordNodeGetEvent, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent */ CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); /** * \brief Sets an event wait node's event * * Sets the event of event wait node \p hNode to \p event. * * \param hNode - Node to set the event for * \param event - Event to use * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventWaitNode, * ::cuGraphEventWaitNodeGetEvent, * ::cuGraphEventRecordNodeSetEvent, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent */ CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); /** * \brief Creates an external semaphore signal node and adds it to a graph * * Creates a new external semaphore signal node and adds it to \p hGraph with \p * numDependencies dependencies specified via \p dependencies and arguments specified * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the * node will be placed at the root of the graph. \p dependencies may not have any * duplicate entries. A handle to the new node will be returned in \p phGraphNode. * * Performs a signal operation on a set of externally allocated semaphore objects * when the node is launched. The operation(s) will occur after all of the node's * dependencies have completed. 
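 *
 * \note A rough sketch only; \p sem is assumed to come from
 * ::cuImportExternalSemaphore, and which fields of
 * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS are meaningful (e.g. the fence
 * value) depends on the semaphore type.
 * \code
 *     static CUresult addSignalNode(CUgraph graph, CUexternalSemaphore sem,
 *                                   const CUgraphNode *deps, size_t numDeps,
 *                                   CUgraphNode *nodeOut)
 *     {
 *         CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sp = {0};
 *         sp.params.fence.value = 1;  // relevant for timeline-style semaphores
 *         CUDA_EXT_SEM_SIGNAL_NODE_PARAMS np;
 *         np.extSemArray = &sem;
 *         np.paramsArray = &sp;
 *         np.numExtSems  = 1;
 *         return cuGraphAddExternalSemaphoresSignalNode(nodeOut, graph, deps,
 *                                                       numDeps, &np);
 *     }
 * \endcode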
* * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param nodeParams - Parameters for the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphExternalSemaphoresSignalNodeGetParams, * ::cuGraphExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuImportExternalSemaphore, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddEventRecordNode, * ::cuGraphAddEventWaitNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode, */ CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); /** * \brief Returns an external semaphore signal node's parameters * * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. * The \p extSemArray and \p paramsArray returned in \p params_out, * are owned by the node. This memory remains valid until the node is destroyed or its * parameters are modified, and should not be modified * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the * parameters of this node. * * \param hNode - Node to get the parameters for * \param params_out - Pointer to return the parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchKernel, * ::cuGraphAddExternalSemaphoresSignalNode, * ::cuGraphExternalSemaphoresSignalNodeSetParams, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); /** * \brief Sets an external semaphore signal node's parameters * * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. * * \param hNode - Node to set the parameters for * \param nodeParams - Parameters to copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddExternalSemaphoresSignalNode, * ::cuGraphExternalSemaphoresSignalNodeSetParams, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); /** * \brief Creates an external semaphore wait node and adds it to a graph * * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies and arguments specified in \p nodeParams. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. A handle * to the new node will be returned in \p phGraphNode. 
* * Performs a wait operation on a set of externally allocated semaphore objects * when the node is launched. The node's dependencies will not be launched until * the wait operation has completed. * * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param nodeParams - Parameters for the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphExternalSemaphoresWaitNodeGetParams, * ::cuGraphExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphAddExternalSemaphoresSignalNode, * ::cuImportExternalSemaphore, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddEventRecordNode, * ::cuGraphAddEventWaitNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode, */ CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); /** * \brief Returns an external semaphore wait node's parameters * * Returns the parameters of an external semaphore wait node \p hNode in \p params_out. * The \p extSemArray and \p paramsArray returned in \p params_out, * are owned by the node. This memory remains valid until the node is destroyed or its * parameters are modified, and should not be modified * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the * parameters of this node. * * \param hNode - Node to get the parameters for * \param params_out - Pointer to return the parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuLaunchKernel, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuGraphExternalSemaphoresWaitNodeSetParams, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out); /** * \brief Sets an external semaphore wait node's parameters * * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams. * * \param hNode - Node to set the parameters for * \param nodeParams - Parameters to copy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuGraphExternalSemaphoresWaitNodeSetParams, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync */ CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); /** * \brief Creates an allocation node and adds it to a graph * * Creates a new allocation node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies and arguments specified in \p nodeParams. 
* It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. A handle * to the new node will be returned in \p phGraphNode. * * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param nodeParams - Parameters for the node * * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. * * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, * the allocation can be accessed by nodes ordered after the allocation node but before the free node. * These allocations cannot be freed outside the owning graph, and they can only be freed once in the * owning graph. * * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the * graph which are ordered after the allocation node, but also by stream operations ordered after the * graph's execution but before the allocation is freed. * * Allocations which are not freed in the same graph can be freed by: * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; * - launching a graph with a free node for that allocation; or * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. * * It is not possible to free an allocation in both the owning graph and another graph. If the allocation * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed * in another graph, a free node can no longer be added to the owning graph. * * The following restrictions apply to graphs which contain allocation and/or memory free nodes: * - Nodes and edges of the graph cannot be deleted. * - The graph cannot be used in a child node. * - Only one instantiation of the graph may exist at any point in time. * - The graph cannot be cloned. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddMemFreeNode, * ::cuGraphMemAllocNodeGetParams, * ::cuDeviceGraphMemTrim, * ::cuDeviceGetGraphMemAttribute, * ::cuDeviceSetGraphMemAttribute, * ::cuMemAllocAsync, * ::cuMemFreeAsync, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddEventRecordNode, * ::cuGraphAddEventWaitNode, * ::cuGraphAddExternalSemaphoresSignalNode, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuGraphAddKernelNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); /** * \brief Returns a memory alloc node's parameters * * Returns the parameters of a memory alloc node \p hNode in \p params_out. * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the * node. This memory remains valid until the node is destroyed. The returned * parameters must not be modified. 
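 *
 * \note A minimal sketch reading back the allocation's address and size; the
 * helper name is illustrative and the fields follow
 * ::CUDA_MEM_ALLOC_NODE_PARAMS.
 * \code
 *     static CUresult queryAllocNode(CUgraphNode allocNode,
 *                                    CUdeviceptr *addrOut, size_t *sizeOut)
 *     {
 *         CUDA_MEM_ALLOC_NODE_PARAMS p;
 *         CUresult rc = cuGraphMemAllocNodeGetParams(allocNode, &p);
 *         if (rc == CUDA_SUCCESS) {
 *             *addrOut = p.dptr;      // fixed across instantiations and launches
 *             *sizeOut = p.bytesize;
 *         }
 *         return rc;
 *     }
 * \endcode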
* * \param hNode - Node to get the parameters for * \param params_out - Pointer to return the parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddMemAllocNode, * ::cuGraphMemFreeNodeGetParams */ CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); /** * \brief Creates a memory free node and adds it to a graph * * Creates a new memory free node and adds it to \p hGraph with \p numDependencies * dependencies specified via \p dependencies and arguments specified in \p nodeParams. * It is possible for \p numDependencies to be 0, in which case the node will be placed * at the root of the graph. \p dependencies may not have any duplicate entries. A handle * to the new node will be returned in \p phGraphNode. * * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node * \param dependencies - Dependencies of the node * \param numDependencies - Number of dependencies * \param dptr - Address of memory to free * * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: * - an allocation twice in the same graph. * - an address that was not returned by an allocation node. * - an invalid address. * * The following restrictions apply to graphs which contain allocation and/or memory free nodes: * - Nodes and edges of the graph cannot be deleted. * - The graph cannot be used in a child node. * - Only one instantiation of the graph may exist at any point in time. * - The graph cannot be cloned. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddMemAllocNode, * ::cuGraphMemFreeNodeGetParams, * ::cuDeviceGraphMemTrim, * ::cuDeviceGetGraphMemAttribute, * ::cuDeviceSetGraphMemAttribute, * ::cuMemAllocAsync, * ::cuMemFreeAsync, * ::cuGraphCreate, * ::cuGraphDestroyNode, * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddEventRecordNode, * ::cuGraphAddEventWaitNode, * ::cuGraphAddExternalSemaphoresSignalNode, * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuGraphAddKernelNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); /** * \brief Returns a memory free node's parameters * * Returns the address of a memory free node \p hNode in \p dptr_out. * * \param hNode - Node to get the parameters for * \param dptr_out - Pointer to return the device address * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddMemFreeNode, * ::cuGraphMemAllocNodeGetParams */ CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); /** * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. * * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are * freed back to the operating system. * * \param device - The device for which cached memory should be freed. 
* * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_DEVICE * * \sa * ::cuGraphAddMemAllocNode, * ::cuGraphAddMemFreeNode, * ::cuDeviceSetGraphMemAttribute, * ::cuDeviceGetGraphMemAttribute */ CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); /** * \brief Query asynchronous allocation attributes related to graphs * * Valid attributes are: * * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the * last time it was reset. High watermark can only be reset to zero. * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by * the CUDA graphs asynchronous allocator. * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by * the CUDA graphs asynchronous allocator. * * \param device - Specifies the scope of the query * \param attr - attribute to get * \param value - retrieved value * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_DEVICE * * \sa * ::cuDeviceSetGraphMemAttribute, * ::cuGraphAddMemAllocNode, * ::cuGraphAddMemFreeNode */ CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); /** * \brief Set asynchronous allocation attributes related to graphs * * Valid attributes are: * * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the * last time it was reset. High watermark can only be reset to zero. * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by * the CUDA graphs asynchronous allocator. * * \param device - Specifies the scope of the query * \param attr - attribute to get * \param value - pointer to value to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_DEVICE * * \sa * ::cuDeviceGetGraphMemAttribute, * ::cuGraphAddMemAllocNode, * ::cuGraphAddMemFreeNode */ CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); /** * \brief Clones a graph * * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. * All parameters are copied into the cloned graph. The original graph may be modified * after this call without affecting the clone. * * Child graph nodes in the original graph are recursively copied into the clone. * * \param phGraphClone - Returns newly created cloned graph * \param originalGraph - Graph to clone * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphCreate, * ::cuGraphNodeFindInClone */ CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); /** * \brief Finds a cloned version of a node * * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode * in the original graph. * * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone. * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have * been removed. The cloned node is then returned via \p phClonedNode. 
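 *
 * \note A minimal sketch, cloning a graph and locating the counterpart of one
 * of its nodes (the helper name is illustrative).
 * \code
 *     static CUresult cloneAndFind(CUgraph original, CUgraphNode nodeInOriginal,
 *                                  CUgraph *cloneOut, CUgraphNode *nodeInClone)
 *     {
 *         CUresult rc = cuGraphClone(cloneOut, original);
 *         if (rc != CUDA_SUCCESS) return rc;
 *         return cuGraphNodeFindInClone(nodeInClone, nodeInOriginal, *cloneOut);
 *     }
 * \endcode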
* * \param phNode - Returns handle to the cloned node * \param hOriginalNode - Handle to the original node * \param hClonedGraph - Cloned graph to query * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphClone */ CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); /** * \brief Returns a node's type * * Returns the node type of \p hNode in \p type. * * \param hNode - Node to query * \param type - Pointer to return the node type * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphGetNodes, * ::cuGraphGetRootNodes, * ::cuGraphChildGraphNodeGetGraph, * ::cuGraphKernelNodeGetParams, * ::cuGraphKernelNodeSetParams, * ::cuGraphHostNodeGetParams, * ::cuGraphHostNodeSetParams, * ::cuGraphMemcpyNodeGetParams, * ::cuGraphMemcpyNodeSetParams, * ::cuGraphMemsetNodeGetParams, * ::cuGraphMemsetNodeSetParams */ CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); /** * \brief Returns a graph's nodes * * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this * function will return the number of nodes in \p numNodes. Otherwise, * \p numNodes entries will be filled in. If \p numNodes is higher than the actual * number of nodes, the remaining entries in \p nodes will be set to NULL, and the * number of nodes actually obtained will be returned in \p numNodes. * * \param hGraph - Graph to query * \param nodes - Pointer to return the nodes * \param numNodes - See description * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphCreate, * ::cuGraphGetRootNodes, * ::cuGraphGetEdges, * ::cuGraphNodeGetType, * ::cuGraphNodeGetDependencies, * ::cuGraphNodeGetDependentNodes */ CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); /** * \brief Returns a graph's root nodes * * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this * function will return the number of root nodes in \p numRootNodes. Otherwise, * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the * number of nodes actually obtained will be returned in \p numRootNodes. * * \param hGraph - Graph to query * \param rootNodes - Pointer to return the root nodes * \param numRootNodes - See description * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphCreate, * ::cuGraphGetNodes, * ::cuGraphGetEdges, * ::cuGraphNodeGetType, * ::cuGraphNodeGetDependencies, * ::cuGraphNodeGetDependentNodes */ CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); /** * \brief Returns a graph's dependency edges * * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the * node in \p from[i]. \p from and \p to may both be NULL, in which * case this function only returns the number of edges in \p numEdges. Otherwise, * \p numEdges entries will be filled in. 
If \p numEdges is higher than the actual * number of edges, the remaining entries in \p from and \p to will be set to NULL, and * the number of edges actually returned will be written to \p numEdges. * * \param hGraph - Graph to get the edges from * \param from - Location to return edge endpoints * \param to - Location to return edge endpoints * \param numEdges - See description * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphGetNodes, * ::cuGraphGetRootNodes, * ::cuGraphAddDependencies, * ::cuGraphRemoveDependencies, * ::cuGraphNodeGetDependencies, * ::cuGraphNodeGetDependentNodes */ CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); /** * \brief Returns a node's dependencies * * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this * function will return the number of dependencies in \p numDependencies. Otherwise, * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the * number of nodes actually obtained will be returned in \p numDependencies. * * \param hNode - Node to query * \param dependencies - Pointer to return the dependencies * \param numDependencies - See description * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphNodeGetDependentNodes, * ::cuGraphGetNodes, * ::cuGraphGetRootNodes, * ::cuGraphGetEdges, * ::cuGraphAddDependencies, * ::cuGraphRemoveDependencies */ CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); /** * \brief Returns a node's dependent nodes * * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which * case this function will return the number of dependent nodes in \p numDependentNodes. * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is * higher than the actual number of dependent nodes, the remaining entries in * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will * be returned in \p numDependentNodes. * * \param hNode - Node to query * \param dependentNodes - Pointer to return the dependent nodes * \param numDependentNodes - See description * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphNodeGetDependencies, * ::cuGraphGetNodes, * ::cuGraphGetRootNodes, * ::cuGraphGetEdges, * ::cuGraphAddDependencies, * ::cuGraphRemoveDependencies */ CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); /** * \brief Adds dependency edges to a graph * * The number of dependencies to be added is defined by \p numDependencies * Elements in \p from and \p to at corresponding indices define a dependency. * Each node in \p from and \p to must belong to \p hGraph. * * If \p numDependencies is 0, elements in \p from and \p to will be ignored. * Specifying an existing dependency will return an error. 
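 *
 * \note A minimal sketch adding a single edge so that \p consumer runs after
 * \p producer (helper and parameter names are illustrative).
 * \code
 *     static CUresult addEdge(CUgraph graph, CUgraphNode producer,
 *                             CUgraphNode consumer)
 *     {
 *         CUgraphNode from[] = { producer };
 *         CUgraphNode to[]   = { consumer };
 *         return cuGraphAddDependencies(graph, from, to, 1);
 *     }
 * \endcode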
* * \param hGraph - Graph to which dependencies are added * \param from - Array of nodes that provide the dependencies * \param to - Array of dependent nodes * \param numDependencies - Number of dependencies to be added * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphRemoveDependencies, * ::cuGraphGetEdges, * ::cuGraphNodeGetDependencies, * ::cuGraphNodeGetDependentNodes */ CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); /** * \brief Removes dependency edges from a graph * * The number of \p dependencies to be removed is defined by \p numDependencies. * Elements in \p from and \p to at corresponding indices define a dependency. * Each node in \p from and \p to must belong to \p hGraph. * * If \p numDependencies is 0, elements in \p from and \p to will be ignored. * Specifying a non-existing dependency will return an error. * * Dependencies cannot be removed from graphs which contain allocation or free nodes. * Any attempt to do so will return an error. * * \param hGraph - Graph from which to remove dependencies * \param from - Array of nodes that provide the dependencies * \param to - Array of dependent nodes * \param numDependencies - Number of dependencies to be removed * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddDependencies, * ::cuGraphGetEdges, * ::cuGraphNodeGetDependencies, * ::cuGraphNodeGetDependentNodes */ CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); /** * \brief Remove a node from the graph * * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes * on \p hNode and vice versa. * * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. * Any attempt to do so will return an error. * * \param hNode - Node to remove * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddChildGraphNode, * ::cuGraphAddEmptyNode, * ::cuGraphAddKernelNode, * ::cuGraphAddHostNode, * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); /** * \brief Creates an executable graph from a graph * * Instantiates \p hGraph as an executable graph. The graph is validated for any * structural constraints or intra-node constraints which were not previously * validated. If instantiation is successful, a handle to the instantiated graph * is returned in \p phGraphExec. * * If there are any errors, diagnostic information may be returned in \p errorNode and * \p logBuffer. This is the primary way to inspect instantiation errors. The output * will be null terminated unless the diagnostics overflow * the buffer. In this case, they will be truncated, and the last byte can be * inspected to determine if truncation occurred. 
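 *
 * \note A minimal sketch instantiating a graph, launching it once and cleaning
 * up; \p stream is assumed to exist already and the log buffer size is
 * arbitrary.
 * \code
 *     static CUresult buildAndRun(CUgraph graph, CUstream stream)
 *     {
 *         CUgraphExec exec = NULL;
 *         CUgraphNode errNode = NULL;
 *         char log[256] = {0};
 *         CUresult rc = cuGraphInstantiate(&exec, graph, &errNode, log,
 *                                          sizeof(log));
 *         if (rc != CUDA_SUCCESS)
 *             return rc;              // 'log' may hold a (truncated) diagnostic
 *         rc = cuGraphLaunch(exec, stream);
 *         if (rc == CUDA_SUCCESS)
 *             rc = cuStreamSynchronize(stream);
 *         cuGraphExecDestroy(exec);
 *         return rc;
 *     }
 * \endcode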
* * \param phGraphExec - Returns instantiated graph * \param hGraph - Graph to instantiate * \param phErrorNode - In case of an instantiation error, this may be modified to * indicate a node contributing to the error * \param logBuffer - A character buffer to store diagnostic messages * \param bufferSize - Size of the log buffer in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphInstantiateWithFlags, * ::cuGraphCreate, * ::cuGraphUpload, * ::cuGraphLaunch, * ::cuGraphExecDestroy */ CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); /** * \brief Creates an executable graph from a graph * * Instantiates \p hGraph as an executable graph. The graph is validated for any * structural constraints or intra-node constraints which were not previously * validated. If instantiation is successful, a handle to the instantiated graph * is returned in \p phGraphExec. * * The \p flags parameter controls the behavior of instantiation and subsequent * graph launches. Valid flags are: * * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a * graph containing memory allocation nodes to automatically free any * unfreed memory allocations before the graph is relaunched. * * If \p hGraph contains any allocation or free nodes, there can be at most one * executable graph in existence for that graph at a time. * * An attempt to instantiate a second executable graph before destroying the first * with ::cuGraphExecDestroy will result in an error. * * \param phGraphExec - Returns instantiated graph * \param hGraph - Graph to instantiate * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphInstantiate, * ::cuGraphCreate, * ::cuGraphUpload, * ::cuGraphLaunch, * ::cuGraphExecDestroy */ CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); /** * \brief Sets the parameters for a kernel node in the given graphExec * * Sets the parameters of a kernel node in an executable graph \p hGraphExec. * The node is identified by the corresponding node \p hNode in the * non-executable graph, from which the executable graph was instantiated. * * \p hNode must not have been removed from the original graph. All \p nodeParams * fields may change, but the following restrictions apply to \p func updates: * * - The owning context of the function cannot change. * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated * to a function which uses CDP * * The modifications only affect future launches of \p hGraphExec. Already * enqueued or running launches of \p hGraphExec are not affected by this call. * \p hNode is also not modified by this call. 
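 *
 * \note A minimal sketch re-reading the node's current parameters and pushing
 * an updated launch geometry into the instantiated graph (the helper name is
 * illustrative).
 * \code
 *     static CUresult updateGridSize(CUgraphExec exec, CUgraphNode kernelNode,
 *                                    unsigned int newGridDimX)
 *     {
 *         CUDA_KERNEL_NODE_PARAMS p;
 *         CUresult rc = cuGraphKernelNodeGetParams(kernelNode, &p);
 *         if (rc != CUDA_SUCCESS) return rc;
 *         p.gridDimX = newGridDimX;   // same func, new grid size
 *         return cuGraphExecKernelNodeSetParams(exec, kernelNode, &p);
 *     }
 * \endcode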
* * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - kernel node from the graph from which graphExec was instantiated * \param nodeParams - Updated Parameters to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddKernelNode, * ::cuGraphKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); /** * \brief Sets the parameters for a memcpy node in the given graphExec. * * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had * contained \p copyParams at instantiation. hNode must remain in the graph which was * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. * * The source and destination memory in \p copyParams must be allocated from the same * contexts as the original source and destination memory. Both the instantiation-time * memory operands and the memory operands in \p copyParams must be 1-dimensional. * Zero-length operations are not supported. * * The modifications only affect future launches of \p hGraphExec. Already enqueued * or running launches of \p hGraphExec are not affected by this call. hNode is also * not modified by this call. * * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or * either the original or new memory operands are multidimensional. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - Memcpy node from the graph which was used to instantiate graphExec * \param copyParams - The updated parameters to set * \param ctx - Context on which to run the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddMemcpyNode, * ::cuGraphMemcpyNodeSetParams, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); /** * \brief Sets the parameters for a memset node in the given graphExec. * * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had * contained \p memsetParams at instantiation. hNode must remain in the graph which was * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. * * The destination memory in \p memsetParams must be allocated from the same * contexts as the original destination memory. Both the instantiation-time * memory operand and the memory operand in \p memsetParams must be 1-dimensional. * Zero-length operations are not supported. * * The modifications only affect future launches of \p hGraphExec. 
Already enqueued * or running launches of \p hGraphExec are not affected by this call. hNode is also * not modified by this call. * * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or * either the original or new memory operand are multidimensional. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - Memset node from the graph which was used to instantiate graphExec * \param memsetParams - The updated parameters to set * \param ctx - Context on which to run the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddMemsetNode, * ::cuGraphMemsetNodeSetParams, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); /** * \brief Sets the parameters for a host node in the given graphExec. * * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had * contained \p nodeParams at instantiation. hNode must remain in the graph which was * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. * * The modifications only affect future launches of \p hGraphExec. Already enqueued * or running launches of \p hGraphExec are not affected by this call. hNode is also * not modified by this call. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - Host node from the graph which was used to instantiate graphExec * \param nodeParams - The updated parameters to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddHostNode, * ::cuGraphHostNodeSetParams, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); /** * \brief Updates node parameters in the child graph node in the given graphExec. * * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. * Changed edges to and from \p hNode are ignored. * * The modifications only affect future launches of \p hGraphExec. Already enqueued * or running launches of \p hGraphExec are not affected by this call. \p hNode is also * not modified by this call. * * The topology of \p childGraph, as well as the node insertion order, must match that * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions * on what can be updated in an instantiated graph. 
The update is recursive, so child graph * nodes contained within the top level child graph will also be updated. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - Host node from the graph which was used to instantiate graphExec * \param childGraph - The graph supplying the updated parameters * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddChildGraphNode, * ::cuGraphChildGraphNodeGetGraph, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); /** * \brief Sets the event for an event record node in the given graphExec * * Sets the event of an event record node in an executable graph \p hGraphExec. * The node is identified by the corresponding node \p hNode in the * non-executable graph, from which the executable graph was instantiated. * * The modifications only affect future launches of \p hGraphExec. Already * enqueued or running launches of \p hGraphExec are not affected by this call. * \p hNode is also not modified by this call. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - event record node from the graph from which graphExec was instantiated * \param event - Updated event to use * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventRecordNode, * ::cuGraphEventRecordNodeGetEvent, * ::cuGraphEventWaitNodeSetEvent, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); /** * \brief Sets the event for an event wait node in the given graphExec * * Sets the event of an event wait node in an executable graph \p hGraphExec. * The node is identified by the corresponding node \p hNode in the * non-executable graph, from which the executable graph was instantiated. * * The modifications only affect future launches of \p hGraphExec. Already * enqueued or running launches of \p hGraphExec are not affected by this call. * \p hNode is also not modified by this call. 
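 *
 * A minimal sketch, for illustration only, of retargeting an event wait node in an
 * instantiated graph; the handles \p hGraphExec and \p hNode, as well as all error
 * handling, are assumed to come from the surrounding application code:
 * \code
 * CUevent newEvent;
 * cuEventCreate(&newEvent, CU_EVENT_DEFAULT);
 * // Future launches of hGraphExec will wait on newEvent instead of the event
 * // that was set in the node at instantiation time.
 * cuGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, newEvent);
 * \endcode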
* * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - event wait node from the graph from which graphExec was instantiated * \param event - Updated event to use * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddEventWaitNode, * ::cuGraphEventWaitNodeGetEvent, * ::cuGraphEventRecordNodeSetEvent, * ::cuEventRecordWithFlags, * ::cuStreamWaitEvent, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); /** * \brief Sets the parameters for an external semaphore signal node in the given graphExec * * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. * The node is identified by the corresponding node \p hNode in the * non-executable graph, from which the executable graph was instantiated. * * \p hNode must not have been removed from the original graph. * * The modifications only affect future launches of \p hGraphExec. Already * enqueued or running launches of \p hGraphExec are not affected by this call. * \p hNode is also not modified by this call. * * Changing \p nodeParams->numExtSems is not supported. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - semaphore signal node from the graph from which graphExec was instantiated * \param nodeParams - Updated Parameters to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddExternalSemaphoresSignalNode, * ::cuImportExternalSemaphore, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); /** * \brief Sets the parameters for an external semaphore wait node in the given graphExec * * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. * The node is identified by the corresponding node \p hNode in the * non-executable graph, from which the executable graph was instantiated. * * \p hNode must not have been removed from the original graph. * * The modifications only affect future launches of \p hGraphExec. Already * enqueued or running launches of \p hGraphExec are not affected by this call. * \p hNode is also not modified by this call. * * Changing \p nodeParams->numExtSems is not supported. 
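 *
 * As an illustrative sketch only (an imported fence-type semaphore \p extSem and the
 * handles \p hGraphExec and \p hNode are assumed to already exist), the wait value of
 * a single-semaphore node could be updated as follows:
 * \code
 * CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { 0 };
 * waitParams.params.fence.value = 2;   // new fence value to wait for
 *
 * CUDA_EXT_SEM_WAIT_NODE_PARAMS nodeParams = { 0 };
 * nodeParams.extSemArray = &extSem;    // numExtSems must match the original node
 * nodeParams.paramsArray = &waitParams;
 * nodeParams.numExtSems  = 1;
 *
 * cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, &nodeParams);
 * \endcode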
* * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - semaphore wait node from the graph from which graphExec was instantiated * \param nodeParams - Updated Parameters to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphAddExternalSemaphoresWaitNode, * ::cuImportExternalSemaphore, * ::cuSignalExternalSemaphoresAsync, * ::cuWaitExternalSemaphoresAsync, * ::cuGraphExecKernelNodeSetParams, * ::cuGraphExecMemcpyNodeSetParams, * ::cuGraphExecMemsetNodeSetParams, * ::cuGraphExecHostNodeSetParams, * ::cuGraphExecChildGraphNodeSetParams, * ::cuGraphExecEventRecordNodeSetEvent, * ::cuGraphExecEventWaitNodeSetEvent, * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, * ::cuGraphExecUpdate, * ::cuGraphInstantiate */ CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); /** * \brief Enables or disables the specified node in the given graphExec * * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent * to empty nodes until they are reenabled. Existing node parameters are not affected by * disabling/enabling the node. * * The node is identified by the corresponding node \p hNode in the non-executable * graph, from which the executable graph was instantiated. * * \p hNode must not have been removed from the original graph. * * The modifications only affect future launches of \p hGraphExec. Already * enqueued or running launches of \p hGraphExec are not affected by this call. * \p hNode is also not modified by this call. * * \note Currently only kernel nodes are supported. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - Node from the graph from which graphExec was instantiated * \param isEnabled - Node is enabled if != 0, otherwise the node is disabled * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphNodeGetEnabled, * ::cuGraphExecUpdate, * ::cuGraphInstantiate * ::cuGraphLaunch */ CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled); /** * \brief Query whether a node in the given graphExec is enabled * * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled. * * The node is identified by the corresponding node \p hNode in the non-executable * graph, from which the executable graph was instantiated. * * \p hNode must not have been removed from the original graph. * * \note Currently only kernel nodes are supported. * * \param hGraphExec - The executable graph in which to set the specified node * \param hNode - Node from the graph from which graphExec was instantiated * \param isEnabled - Location to return the enabled status of the node * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphNodeSetEnabled, * ::cuGraphExecUpdate, * ::cuGraphInstantiate * ::cuGraphLaunch */ CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled); /** * \brief Uploads an executable graph in a stream * * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of * the same \p hGraphExec will be serialized. Each upload is ordered behind both any * previous work in \p hStream and any previous launches of \p hGraphExec. 
* Uses memory cached by \p stream to back the allocations owned by \p hGraphExec. * * \param hGraphExec - Executable graph to upload * \param hStream - Stream in which to upload the graph * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphInstantiate, * ::cuGraphLaunch, * ::cuGraphExecDestroy */ CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream); /** * \brief Launches an executable graph in a stream * * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing * at a time. Each launch is ordered behind both any previous work in \p hStream * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be * instantiated multiple times into multiple executable graphs. * * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, * the launch will fail with ::CUDA_ERROR_INVALID_VALUE. * * \param hGraphExec - Executable graph to launch * \param hStream - Stream in which to launch the graph * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphInstantiate, * ::cuGraphUpload, * ::cuGraphExecDestroy */ CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); /** * \brief Destroys an executable graph * * Destroys the executable graph specified by \p hGraphExec, as well * as all of its executable nodes. If the executable graph is * in-flight, it will not be terminated, but rather freed * asynchronously on completion. * * \param hGraphExec - Executable graph to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphInstantiate, * ::cuGraphUpload, * ::cuGraphLaunch */ CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); /** * \brief Destroys a graph * * Destroys the graph specified by \p hGraph, as well as all of its nodes. * * \param hGraph - Graph to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphCreate */ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); /** * \brief Check whether an executable graph can be updated with a graph and perform the update if possible * * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the * node parameters in a topologically identical graph specified by \p hGraph. * * Limitations: * * - Kernel nodes: * - The owning context of the function cannot change. * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated * to a function which uses CDP. * - A cooperative node cannot be updated to a non-cooperative node, and vice-versa. * - Memset and memcpy nodes: * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. * - The source/destination memory must be allocated from the same contexts as the original * source/destination memory. * - Only 1D memsets can be changed. * - Additional memcpy node restrictions: * - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE, * CU_MEMORYTYPE_ARRAY, etc.) is not supported. 
* - External semaphore wait nodes and record nodes: * - Changing the number of semaphores is not supported. * * Note: The API may add further restrictions in future releases. The return code should always be checked. * * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under * the following conditions: * * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out * is NULL. * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out * is NULL. * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is * the pairless node from \p hGraph. * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph. * * cuGraphExecUpdate sets \p updateResult_out to: * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value. * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed. * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case * \p hErrorNode_out is set to the node from \p hGraph. * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported * way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph. * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like * the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph. * * If \p updateResult_out isn't set in one of the situations described above, the update check passes * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise, * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS. * * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included * changes which violated constraints specific to instantiated graph update. * * \param hGraphExec The instantiated graph to be updated * \param hGraph The graph containing the updated parameters * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any * \param updateResult_out Whether the graph update was permitted. If it was forbidden, the reason why * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, * \note_graph_thread_safety * \notefnerr * * \sa * ::cuGraphInstantiate, */ CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); /** * \brief Copies attributes from source node to destination node. * * Copies attributes from source node \p src to destination node \p dst. * Both nodes must have the same context. 
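 *
 * A minimal, illustrative sketch; the kernel nodes \p srcNode and \p dstNode are
 * placeholders for nodes created by the surrounding application in the same context:
 * \code
 * // Give dstNode the same attributes (e.g. its CUaccessPolicyWindow) as srcNode,
 * // then read one attribute back for inspection.
 * CUkernelNodeAttrValue value;
 * cuGraphKernelNodeCopyAttributes(dstNode, srcNode);
 * cuGraphKernelNodeGetAttribute(dstNode, CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW, &value);
 * \endcode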
* * \param[out] dst Destination node * \param[in] src Source node * For list of attributes see ::CUkernelNodeAttrID * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::CUaccessPolicyWindow */ CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src); /** * \brief Queries node attribute. * * Queries attribute \p attr from node \p hNode and stores it in corresponding * member of \p value_out. * * \param[in] hNode * \param[in] attr * \param[out] value_out * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa * ::CUaccessPolicyWindow */ CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue *value_out); /** * \brief Sets node attribute. * * Sets attribute \p attr on node \p hNode from corresponding attribute of * \p value. * * \param[out] hNode * \param[in] attr * \param[out] value * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa * ::CUaccessPolicyWindow */ CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue *value); /** * \brief Write a DOT file describing graph structure * * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. * \p flags can be specified to write more detailed information about each node type such as * parameter values, kernel attributes, node and function handles. * * \param hGraph - The graph to create a DOT file from * \param path - The path to write the DOT file to * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OPERATING_SYSTEM */ CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); /** * \brief Create a user object * * Create a user object with the specified destructor callback and initial reference count. The * initial references are owned by the caller. * * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they * are executed by a shared internal thread. Another thread may be signaled to perform such * actions, if it does not block forward progress of tasks scheduled through CUDA. * * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. * * \param object_out - Location to return the user object handle * \param ptr - The pointer to pass to the destroy function * \param destroy - Callback to free the user object when it is no longer in use * \param initialRefcount - The initial refcount to create the object with, typically 1. The * initial references are owned by the calling thread. * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, * which is the only defined flag. This indicates that the destroy * callback cannot be waited on by any CUDA API. Users requiring * synchronization of the callback should signal its completion * manually. 
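 *
 * A minimal, illustrative sketch of wrapping a heap allocation in a user object; the
 * callback name and the wrapped pointer are placeholders, and error handling is omitted:
 * \code
 * // Destructor callbacks must not call CUDA APIs and should not block.
 * void myDeleter(void *ptr) { free(ptr); }
 *
 * CUuserObject obj;
 * void *state = malloc(128);
 * cuUserObjectCreate(&obj, state, myDeleter, 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC);
 * \endcode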
* * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuUserObjectRetain, * ::cuUserObjectRelease, * ::cuGraphRetainUserObject, * ::cuGraphReleaseUserObject, * ::cuGraphCreate */ CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags); /** * \brief Retain a reference to a user object * * Retains new references to a user object. The new references are owned by the caller. * * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. * * \param object - The object to retain * \param count - The number of references to retain, typically 1. Must be nonzero * and not larger than INT_MAX. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuUserObjectCreate, * ::cuUserObjectRelease, * ::cuGraphRetainUserObject, * ::cuGraphReleaseUserObject, * ::cuGraphCreate */ CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); /** * \brief Release a reference to a user object * * Releases user object references owned by the caller. The object's destructor is invoked if * the reference count reaches zero. * * It is undefined behavior to release references not owned by the caller, or to use a user * object handle after all references are released. * * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. * * \param object - The object to release * \param count - The number of references to release, typically 1. Must be nonzero * and not larger than INT_MAX. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuUserObjectCreate, * ::cuUserObjectRetain, * ::cuGraphRetainUserObject, * ::cuGraphReleaseUserObject, * ::cuGraphCreate */ CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); /** * \brief Retain a reference to a user object from a graph * * Creates or moves user object references that will be owned by a CUDA graph. * * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. * * \param graph - The graph to associate the reference with * \param object - The user object to retain a reference for * \param count - The number of references to add to the graph, typically 1. Must be * nonzero and not larger than INT_MAX. * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references * from the calling thread, rather than create new references. Pass 0 * to create new references. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuUserObjectCreate, * ::cuUserObjectRetain, * ::cuUserObjectRelease, * ::cuGraphReleaseUserObject, * ::cuGraphCreate */ CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); /** * \brief Release a user object reference from a graph * * Releases user object references owned by a graph. * * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. * * \param graph - The graph that will release the reference * \param object - The user object to release a reference for * \param count - The number of references to release, typically 1. Must be nonzero * and not larger than INT_MAX. 
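 *
 * An illustrative sketch only; \p graph and \p object are assumed to already exist and
 * the caller is assumed to own one reference to \p object:
 * \code
 * // Move the caller's reference to the graph ...
 * cuGraphRetainUserObject(graph, object, 1, CU_GRAPH_USER_OBJECT_MOVE);
 * // ... and later have the graph give that reference up again.
 * cuGraphReleaseUserObject(graph, object, 1);
 * \endcode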
* * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuUserObjectCreate, * ::cuUserObjectRetain, * ::cuUserObjectRelease, * ::cuGraphRetainUserObject, * ::cuGraphCreate */ CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); /** @} */ /* END CUDA_GRAPH */ /** * \defgroup CUDA_OCCUPANCY Occupancy * * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the occupancy calculation functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Returns occupancy of a function * * Returns in \p *numBlocks the maximum number of active blocks per * streaming multiprocessor. * * \param numBlocks - Returned occupancy * \param func - Kernel for which occupancy is calculated * \param blockSize - Block size the kernel is intended to be launched with * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor */ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); /** * \brief Returns occupancy of a function * * Returns in \p *numBlocks the maximum number of active blocks per * streaming multiprocessor. * * The \p Flags parameter controls how special cases are handled. The * valid flags are: * * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; * * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the * default behavior on platforms where global caching affects * occupancy. On such platforms, if caching is enabled, but * per-block SM resource usage would result in zero occupancy, the * occupancy calculator will calculate the occupancy as if caching * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes * the occupancy calculator return 0 in such cases. More information * about this feature can be found in the "Unified L1/Texture Cache" * section of the Maxwell tuning guide. * * \param numBlocks - Returned occupancy * \param func - Kernel for which occupancy is calculated * \param blockSize - Block size the kernel is intended to be launched with * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes * \param flags - Requested behavior for the occupancy calculator * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags */ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); /** * \brief Suggest a launch configuration with reasonable occupancy * * Returns in \p *blockSize a reasonable block size that can achieve * the maximum occupancy (or, the maximum number of active warps with * the fewest blocks per multiprocessor), and in \p *minGridSize the * minimum grid size to achieve the maximum occupancy. * * If \p blockSizeLimit is 0, the configurator will use the maximum * block size permitted by the device / function instead. 
* * If per-block dynamic shared memory allocation is not needed, the * user should leave both \p blockSizeToDynamicSMemSize and \p * dynamicSMemSize as 0. * * If per-block dynamic shared memory allocation is needed, then if * the dynamic shared memory size is constant regardless of block * size, the size should be passed through \p dynamicSMemSize, and \p * blockSizeToDynamicSMemSize should be NULL. * * Otherwise, if the per-block dynamic shared memory size varies with * different block sizes, the user needs to provide a unary function * through \p blockSizeToDynamicSMemSize that computes the dynamic * shared memory needed by \p func for any given block size. \p * dynamicSMemSize is ignored. An example signature is: * * \code * // Takes block size, returns dynamic shared memory needed * size_t blockToSmem(int blockSize); * \endcode * * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy * \param blockSize - Returned maximum block size that can achieve the maximum occupancy * \param func - Kernel for which launch configuration is calculated * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes * \param blockSizeLimit - The maximum block size \p func is designed to handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa * ::cudaOccupancyMaxPotentialBlockSize */ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); /** * \brief Suggest a launch configuration with reasonable occupancy * * An extended version of ::cuOccupancyMaxPotentialBlockSize. In * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags * parameter. * * The \p Flags parameter controls how special cases are handled. The * valid flags are: * * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as * ::cuOccupancyMaxPotentialBlockSize; * * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the * default behavior on platforms where global caching affects * occupancy. On such platforms, the launch configuration that * produces maximal occupancy might not support global * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE * guarantees that the produced launch configuration is global * caching compatible, at a potential cost of occupancy. More information * about this feature can be found in the "Unified L1/Texture Cache" * section of the Maxwell tuning guide. 
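 *
 * A minimal sketch, assuming a kernel handle \p hKernel that uses no dynamic shared
 * memory and has no particular block-size limit:
 * \code
 * int minGridSize = 0, blockSize = 0;
 * cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, hKernel,
 *                                           NULL, 0, 0,
 *                                           CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE);
 * \endcode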
* * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy * \param blockSize - Returned maximum block size that can achieve the maximum occupancy * \param func - Kernel for which launch configuration is calculated * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes * \param blockSizeLimit - The maximum block size \p func is designed to handle * \param flags - Options * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa * ::cudaOccupancyMaxPotentialBlockSizeWithFlags */ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); /** * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM * * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. * * \param dynamicSmemSize - Returned maximum dynamic shared memory * \param func - Kernel function for which occupancy is calculated * \param numBlocks - Number of blocks to fit on SM * \param blockSize - Size of the blocks * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa */ CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); /** @} */ /* END CUDA_OCCUPANCY */ /** * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] * * ___MANBRIEF___ deprecated texture reference management functions of the * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the deprecated texture reference management * functions of the low-level CUDA driver application programming interface. * * @{ */ /** * \brief Binds an array as a texture reference * * \deprecated * * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. \p Flags must be set to * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is * unbound. * * \param hTexRef - Texture reference to bind * \param hArray - Array to bind * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); /** * \brief Binds a mipmapped array to a texture reference * * \deprecated * * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. 
* Any previous address or CUDA array state associated with the texture reference * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. * Any CUDA array previously bound to \p hTexRef is unbound. * * \param hTexRef - Texture reference to bind * \param hMipmappedArray - Mipmapped array to bind * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); /** * \brief Binds an address as a texture reference * * \deprecated * * Binds a linear address range to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. Any memory previously bound to \p hTexRef * is unbound. * * Since the hardware enforces an alignment requirement on texture base * addresses, ::cuTexRefSetAddress() passes back a byte offset in * \p *ByteOffset that must be applied to texture fetches in order to read from * the desired memory. This offset must be divided by the texel size and * passed to kernels that read from the texture so they can be applied to the * ::tex1Dfetch() function. * * If the device memory pointer was returned from ::cuMemAlloc(), the offset * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. * * The total number of elements (or texels) in the linear address range * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. * The number of elements is computed as (\p bytes / bytesPerElement), * where bytesPerElement is determined from the data format and number of * components set using ::cuTexRefSetFormat(). * * \param ByteOffset - Returned byte offset * \param hTexRef - Texture reference to bind * \param dptr - Device pointer to bind * \param bytes - Size of memory to bind in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTexture */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); /** * \brief Binds an address as a 2D texture reference * * \deprecated * * Binds a linear address range to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. Any memory previously bound to \p hTexRef * is unbound. * * Using a ::tex2D() function inside a kernel requires a call to either * ::cuTexRefSetArray() to bind the corresponding texture reference to an * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear * memory. 
* * Function calls to ::cuTexRefSetFormat() cannot follow calls to * ::cuTexRefSetAddress2D() for the same texture reference. * * It is required that \p dptr be aligned to the appropriate hardware-specific * texture alignment. You can query this value using the device attribute * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. * * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. * This value can be queried using the device attribute * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. * * Width and Height, which are specified in elements (or texels), cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. * \p Pitch, which is specified in bytes, cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. * * \param hTexRef - Texture reference to bind * \param desc - Descriptor of CUDA array * \param dptr - Device pointer to bind * \param Pitch - Line pitch in bytes * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTexture2D */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); /** * \brief Sets the format for a texture reference * * \deprecated * * Specifies the format of the data to be read by the texture reference * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: * They specify the format of each component and the number of components per * array element. * * \param hTexRef - Texture reference * \param fmt - Format to set * \param NumPackedComponents - Number of components per array element * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaCreateChannelDesc, * ::cudaBindTexture, * ::cudaBindTexture2D, * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); /** * \brief Sets the addressing mode for a texture reference * * \deprecated * * Specifies the addressing mode \p am for the given dimension \p dim of the * texture reference \p hTexRef. If \p dim is zero, the addressing mode is * applied to the first parameter of the functions used to fetch from the * texture; if \p dim is 1, the second, and so on. 
::CUaddress_mode is defined * as: * \code typedef enum CUaddress_mode_enum { CU_TR_ADDRESS_MODE_WRAP = 0, CU_TR_ADDRESS_MODE_CLAMP = 1, CU_TR_ADDRESS_MODE_MIRROR = 2, CU_TR_ADDRESS_MODE_BORDER = 3 } CUaddress_mode; * \endcode * * Note that this call has no effect if \p hTexRef is bound to linear memory. * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. * * \param hTexRef - Texture reference * \param dim - Dimension * \param am - Addressing mode to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTexture, * ::cudaBindTexture2D, * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); /** * \brief Sets the filtering mode for a texture reference * * \deprecated * * Specifies the filtering mode \p fm to be used when reading memory through * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: * * \code typedef enum CUfilter_mode_enum { CU_TR_FILTER_MODE_POINT = 0, CU_TR_FILTER_MODE_LINEAR = 1 } CUfilter_mode; * \endcode * * Note that this call has no effect if \p hTexRef is bound to linear memory. * * \param hTexRef - Texture reference * \param fm - Filtering mode to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); /** * \brief Sets the mipmap filtering mode for a texture reference * * \deprecated * * Specifies the mipmap filtering mode \p fm to be used when reading memory through * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: * * \code typedef enum CUfilter_mode_enum { CU_TR_FILTER_MODE_POINT = 0, CU_TR_FILTER_MODE_LINEAR = 1 } CUfilter_mode; * \endcode * * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
* * \param hTexRef - Texture reference * \param fm - Filtering mode to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); /** * \brief Sets the mipmap level bias for a texture reference * * \deprecated * * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when * reading memory through the texture reference \p hTexRef. * * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. * * \param hTexRef - Texture reference * \param bias - Mipmap level bias * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); /** * \brief Sets the mipmap min/max mipmap level clamps for a texture reference * * \deprecated * * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp * respectively, to be used when reading memory through the texture reference * \p hTexRef. * * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. * * \param hTexRef - Texture reference * \param minMipmapLevelClamp - Mipmap min level clamp * \param maxMipmapLevelClamp - Mipmap max level clamp * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); /** * \brief Sets the maximum anisotropy for a texture reference * * \deprecated * * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through * the texture reference \p hTexRef. * * Note that this call has no effect if \p hTexRef is bound to linear memory. 
* * \param hTexRef - Texture reference * \param maxAniso - Maximum anisotropy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); /** * \brief Sets the border color for a texture reference * * \deprecated * * Specifies the value of the RGBA border color via \p pBorderColor for the texture reference * \p hTexRef. The color value is of type float and holds color components in * the following sequence: * pBorderColor[0] holds 'R' component * pBorderColor[1] holds 'G' component * pBorderColor[2] holds 'B' component * pBorderColor[3] holds 'A' component * * Note that the color values can be set only when the address mode is set to * ::CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. * Applications using integer border color values have to "reinterpret_cast" their values to float. * * \param hTexRef - Texture reference * \param pBorderColor - RGBA color * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddressMode, * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, * ::cudaBindTexture, * ::cudaBindTexture2D, * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); /** * \brief Sets the flags for a texture reference * * \deprecated * * Specifies optional flags via \p Flags to control the behavior of data * returned through the texture reference \p hTexRef. The valid flags are: * * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of * having the texture promote integer data to floating point data in the * range [0, 1]. Note that textures with a 32-bit integer format * are not promoted, regardless of whether or not this * flag is specified; * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the * default behavior of having the texture coordinates range * from [0, Dim) where Dim is the width or height of the CUDA * array. Instead, the texture coordinates [0, 1.0) reference * the entire breadth of the array dimension; * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear * filtering optimizations. Trilinear optimizations improve texture filtering * performance by allowing bilinear filtering on textures in scenarios where * it can closely approximate the expected results. 
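 *
 * An illustrative sketch for this deprecated texture-reference path; \p hTexRef is
 * assumed to have been obtained from ::cuModuleGetTexRef:
 * \code
 * // Read raw integer texels and address the texture with normalized [0, 1.0) coordinates.
 * cuTexRefSetFlags(hTexRef, CU_TRSF_READ_AS_INTEGER | CU_TRSF_NORMALIZED_COORDINATES);
 * \endcode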
* * \param hTexRef - Texture reference * \param Flags - Optional flags to set * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTexture, * ::cudaBindTexture2D, * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); /** * \brief Gets the address associated with a texture reference * * \deprecated * * Returns in \p *pdptr the base address bound to the texture reference * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any device memory range. * * \param pdptr - Returned device address * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); /** * \brief Gets the array bound to a texture reference * * \deprecated * * Returns in \p *phArray the CUDA array bound to the texture reference * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any CUDA array. * * \param phArray - Returned array * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); /** * \brief Gets the mipmapped array bound to a texture reference * * \deprecated * * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any CUDA mipmapped array. 
* * \param phMipmappedArray - Returned mipmapped array * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); /** * \brief Gets the addressing mode used by a texture reference * * \deprecated * * Returns in \p *pam the addressing mode corresponding to the * dimension \p dim of the texture reference \p hTexRef. Currently, the only * valid value for \p dim are 0 and 1. * * \param pam - Returned addressing mode * \param hTexRef - Texture reference * \param dim - Dimension * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); /** * \brief Gets the filter-mode used by a texture reference * * \deprecated * * Returns in \p *pfm the filtering mode of the texture reference * \p hTexRef. * * \param pfm - Returned filtering mode * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); /** * \brief Gets the format used by a texture reference * * \deprecated * * Returns in \p *pFormat and \p *pNumChannels the format and number * of components of the CUDA array bound to the texture reference \p hTexRef. * If \p pFormat or \p pNumChannels is NULL, it will be ignored. * * \param pFormat - Returned format * \param pNumChannels - Returned number of components * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); /** * \brief Gets the mipmap filtering mode for a texture reference * * \deprecated * * Returns the mipmap filtering mode in \p pfm that's used when reading memory through * the texture reference \p hTexRef. 
* * \param pfm - Returned mipmap filtering mode * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); /** * \brief Gets the mipmap level bias for a texture reference * * \deprecated * * Returns the mipmap level bias in \p pBias that's added to the specified mipmap * level when reading memory through the texture reference \p hTexRef. * * \param pbias - Returned mipmap level bias * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); /** * \brief Gets the min/max mipmap level clamps for a texture reference * * \deprecated * * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp * that's used when reading memory through the texture reference \p hTexRef. * * \param pminMipmapLevelClamp - Returned mipmap min level clamp * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); /** * \brief Gets the maximum anisotropy for a texture reference * * \deprecated * * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through * the texture reference \p hTexRef. * * \param pmaxAniso - Returned maximum anisotropy * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); /** * \brief Gets the border color used by a texture reference * * \deprecated * * Returns in \p pBorderColor, values of the RGBA color used by * the texture reference \p hTexRef. 
* The color value is of type float and holds color components in * the following sequence: * pBorderColor[0] holds 'R' component * pBorderColor[1] holds 'G' component * pBorderColor[2] holds 'B' component * pBorderColor[3] holds 'A' component * * \param hTexRef - Texture reference * \param pBorderColor - Returned Type and Value of RGBA color * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddressMode, * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); /** * \brief Gets the flags used by a texture reference * * \deprecated * * Returns in \p *pFlags the flags of the texture reference \p hTexRef. * * \param pFlags - Returned flags * \param hTexRef - Texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); /** * \brief Creates a texture reference * * \deprecated * * Creates a texture reference and returns its handle in \p *pTexRef. Once * created, the application must call ::cuTexRefSetArray() or * ::cuTexRefSetAddress() to associate the reference with allocated memory. * Other texture reference functions are used to specify the format and * interpretation (addressing, filtering, etc.) to be used when the memory is * read through this texture reference. * * \param pTexRef - Returned texture reference * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefDestroy */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); /** * \brief Destroys a texture reference * * \deprecated * * Destroys the texture reference specified by \p hTexRef. * * \param hTexRef - Texture reference to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefCreate */ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); /** @} */ /* END CUDA_TEXREF_DEPRECATED */ /** * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] * * ___MANBRIEF___ surface reference management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the surface reference management functions of the * low-level CUDA driver application programming interface. * * @{ */ /** * \brief Sets the CUDA array for a surface reference. * * \deprecated * * Sets the CUDA array \p hArray to be read and written by the surface reference * \p hSurfRef. Any previous CUDA array state associated with the surface * reference is superseded by this function. \p Flags must be set to 0. * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. * Any CUDA array previously bound to \p hSurfRef is unbound. 
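 *
 * A minimal usage sketch (assuming \p hmod is a loaded module that declares a
 * surface named "surf" and \p hArray was created with the
 * ::CUDA_ARRAY3D_SURFACE_LDST flag; error checking omitted):
 * \code
    CUsurfref surfRef;
    CUarray boundArray;

    cuModuleGetSurfRef(&surfRef, hmod, "surf");  // surface reference declared by the module
    cuSurfRefSetArray(surfRef, hArray, 0);       // Flags must be 0
    cuSurfRefGetArray(&boundArray, surfRef);     // boundArray now equals hArray
 * \endcode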
* \param hSurfRef - Surface reference handle * \param hArray - CUDA array handle * \param Flags - set to 0 * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuModuleGetSurfRef, * ::cuSurfRefGetArray, * ::cudaBindSurfaceToArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); /** * \brief Passes back the CUDA array bound to a surface reference. * * \deprecated * * Returns in \p *phArray the CUDA array bound to the surface reference * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference * is not bound to any CUDA array. * \param phArray - Surface reference handle * \param hSurfRef - Surface reference handle * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray */ __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); /** @} */ /* END CUDA_SURFREF_DEPRECATED */ /** * \defgroup CUDA_TEXOBJECT Texture Object Management * * ___MANBRIEF___ texture object management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the texture object management functions of the * low-level CUDA driver application programming interface. The texture * object API is only supported on devices of compute capability 3.0 or higher. * * @{ */ /** * \brief Creates a texture object * * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes * the data to texture from. \p pTexDesc describes how the data should be sampled. * \p pResViewDesc is an optional argument that specifies an alternate format for * the data described by \p pResDesc, and also describes the subresource region * to restrict access to when texturing. \p pResViewDesc can only be specified if * the type of resource is a CUDA array or a CUDA mipmapped array. * * Texture objects are only supported on devices of compute capability 3.0 or higher. * Additionally, a texture object is an opaque value, and, as such, should only be * accessed through CUDA API calls. * * The ::CUDA_RESOURCE_DESC structure is defined as: * \code typedef struct CUDA_RESOURCE_DESC_st { CUresourcetype resType; union { struct { CUarray hArray; } array; struct { CUmipmappedArray hMipmappedArray; } mipmap; struct { CUdeviceptr devPtr; CUarray_format format; unsigned int numChannels; size_t sizeInBytes; } linear; struct { CUdeviceptr devPtr; CUarray_format format; unsigned int numChannels; size_t width; size_t height; size_t pitchInBytes; } pitch2D; } res; unsigned int flags; } CUDA_RESOURCE_DESC; * \endcode * where: * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. * CUresourceType is defined as: * \code typedef enum CUresourcetype_enum { CU_RESOURCE_TYPE_ARRAY = 0x00, CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, CU_RESOURCE_TYPE_LINEAR = 0x02, CU_RESOURCE_TYPE_PITCH2D = 0x03 } CUresourcetype; * \endcode * * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray * must be set to a valid CUDA array handle. * * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray * must be set to a valid CUDA mipmapped array handle. 
* * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). * * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. * * - ::flags must be set to zero. * * * The ::CUDA_TEXTURE_DESC struct is defined as * \code typedef struct CUDA_TEXTURE_DESC_st { CUaddress_mode addressMode[3]; CUfilter_mode filterMode; unsigned int flags; unsigned int maxAnisotropy; CUfilter_mode mipmapFilterMode; float mipmapLevelBias; float minMipmapLevelClamp; float maxMipmapLevelClamp; } CUDA_TEXTURE_DESC; * \endcode * where * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: * \code typedef enum CUaddress_mode_enum { CU_TR_ADDRESS_MODE_WRAP = 0, CU_TR_ADDRESS_MODE_CLAMP = 1, CU_TR_ADDRESS_MODE_MIRROR = 2, CU_TR_ADDRESS_MODE_BORDER = 3 } CUaddress_mode; * \endcode * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. * * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: * \code typedef enum CUfilter_mode_enum { CU_TR_FILTER_MODE_POINT = 0, CU_TR_FILTER_MODE_LINEAR = 1 } CUfilter_mode; * \endcode * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. * * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of * having the texture promote integer data to floating point data in the * range [0, 1]. Note that texture with 32-bit integer format would not be * promoted, regardless of whether or not this flag is specified. * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior * of having the texture coordinates range from [0, Dim) where Dim is the * width or height of the CUDA array. 
Instead, the texture coordinates * [0, 1.0) reference the entire breadth of the array dimension; Note that * for CUDA mipmapped arrays, this flag has to be set. * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear * filtering optimizations. Trilinear optimizations improve texture filtering * performance by allowing bilinear filtering on textures in scenarios where * it can closely approximate the expected results. * - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. * This flag can only be specified if the underlying resource is a CUDA array * or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP. * When seamless cube map filtering is enabled, texture address modes specified * by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode * is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP * will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is * set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed * when sampling along the cube face borders. * * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be * clamped to the range [1,16]. * * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. * * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. * * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. * * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. * * * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as * \code typedef struct CUDA_RESOURCE_VIEW_DESC_st { CUresourceViewFormat format; size_t width; size_t height; size_t depth; unsigned int firstMipmapLevel; unsigned int lastMipmapLevel; unsigned int firstLayer; unsigned int lastLayer; } CUDA_RESOURCE_VIEW_DESC; * \endcode * where: * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base * format but with 4 channels. * * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, * this value has to be equal to that of the original resource. * * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, * this value has to be equal to that of the original resource. * * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. 
This value has to be equal to that of the * original resource. * * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, * then the actual minimum mipmap level clamp will be 3.2. * * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value * has to be zero. * * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. * For non-layered resources, this value has to be zero. * * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, * this value has to be zero. * * * \param pTexObject - Texture object to create * \param pResDesc - Resource descriptor * \param pTexDesc - Texture descriptor * \param pResViewDesc - Resource view descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuTexObjectDestroy, * ::cudaCreateTextureObject */ CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); /** * \brief Destroys a texture object * * Destroys the texture object specified by \p texObject. * * \param texObject - Texture object to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuTexObjectCreate, * ::cudaDestroyTextureObject */ CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); /** * \brief Returns a texture object's resource descriptor * * Returns the resource descriptor for the texture object specified by \p texObject. * * \param pResDesc - Resource descriptor * \param texObject - Texture object * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuTexObjectCreate, * ::cudaGetTextureObjectResourceDesc, */ CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); /** * \brief Returns a texture object's texture descriptor * * Returns the texture descriptor for the texture object specified by \p texObject. * * \param pTexDesc - Texture descriptor * \param texObject - Texture object * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuTexObjectCreate, * ::cudaGetTextureObjectTextureDesc */ CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); /** * \brief Returns a texture object's resource view descriptor * * Returns the resource view descriptor for the texture object specified by \p texObject. * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. 
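 *
 * Putting the descriptors above together, a texture object over a linear
 * buffer of \p N floats might be created as in the following sketch (\p dptr
 * and \p N are assumed to exist already; error checking is omitted):
 * \code
    CUDA_RESOURCE_DESC resDesc;
    CUDA_TEXTURE_DESC texDesc;
    CUtexObject texObj;

    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType                = CU_RESOURCE_TYPE_LINEAR;
    resDesc.res.linear.devPtr      = dptr;                // suitably aligned device allocation
    resDesc.res.linear.format      = CU_AD_FORMAT_FLOAT;
    resDesc.res.linear.numChannels = 1;
    resDesc.res.linear.sizeInBytes = N * sizeof(float);

    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.filterMode = CU_TR_FILTER_MODE_POINT;         // address modes are ignored for linear resources
    texDesc.flags      = 0;                               // unnormalized coordinates

    cuTexObjectCreate(&texObj, &resDesc, &texDesc, NULL); // no resource view for linear memory
    // ... sample texObj from kernels ...
    cuTexObjectDestroy(texObj);
 * \endcode
 * Surface objects follow the same pattern: a ::CUDA_RESOURCE_DESC with
 * ::CU_RESOURCE_TYPE_ARRAY is filled in and passed to ::cuSurfObjectCreate().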
* * \param pResViewDesc - Resource view descriptor * \param texObject - Texture object * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuTexObjectCreate, * ::cudaGetTextureObjectResourceViewDesc */ CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); /** @} */ /* END CUDA_TEXOBJECT */ /** * \defgroup CUDA_SURFOBJECT Surface Object Management * * ___MANBRIEF___ surface object management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the surface object management functions of the * low-level CUDA driver application programming interface. The surface * object API is only supported on devices of compute capability 3.0 or higher. * * @{ */ /** * \brief Creates a surface object * * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. * * Surface objects are only supported on devices of compute capability 3.0 or higher. * Additionally, a surface object is an opaque value, and, as such, should only be * accessed through CUDA API calls. * * \param pSurfObject - Surface object to create * \param pResDesc - Resource descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuSurfObjectDestroy, * ::cudaCreateSurfaceObject */ CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); /** * \brief Destroys a surface object * * Destroys the surface object specified by \p surfObject. * * \param surfObject - Surface object to destroy * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuSurfObjectCreate, * ::cudaDestroySurfaceObject */ CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); /** * \brief Returns a surface object's resource descriptor * * Returns the resource descriptor for the surface object specified by \p surfObject. * * \param pResDesc - Resource descriptor * \param surfObject - Surface object * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa * ::cuSurfObjectCreate, * ::cudaGetSurfaceObjectResourceDesc */ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); /** @} */ /* END CUDA_SURFOBJECT */ /** * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access * * ___MANBRIEF___ direct peer context memory access functions of the low-level * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the direct peer context memory access functions * of the low-level CUDA driver application programming interface. * * @{ */ /** * \brief Queries if a device may directly access a peer device's memory. * * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of * directly accessing memory from contexts on \p peerDev and 0 otherwise. 
* If direct access of \p peerDev from \p dev is possible, then access may be * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). * * \param canAccessPeer - Returned access capability * \param dev - Device from which allocations on \p peerDev are to * be directly accessed. * \param peerDev - Device on which the allocations to be directly accessed * by \p dev reside. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa * ::cuCtxEnablePeerAccess, * ::cuCtxDisablePeerAccess, * ::cudaDeviceCanAccessPeer */ CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); /** * \brief Enables direct access to memory allocations in a peer context. * * If both the current context and \p peerContext are on devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same * major compute capability, then on success all allocations from \p peerContext will * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional * details. * * Note that access granted by this call is unidirectional and that in order to access * memory from the current context in \p peerContext, a separate symmetric call * to ::cuCtxEnablePeerAccess() is required. * * Note that there are both device-wide and system-wide limitations per system * configuration, as noted in the CUDA Programming Guide under the section * "Peer-to-Peer Memory Access". * * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates * that the ::CUdevice of the current context cannot directly access memory * from the ::CUdevice of \p peerContext. * * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of * \p peerContext from the current context has already been enabled. * * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible * because hardware resources required for peer access have been exhausted. * * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext * is not a valid context, or if the current context is \p peerContext. * * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. * * \param peerContext - Peer context to enable direct access to from the current context * \param Flags - Reserved for future use and must be set to 0 * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, * ::CUDA_ERROR_TOO_MANY_PEERS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::cuDeviceCanAccessPeer, * ::cuCtxDisablePeerAccess, * ::cudaDeviceEnablePeerAccess */ CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); /** * \brief Disables direct access to memory allocations in a peer context and * unregisters any registered allocations. * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has * not yet been enabled from \p peerContext to the current context. * * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if * \p peerContext is not a valid context. 
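 *
 * A typical enable/disable sequence looks roughly like the sketch below,
 * where \p dev0 is the device of the current context and \p peerCtx is a
 * context created on \p dev1 (both placeholders; error checking omitted):
 * \code
    int canAccess = 0;
    cuDeviceCanAccessPeer(&canAccess, dev0, dev1);
    if (canAccess) {
        cuCtxEnablePeerAccess(peerCtx, 0);  // current context may now access peerCtx allocations
        // ... launch work that reads or writes memory owned by peerCtx ...
        cuCtxDisablePeerAccess(peerCtx);    // revoke the access granted above
    }
 * \endcode
 * Access is unidirectional; the reverse direction requires a symmetric
 * ::cuCtxEnablePeerAccess() call made while \p peerCtx is current.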
* * \param peerContext - Peer context to disable direct access to * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, * ::CUDA_ERROR_INVALID_CONTEXT, * \notefnerr * * \sa * ::cuDeviceCanAccessPeer, * ::cuCtxEnablePeerAccess, * ::cudaDeviceDisablePeerAccess */ CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); /** * \brief Queries attributes of the link between two devices. * * Returns in \p *value the value of the requested attribute \p attrib of the * link between \p srcDevice and \p dstDevice. The supported attributes are: * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the * performance of the link between two devices. * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over * the link are supported. * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can * be accessed over the link. * * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid * or if they represent the same device. * * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is * a null pointer. * * \param value - Returned value of the requested attribute * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. * \param srcDevice - The source device of the target link. * \param dstDevice - The destination device of the target link. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa * ::cuCtxEnablePeerAccess, * ::cuCtxDisablePeerAccess, * ::cuDeviceCanAccessPeer, * ::cudaDeviceGetP2PAttribute */ CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); /** @} */ /* END CUDA_PEER_ACCESS */ /** * \defgroup CUDA_GRAPHICS Graphics Interoperability * * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the graphics interoperability functions of the * low-level CUDA driver application programming interface. * * @{ */ /** * \brief Unregisters a graphics resource for access by CUDA * * Unregisters the graphics resource \p resource so it is not accessible by * CUDA unless registered again. * * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is * returned. * * \param resource - Resource to unregister * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa * ::cuGraphicsD3D9RegisterResource, * ::cuGraphicsD3D10RegisterResource, * ::cuGraphicsD3D11RegisterResource, * ::cuGraphicsGLRegisterBuffer, * ::cuGraphicsGLRegisterImage, * ::cudaGraphicsUnregisterResource */ CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); /** * \brief Get an array through which to access a subresource of a mapped graphics resource. * * Returns in \p *pArray an array through which the subresource of the mapped * graphics resource \p resource which corresponds to array index \p arrayIndex * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may * change every time that \p resource is mapped. 
* * If \p resource is not a texture then it cannot be accessed via an array and * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. * If \p arrayIndex is not a valid array index for \p resource then * ::CUDA_ERROR_INVALID_VALUE is returned. * If \p mipLevel is not a valid mipmap level for \p resource then * ::CUDA_ERROR_INVALID_VALUE is returned. * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. * * \param pArray - Returned array through which a subresource of \p resource may be accessed * \param resource - Mapped resource to access * \param arrayIndex - Array index for array textures or cubemap face * index as defined by ::CUarray_cubemap_face for * cubemap textures for the subresource to access * \param mipLevel - Mipmap level for the subresource to access * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY * \notefnerr * * \sa * ::cuGraphicsResourceGetMappedPointer, * ::cudaGraphicsSubResourceGetMappedArray */ CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); /** * \brief Get a mipmapped array through which to access a mapped graphics resource. * * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics * resource \p resource. The value set in \p *pMipmappedArray may change every time * that \p resource is mapped. * * If \p resource is not a texture then it cannot be accessed via a mipmapped array and * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. * * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed * \param resource - Mapped resource to access * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY * \notefnerr * * \sa * ::cuGraphicsResourceGetMappedPointer, * ::cudaGraphicsResourceGetMappedMipmappedArray */ CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); /** * \brief Get a device pointer through which to access a mapped graphics resource. * * Returns in \p *pDevPtr a pointer through which the mapped graphics resource * \p resource may be accessed. * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. * The value set in \p pPointer may change every time that \p resource is mapped. * * If \p resource is not a buffer then it cannot be accessed via a pointer and * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. 
* * * \param pDevPtr - Returned pointer through which \p resource may be accessed * \param pSize - Returned size of the buffer accessible starting at \p *pPointer * \param resource - Mapped resource to access * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER * \notefnerr * * \sa * ::cuGraphicsMapResources, * ::cuGraphicsSubResourceGetMappedArray, * ::cudaGraphicsResourceGetMappedPointer */ CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); /** * \brief Set usage flags for mapping a graphics resource * * Set \p flags for mapping the graphics resource \p resource. * * Changes to \p flags will take effect the next time \p resource is mapped. * The \p flags argument may be any of the following: * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this * resource will be used. It is therefore assumed that this resource will be * read from and written to by CUDA kernels. This is the default value. * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which * access this resource will not write to this resource. * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels * which access this resource will not read from this resource and will * write over the entire contents of the resource, so none of the data * previously stored in the resource will be preserved. * * If \p resource is presently mapped for access by CUDA then * ::CUDA_ERROR_ALREADY_MAPPED is returned. * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. * * \param resource - Registered resource to set flags for * \param flags - Parameters for resource mapping * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ALREADY_MAPPED * \notefnerr * * \sa * ::cuGraphicsMapResources, * ::cudaGraphicsResourceSetMapFlags */ CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); /** * \brief Map graphics resources for access by CUDA * * Maps the \p count graphics resources in \p resources for access by CUDA. * * The resources in \p resources may be accessed by CUDA until they * are unmapped. The graphics API from which \p resources were registered * should not access any resources while they are mapped by CUDA. If an * application does so, the results are undefined. * * This function provides the synchronization guarantee that any graphics calls * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA * work issued in \p stream begins. * * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. 
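 *
 * The usual access pattern is map, query, use, unmap, as in this sketch
 * (\p bufRes is assumed to be a buffer resource previously registered through
 * one of the graphics-API registration functions such as
 * ::cuGraphicsGLRegisterBuffer; error checking omitted):
 * \code
    CUdeviceptr devPtr;
    size_t size;

    cuGraphicsMapResources(1, &bufRes, 0);                      // graphics API must not touch bufRes now
    cuGraphicsResourceGetMappedPointer(&devPtr, &size, bufRes); // valid only while mapped
    // ... read/write [devPtr, devPtr + size) from CUDA kernels or memcpys ...
    cuGraphicsUnmapResources(1, &bufRes, 0);                    // hand the resource back to the graphics API
 * \endcode
 * For texture resources, ::cuGraphicsSubResourceGetMappedArray() is used in
 * place of the pointer query while the resource is mapped.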
* * \param count - Number of resources to map * \param resources - Resources to map for CUDA usage * \param hStream - Stream with which to synchronize * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ALREADY_MAPPED, * ::CUDA_ERROR_UNKNOWN * \note_null_stream * \notefnerr * * \sa * ::cuGraphicsResourceGetMappedPointer, * ::cuGraphicsSubResourceGetMappedArray, * ::cuGraphicsUnmapResources, * ::cudaGraphicsMapResources */ CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); /** * \brief Unmap graphics resources. * * Unmaps the \p count graphics resources in \p resources. * * Once unmapped, the resources in \p resources may not be accessed by CUDA * until they are mapped again. * * This function provides the synchronization guarantee that any CUDA work issued * in \p stream before ::cuGraphicsUnmapResources() will complete before any * subsequently issued graphics work begins. * * * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. * * \param count - Number of resources to unmap * \param resources - Resources to unmap * \param hStream - Stream with which to synchronize * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_UNKNOWN * \note_null_stream * \notefnerr * * \sa * ::cuGraphicsMapResources, * ::cudaGraphicsUnmapResources */ CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); /** @} */ /* END CUDA_GRAPHICS */ /** * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access * * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the driver entry point access functions of the low-level CUDA * driver application programming interface. * * @{ */ /** * \brief Returns the requested driver API function pointer * * Returns in \p **pfn the address of the CUDA driver function for the requested * CUDA version and flags. * * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 * should be specified as 11020. For a requested driver symbol, if the specified * CUDA version is greater than or equal to the CUDA version in which the driver symbol * was introduced, this API will return the function pointer to the corresponding * versioned function. * * The pointer returned by the API should be cast to a function pointer matching the * requested driver function's definition in the API header file. The function pointer * typedef can be picked up from the corresponding typedefs header file. For example, * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. * * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not * supported on the platform, no ABI compatible driver function exists for the specified * \p cudaVersion or if the driver symbol is invalid. * * The requested flags can be: * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. 
This is equivalent to * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols * that match the requested driver symbol name except the corresponding per-thread versions. * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all * driver symbols that match the requested driver symbol name including the per-thread * versions. If a per-thread version is not found, the API will return the legacy version * of the driver function. * * \param symbol - The base name of the driver API function to look for. As an example, * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. * \param pfn - Location to return the function pointer to the requested driver function * \param cudaVersion - The CUDA version to look for the requested driver symbol * \param flags - Flags to specify search options. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_NOT_FOUND * \note_version_mixing * * \sa * ::cudaGetDriverEntryPoint */ CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); /** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); /** * CUDA API versioning support */ #if defined(__CUDA_API_VERSION_INTERNAL) #undef cuMemHostRegister #undef cuGraphicsResourceSetMapFlags #undef cuLinkCreate #undef cuLinkAddData #undef cuLinkAddFile #undef cuDeviceTotalMem #undef cuCtxCreate #undef cuModuleGetGlobal #undef cuMemGetInfo #undef cuMemAlloc #undef cuMemAllocPitch #undef cuMemFree #undef cuMemGetAddressRange #undef cuMemAllocHost #undef cuMemHostGetDevicePointer #undef cuMemcpyHtoD #undef cuMemcpyDtoH #undef cuMemcpyDtoD #undef cuMemcpyDtoA #undef cuMemcpyAtoD #undef cuMemcpyHtoA #undef cuMemcpyAtoH #undef cuMemcpyAtoA #undef cuMemcpyHtoAAsync #undef cuMemcpyAtoHAsync #undef cuMemcpy2D #undef cuMemcpy2DUnaligned #undef cuMemcpy3D #undef cuMemcpyHtoDAsync #undef cuMemcpyDtoHAsync #undef cuMemcpyDtoDAsync #undef cuMemcpy2DAsync #undef cuMemcpy3DAsync #undef cuMemsetD8 #undef cuMemsetD16 #undef cuMemsetD32 #undef cuMemsetD2D8 #undef cuMemsetD2D16 #undef cuMemsetD2D32 #undef cuArrayCreate #undef cuArrayGetDescriptor #undef cuArray3DCreate #undef cuArray3DGetDescriptor #undef cuTexRefSetAddress #undef cuTexRefSetAddress2D #undef cuTexRefGetAddress #undef cuGraphicsResourceGetMappedPointer #undef cuCtxDestroy #undef cuCtxPopCurrent #undef cuCtxPushCurrent #undef cuStreamDestroy #undef cuEventDestroy #undef cuMemcpy #undef cuMemcpyAsync #undef cuMemcpyPeer #undef cuMemcpyPeerAsync #undef cuMemcpy3DPeer #undef cuMemcpy3DPeerAsync #undef cuMemsetD8Async #undef cuMemsetD16Async #undef cuMemsetD32Async #undef cuMemsetD2D8Async #undef cuMemsetD2D16Async #undef cuMemsetD2D32Async #undef cuStreamGetPriority #undef cuStreamGetFlags #undef cuStreamGetCtx #undef cuStreamWaitEvent #undef cuStreamAddCallback #undef cuStreamAttachMemAsync #undef cuStreamQuery #undef cuStreamSynchronize #undef cuEventRecord #undef cuEventRecordWithFlags #undef cuLaunchKernel #undef cuLaunchHostFunc #undef cuGraphicsMapResources #undef cuGraphicsUnmapResources #undef cuStreamWriteValue32 
#undef cuStreamWaitValue32 #undef cuStreamWriteValue64 #undef cuStreamWaitValue64 #undef cuStreamBatchMemOp #undef cuMemPrefetchAsync #undef cuLaunchCooperativeKernel #undef cuSignalExternalSemaphoresAsync #undef cuWaitExternalSemaphoresAsync #undef cuStreamBeginCapture #undef cuStreamEndCapture #undef cuStreamIsCapturing #undef cuStreamGetCaptureInfo #undef cuStreamGetCaptureInfo_v2 #undef cuGraphUpload #undef cuGraphLaunch #undef cuDevicePrimaryCtxRelease #undef cuDevicePrimaryCtxReset #undef cuDevicePrimaryCtxSetFlags #undef cuIpcOpenMemHandle #undef cuStreamCopyAttributes #undef cuStreamSetAttribute #undef cuStreamGetAttribute #undef cuGraphInstantiate #undef cuMemMapArrayAsync #undef cuMemFreeAsync #undef cuMemAllocAsync #undef cuMemAllocFromPoolAsync #undef cuStreamUpdateCaptureDependencies CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues); CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues); CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); typedef unsigned int CUdeviceptr_v1; typedef struct CUDA_MEMCPY2D_v1_st { unsigned int srcXInBytes; /**< Source X in bytes */ unsigned int srcY; /**< Source Y */ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ const void *srcHost; /**< Source host pointer */ CUdeviceptr_v1 srcDevice; /**< Source device pointer */ CUarray srcArray; /**< Source array reference */ unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ unsigned int dstXInBytes; /**< Destination X in bytes */ unsigned int dstY; /**< Destination Y */ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ void *dstHost; /**< Destination host pointer */ CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ CUarray dstArray; /**< Destination array reference */ unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ unsigned int Height; /**< Height of 2D memory copy */ } CUDA_MEMCPY2D_v1; typedef struct CUDA_MEMCPY3D_v1_st { unsigned int srcXInBytes; /**< Source X in bytes */ unsigned int srcY; /**< Source Y */ unsigned int srcZ; /**< Source Z */ unsigned int srcLOD; /**< Source LOD */ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ const void *srcHost; /**< Source host pointer */ CUdeviceptr_v1 srcDevice; /**< Source device pointer */ CUarray srcArray; /**< Source array reference */ void *reserved0; /**< Must be NULL */ unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ unsigned int dstXInBytes; /**< Destination X in bytes */ unsigned int dstY; /**< Destination Y */ unsigned int dstZ; /**< Destination Z */ unsigned int dstLOD; /**< Destination LOD */ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ void *dstHost; /**< Destination host 
pointer */ CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ CUarray dstArray; /**< Destination array reference */ void *reserved1; /**< Must be NULL */ unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ unsigned int Height; /**< Height of 3D memory copy */ unsigned int Depth; /**< Depth of 3D memory copy */ } CUDA_MEMCPY3D_v1; typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st { unsigned int Width; /**< Width of array */ unsigned int Height; /**< Height of array */ CUarray_format Format; /**< Array format */ unsigned int NumChannels; /**< Channels per array element */ } CUDA_ARRAY_DESCRIPTOR_v1; typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st { unsigned int Width; /**< Width of 3D array */ unsigned int Height; /**< Height of 3D array */ unsigned int Depth; /**< Depth of 3D array */ CUarray_format Format; /**< Array format */ unsigned int NumChannels; /**< Channels per array element */ unsigned int Flags; /**< Flags */ } CUDA_ARRAY3D_DESCRIPTOR_v1; CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); 
CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); CUresult CUDAAPI cuStreamDestroy(CUstream hStream); CUresult CUDAAPI cuEventDestroy(CUevent hEvent); CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpy2D_v2(const 
CUDA_MEMCPY2D *pCopy); CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); CUresult CUDAAPI cuStreamQuery(CUstream hStream); CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, 
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, 
CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) {
    const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM|
                                 CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM);
    if ((flags & procAddressMask) == 0) {
        flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM;
    }
    return cuGetProcAddress(symbol, funcPtr, driverVersion, flags);
}
#define cuGetProcAddress cuGetProcAddress_ptsz
#endif

#ifdef __cplusplus
}
#endif

#if defined(__GNUC__)
#if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
#pragma GCC visibility pop
#endif
#endif

#undef __CUDA_DEPRECATED

#endif /* __cuda_cuda_h__ */
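The launch and synchronization entry points declared above are all that is needed to drive a compiled kernel from host code. The following is a minimal, illustrative sketch (not part of the archive): the helper name launch_vector_kernel is hypothetical, the CUfunction handle is assumed to have been obtained elsewhere (for example with cuModuleGetFunction, which is not shown in this excerpt), and a CUDA context is assumed to be current.

/* Illustrative sketch: launch a kernel through the driver-API declarations above.
 * Error handling is reduced to returning the first failing CUresult. */
#include <cuda.h>
#include <stddef.h>

static CUresult launch_vector_kernel(CUfunction f, CUstream stream,
                                     CUdeviceptr d_out, CUdeviceptr d_in, int n)
{
    /* Kernel arguments are passed as an array of pointers to each argument. */
    void *params[] = { &d_out, &d_in, &n };

    unsigned int block = 256;
    unsigned int grid  = ((unsigned int)n + block - 1) / block;

    CUresult err = cuLaunchKernel(f,
                                  grid, 1, 1,   /* grid dimensions          */
                                  block, 1, 1,  /* block dimensions         */
                                  0,            /* dynamic shared memory    */
                                  stream,
                                  params,       /* kernelParams             */
                                  NULL);        /* extra                    */
    if (err != CUDA_SUCCESS)
        return err;

    /* Block until all work queued on the stream has completed. */
    return cuStreamSynchronize(stream);
}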
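The stream-capture and graph declarations above (cuStreamBeginCapture_v2, cuStreamEndCapture, cuGraphInstantiate, cuGraphLaunch) compose into a record-once / replay-many pattern. The sketch below is illustrative only: the helper name replay_memset_as_graph is hypothetical, stream is assumed to be a non-default stream, d_buf is assumed to hold n 32-bit words of device memory, and error checking is omitted for brevity.

/* Illustrative sketch: capture stream work into a graph and replay it. */
#include <cuda.h>
#include <stddef.h>

static CUresult replay_memset_as_graph(CUstream stream, CUdeviceptr d_buf, size_t n)
{
    CUgraph graph;
    CUgraphExec exec;
    CUgraphNode error_node;
    char log[256];

    /* Everything issued on the stream between Begin/EndCapture is recorded,
     * not executed. */
    cuStreamBeginCapture_v2(stream, CU_STREAM_CAPTURE_MODE_GLOBAL);
    cuMemsetD32Async(d_buf, 0xdeadbeefu, n, stream);
    cuStreamEndCapture(stream, &graph);

    /* Instantiate with the five-argument form declared above. */
    cuGraphInstantiate(&exec, graph, &error_node, log, sizeof(log));

    /* The instantiated graph can now be launched repeatedly at low cost. */
    cuGraphLaunch(exec, stream);
    return cuStreamSynchronize(stream);
}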
triton-2.0.0/python/triton/third_party/cuda/lib/000077500000000000000000000000001440023377100217045ustar00rootroot00000000000000triton-2.0.0/python/triton/third_party/cuda/lib/libdevice.10.bc000077500000000000000000016352001440023377100243710ustar00rootroot00000000000000
[libdevice.10.bc is a binary LLVM bitcode file; its raw payload is not representable as text and is omitted here]
fiA0@ f i@@  Yzyրdn6Yzʧ`Ġ@@   `)184נfY|184fY|{i *8 z*d`f isA|%ু NVHاY~\L*)  'A `F*8* `6 bĠ@@   `@*1848Aa F  NP(䊘% 'f D@0KO9ek 8lĠ@@   `P*1(0C!^v0e0 F %XA0>0f VjĠ@@ z  `!T*1(.d`F @%X1)夈T2 T`AVp@gJM Tp.T@JM TJM Tp4/Tp2N7pX'HPTĈA ,  7,KT0bP + ,LA1KSRK 7- 1%SA-H9.HI/HI0,L TpFLL TpFLL TpFLL TpNLq0eV(!n7Y`*8cn@8Y|60f ljjJY2uT%@ BOo'IS2?LD_02ջ #>#>ḿ0#>#>W8?LD_02ռ8~#DD  >q!SmVQHQHFL>#>fL>#>צj@tm`D!QO[umOQ\45 0ąLnO׹ >]j@tMuo O4}#DD  >q!Sm]7/?LD_02ն7o#>#u_=B0}זk@tB5}۫m O״n O4~#DD  >q!#>#mߖm@]6nOe#DD #NQH>q!nOۺ>mvo@l@H3Q,O\Ȅ[BuMdH3Q,O\t>>]ӵ.>]׶/>]׷>m ABuM Q2H߶Pt]'LSC-ąL>#>bߺe/>]׻)/>]׸=/>]#DD #NQH>q!9mp#DD  >q!S# H3Q,O\]x@Mw}#DD  >q!S#'uSąL>#>bO%x45 0ąLԺ}#DD  >q!S#>#6[wnO߶eB>}z@ݛ-mO5o OuoOߺ>}G?LD_02մ䵵/>]۶q.>]ӽ/>]۴/>]۵/>]ۺ/>]M h45 0ąL6oOv~c7_0GO\#>#PtMz@4NxmOLTR2N@ML6LąL>#u(}7՟ԅO\dOlTP5>q!>#N%x~7_0GO\# A47 OӷUSHO\#>#a$SMF ԾmH#uh45 0ąL5mH#u}">f A4[#8n On O6oH#uYtT8d#GO\#>bh45 0ąL5#:m O6nO紵">F@tnH3Q,O\Ȥ">V@t2ynOm#>@t[*C+C,C&CMCm_)H3Q,O\T>bۻ#>8?LD_02սxn Ovo OnO7o Owo Owm O8n OmOx}#DD  >q!H֎@N4H3Q,O\#>b߽C O[`Ā#>Ə@~<zm O-&>@[e:#DD  >q!S#n O l45 0ąL#ݟ?LD?EG@ 48LO\#>#u8C53?LD?EG@ 48LO\#>#zo Oxo O׸m Om Oym OxnOmOnH#u3>]㴩7>]I7>]7>]紅7>]i&>F@^}Cu_H3Q,O\ȴ&>7I?LD_02շvm O׹mρ{;45Ob1P„8ͯQC,>q!HH~}lH3Q,!%L 527o Oax45 0ąLԺ~&#DD  >q!SMx45 0ąLxoD&>@np;oρ{g,@SHO\#M@ >q!ش~#DD  >q!S]HnCӵ^{oO3$H#d׶7>]>>PtPt&PtFPtv@4Os|m O>>f@4|nOy'>F@4[z|o Oyn HD46oO:7_0GO\#Ul45 0ąL8>#>#~m:>]:>] 8!Q#>#>bOznO~#DD  >q!#AM@ >q!=C4MvC5yH3Q,O\T>@tO|}} #DD  >q!#,?LD_02Hmw>?LD_02M@to"*>@tm O`45 0ąLu#@t[}#DD  >q!#v]@H3Q,O\Ȅ_C4MMӽnO׺}#DD #NQH>q!"<߰d>#>>}#DD  >q!#~I#DD  >q!MȵdT5>q!U \T5>q!Ȏ?LD?EG@ 48LO\#>#;}7_,0H3Q2~H3Q,O\ȝPT$ATąL>ǂ?LD_02նsH3Q,O\T>/?LD_02ո8}#DD  >q!#>#~ߗM#O\#>#N?LD_02>#>bYl45 0ąL5#mA4M5o_M PQ2>#>r7/ATąL8S45>q!~7_0GO\#ȮߝML6LąL>#*?LD_02Ⱦ H3Q,O\T>r7_,0H3Q2}V DDSF DDSƨ H1IT$ATąL>b N#>#>RӺO_fH3Q,O\Ȕd45 0ąL%H#t#DD  >q!z}#DD  >q!8~#DD  >q!S#MH3Q,O\Ȕ]۷8PSES2HJPSES2m9S4E2HKS4E2}w?LD_02ոM PQ2>#>RIh45 0ąL#y~7_TąL>#>blT5>q!S#>#m^M PQ2M@ >q!ymOi0nO1mOfi0nO6 14oO_@H3Q,O\TΛ4BOBؼ_[ӹ9+$ l45 0ąLu#5^_&H3Q,O\T>b`45 0ąLu#G+?LD_02>#>S45>q!eđ Lma45 0ąL>#H>#>#d45 0ąLaaT5>q!#>#9SąL>#>R-PT?"!MQąL>ߦ@>#>#5}.ܸOȍ.ܸoȍ*4H`45 0ąL#(45Ob1P„8ͯQC,>q!^߻@M#O\#H3Q,O\ȤH}ח??LD_02>bhT5>q!w}#DD  >q!SH#AM#O\#M[7M>m ԸO۰8>#>#5g?LD_02QF ז, o?L0>mH#6MQ68#>#>>m _[Ex45 0ąL:~#DD  >q!#>#6_\?LD?EG@ 4BD ąL>#?LD_02ջس2E+,ع_|M PQ2>#>?LD_02>/?LD_02պԸ}#DD  >q!S#>#I?LD_02պ6n<&H}-*8#>#>R>}#DD  >q!60tX@?LD_02Y;s >#>#6L>#>Rӽ_gtFH*8#>#>R~}7՟ЅO\dM68#>#>~}#DD  >q!SMH}WH?LD_02մ~#DD  >q!S#9}`H3Q,HS1O\#}C#DD  >q!Sah45 0ąLu#n M#>#>b>}#DD  >q!#mH3Q,O\ȴ]*?LD_02>#>w9SąL9}7_0@GO\d}ֹd>#>I pT8d#GO\#>6p >#>m#>g?LD_02սH3Q,O\TM,?LD_02>Rۺ3<#>#>RӶOk@H3Q,O\d~M PQ2>#{n #>RO#=׷p?LD_02>r#DD  >q!SHI?LD_02ջvn C >mL#>#NYT5>q!#>r#DD  >q!SHߺH3Q,O\T>R`T5>q!HH?LD_02յ5#DD  >q!#>#(?LD_02#DD SD 0!NKc=tąL>#>r07ՏHHS>q!H3Q,O\de`T5>q!VPD>FPD>Ӈ`?LD_02\TR2H3Q,O\H}&Q$H;SąLH7SąLu4TB>q!6(H]>}#DD  >q!#vk@H3Q,O\tlԵ,H>}#DD  >q!#>#*H3Q,O\d>bx45 0ąL8oRh45 0ąLt#DD  >q!#Y SHO\d@TR>q!S&D>#>#>m #>#>R>}7_0GO\#>#z7ՏHHS>q!輍64H41[-CHؖN&n0Oe1 2m1-Ce ` Hޖ 2y0q!1 @lRXP pGa(,C`p ‚!BQX0 a8$Ppn(,0A4@0 CaA" 0BB~`  !z>X@ q  Q$1 8`A! `!`[ r1-  o Cۂ!m`T[0- g Cق!le`0[0- c 8 (Ii Xl$aC-!0aŁ"izFlAPƁ$ʢ_  0$QZ FA$Ȣׂ&0 "Ae40Q (@ 8*jQiɺ2L> ` ̀ \@ q (LPQ R0S@TPU`VpWo0q < HEUaymAPfphjlW & 8Yy(MQ( _-h@8DYjA@Ɓ$ʢW  (YZ0q Ղ&(IŮ4 DaH,uA8HEMQ(K[-h@8DYjA@Ɓ$ʲV 0$QZ0q Ղ&(I40q (` z09`Iѩ b@S JM & O Cʄ!d=a a Jt0@C:uq `@@s8<<>9Ѓ>9́A@cf/7Fr1A81 Ac 0F¼8c.6F  82Z0c / `0F;Bcx4F bHRc=5F@" bc:y6Fnc;>7F zc #`)U ,C #`ЕT #`Е pC, 7 1PB0bP #`2 ,q#` #` aLjA MpCPa0ː( ; ,#`D$p0K TpD"(#`< ,#`44s0K,3PЌU 7  ,A4bp hOĄ@G<4bP ``V0B2HS0bp hOõԈ A?P! 
ӈA t8HH #`ЕJ0bP `W0B2TV0bP [׈A tle\Q pCP ,@sU)1bP `W0B2h[0bp hR)4܈ Kp [#`~ /,C#`551K ,7P 742AB@'8 7 b0 AOł d0bp hSp, Tpc0bP `Z0C2#`b5,0Kp#`bE00KpQ; 7 h Y . YQ.@G#` <,A0bp hVC m0bp hV m0Pk A= 7 n \ 6K#`rEp, Tpo0bP `^0C2ā#`ЉV7K0#`Љ7K0T1bP ``^"1D2A#`},Rv0Pt pCC0px , Tpv0ܐ 2;K 7A= {, Tp 7H }A), TpB|0bP ``_21D2(#`@ ,A(R+0P ˆA ~pCa0 0b` dK H@G0 #`  O,C)B0b` dK0Kp #`@0P ˆA ApC0a0ˀ Ӱ 0b` dPN/@G P2bP `bg1E2B+#`CKˆ x0P ˆA pCa0 ׀,A,,A, Tp+0bP `Pcq1E2,#`5451K@ #`E861K@ ! ,] 7x ` A| RNˆ A|N@GԂ` #`? A_,.B0bp h`_0bp h`_0P ˆA pC a0˰ ,A/щ,A/ Tp.0bP `f1F2/#` 6AYÈ b`0P ˆA pCa0`i1K0#`%FZ@G #`s Al,9C0bp hb,9n1K`AB9m 7   p@0bp hbD,A: Tp90K:7SHO\dӧ$S0 ąL >][sӷ >tӹ] >wӻ >m[xӼ >ݟMԅO\dO۷5mE#>mO]DTR2}\45 0ąLzӵ}'#DD  >q!#7S؅O\#r?LD_02>bO׷>]h@t~6#DD  >q!5nO׼A>]SC-ąLfi@M[vmO۷m>m۶k@6nOۻ>ml@BH3Q,O\TCH3Q,O\t_DH3Q,O\ȄEH3Q,O\ȔYH3Q,O\ȤBH3Q,O\dSH3Q,O\6Q#DD  >q!SMH3Q,O\Twe?LD_02iY45 0ąL}]#DD  >q!SH3Q,O\TYq?LD_02>r$#DD  >q!#r?LD_02>rc#DD  >q!SH3Q,O\ȴȥQ45 0ąLw$SC-ąLET 5>q!1[F!1K pCۂ!LmA@Ɓ$4Aj سCق!lAf !vl6`[0- b Z FJ,`Zp0E{-Ha kA0^ Cւ!dk` A(HZ0_- W ثCՂ!j?`0@:L& 0 C„!ta0aZ0@,L&  8 C„!Ba~0ajф! C,@& 0a X Lt0@C:uq-`@@s8<<>9Ѓ>9́A@c/+A/ ci}/:+0F0~c 7Fjc%70F  c 7F`vc\ˠ7F0j-{c7F~c <8 0F T qAh*/AicxuY묍xM܍ #`)U ,C #`ЕT #`Е pC, 7 1PB0bP #`2 ,q#` #` aLjA MpCPa0ː( ; ,#`D$p0K TpD"(#`< ,#`44s0K,3PЌU 7  ,A4bp hOĄ@G<4bP ``V0B2HS0bp hOõԈ A?P! ӈA t8HH #`ЕJ0bP `W0B2TV0bP [׈A tle\Q pCP ,@sU)1bP `W0B2h[0bp hR)4܈ Kp [#`~ /,C#`551K ,7P 742AB@'8 7 b0 AOł d0bp hSp, Tpc0bP `Z0C2#`b5,0Kp#`bE00KpQ; 7 h Y . YQ.@G#` <,A0bp hVC m0bp hV m0Pk A= 7 n \ 6K#`rEp, Tpo0bP `^0C2ā#`ЉV7K0#`Љ7K0T1bP ``^"1D2A#`},Rv0Pt pCC0px , Tpv0ܐ 2;K 7A= {, Tp 7H }A), TpB|0bP ``_21D2(#`@ ,A(R+0P ˆA ~pCa0 0b` dK H@G0 #`  O,C)B0b` dK0Kp #`@0P ˆA ApC0a0ˀ Ӱ 0b` dPN/@G VZCp,9T[[ 7n y0bp hp^3A,+ Tp+0bP `Pcq1E2,#`5 51K #`E$61K  ,] 7x 0 A| MNˆ A|N@GȂ0 #`? A_,C-B0bp h`_0bp h`_0P ˆA pC a0ˀ p,.,. Tp.0bP `f1F2/#` 6pe1K #` Ftf1K  /j 7 A a0bp hPb,8 Tp/0bP `0g1F28#`*6A[È g0P Ci !#`zf n,9C0bp hcA 0bp hcQ0P ,9 Tp 9d7SHO\dOVo0mӧ$S0 ąLI >[sӷ] >ӹ >w@T?q=L]ąLn0toOAO۷5mO]DTR2}\45 0ąLzӵ}'#DD  >q!#7S؅O\#r?LD_02>bO׷>]fh@tH3Q,O\#_@tunO׼A>]SC-ąLfi@MۦvmO۷U>m۶k@6nOۻ>mP?LD_02շP?LD_02P?LD_02Q?LD_02GQ?LD_02m@XH3Q,O\ȤH3Q,O\Te?LD_02QY45 0ąL}W#DD  >q!S}H3Q,O\Te?LD_02չeq\45 0ąLH3Q,O\tȩ\45 0ąLH3Q,O\Ts?LD_02>rf#DD  >q!ITP5>q!~D7_GO\1[F!1R Cۂ!m@` @eazLm{`;[0@-(`l!- c XC؂!BlA_ `(CI% 0_ ha@ y-(`k`Z0l-,0q Z  W CՂ!jz`Zp0\-8` ­ S @809L`&  C„!na0aX0+L@& p  C„!`Ԅ! C8X& L`a 0g #t0@C:uq'3>A^ c`:e/`ֵލ+þ0Fp&y=0F v(cۛ7F  AB#`4 7U2B0bP <]ÈA t pCp   6ta0PA t#`Q,ARb 2K Tp! tCA0ܰAL7#`*e ,#`T,3bp hNB0pa0`eT 7o S ψ :1K TpD#8 tA0bP `St0A2@Q0bp hNE5H#`dX,4P 7tA0bP `S{0A2LT0bp h`O7T#`dt,A5P@ 7A(tCA0bP ``V0B2XW0bp hO ؈ A?d@`a pBL7A#`m *,C#`$U0K,6P 7+,n,7P Yt x a0AP1Ka0P ZA ^pC` a0 cTb d0bp hRA@G0 a0PfASBBV ZCp, TpBHdA857SHO\t#DD  >q!}#DD  >q!~#DD  >q!S~#DD  >q!~#DD  >q!mOߦ[0tmOGR?LD_026]04nO@?LD_02M@ >q!SmO >[fAXxӺ >!1 @Cׂ!tk`Z0@i-< @Ɓ$W %A !j Q@Ԅ!`ۄ! CT& L01aa q (1F PG N;Ec 7F0 AXn ce/A{۹ cw/䭽T q;N0FP(fc.Z.0FMލߥe]1bP `OpCp,C #` ,0bp hKq ˆA tjTpC ANˆA Hy ,Ca#` ,1bp hKQƈA JpCPa0ˀ$ / (#`D$a0K Tp"$#`- ,#`44d0KЌ,A3PT 7o  ,4bp hNĔ@G83bP `pSr0A2DR0bp hNӈ ;XL RفF%r^lr5{A XpC a0PY >A,5bp hOw`v0Kp TpD%X#`h5 A),#`4Ah#`Dh وA A[pC a0˰q I o݈ Ip@Gl7bP ` W0B2x_0bp hR,Q0K ߈A ]pC@ a0bԐ0K0#`.E)@G #`|u A.,A0bp h0S,0K`AB/ 7 g N i0bp hS),A Tph0bP ``Z0C2#`=5B, AO l0Pj A jpCa0n 0K#`bE-@G@ _P?LD_02P?LD_02Q?LD_02?LDODT,O\dO׆h0L>mO7Q?LD_02^04nO^0mO_04#DD  >q! >[mfh@tMoump7_0@GO\t#DD  >q!S#q?LD_02>r #DD  >q!#7r?LD_02>r&#DD  >q!S#r?LD_02>r,#DD  >q!#\@t}]X45 0ąL>bO׶!>]!1 Cׂ!k`Z0z-` [ Cb0 PZPE4̢$a 4Ղ!j? @rjo0aiЄ! 
C(8& 0a  ?t0@C:uq `@s8<<>9Ѓ>9́A@c 7F   c1f7Fxwc/y.0 c7F@uo6+m= cM7F  tP8P H@c oAB +q ȿa0@Bc2#`" 7` AOb0K0AOc0K0 TpD A00bP #`*e ,Q#`@S ƈ ,1PPT0bP `Sl0A2H0bp hpNj00Ɉ 9$qH#`6% ,#`4H,3bp hNT0ˈA NpCa08 q! >]Ӷ >}M Q25IT5>q!mO\0tnOf]0~37_0GO\T>bO]04oOx5mO׵>]SHO\#SąLCSC-ąL>R[~u~D#DD  >q!#fkP$>@SąLM PQ2>r7_0@GO\ȔyPT5>q!#GBSąLӵnO׻!>]^@t}}5~'7_0@GO\T>r-7_0@GO\T>r67_0@GO\T>bO׽}>m6i@M[vmO۷ >mw?LDO\O+DO\dp9#DD DTW+DO\t7_0@GO\dUPT5>q!#ASąL\M PQ2>r 7_0@GO\#RSąLu#SSąL#1[E %!y!11 - pg @0k"lA0c [P ؂!Fl`Z0}- ^ (Cׂ!vkAP 0Ad[ Cւ"0`Z4̢$qB0@+L@& p ȃ C a o It0@C:uq*`0F y@&n@0rdl@qDq  c  ($:.ߍ?0F{cø/. = c۸7F}zc/6 c  8 d c /  c{7@eZPH@c w c oA #`(E ,C #`Б #`БT pC,A h5AA ALpCa0P ,1b` d>1Kp TpD! 7u H ,2bp hNq@8@G BRrpĈA AOpCa0˰0 = 1KЌA@G,3bP ` V~0B28O0bp hO@#`T,4P( 7  }0K0~0K0 TpD$H#`l *,U#`#5 0K`,5PPU+ 7 Д,A6bp hRc @G\6bP ``W0B2h[0bp hR ܈ AK*p ۈA ^pC a0y L W+|#`2E`0K TpD'x#` /,A0bp hpS b0bp hpSD' b0P` A AipC0a00d0KP#` _,,B0bp hP_0bp hP_0P ˆA pCa0` A_,.A_,. Tp-Pd@d#`jf j,C.B0bp h_Z 0bp h_Z0P BMj%QĈA pCa0 Xh,/l,/ Tp/0bP `pg1F28#`.f0K#`.v1K 8s]#`Q 0bP ` j1G28#`960K@#`9F1K@!8Tq H290PI`,9 Tp9*dFl@LO[1~mT q!1 >}M Q2IT5>q!S}G7_0@GO\dM PQ2sS M1 2O\td7_0@GO\ȔMM Q2չXӶ >M PQ2]0tnO{5mO&]04~A7_0@GO\dOf_@tm}X7/QąL>#@SąLM PQ2>r7_0@GO\Ȕ>]ASąL\ M PQ2>bOh@t[nO׼>]_@t[u~(7_0@GO\T>r^#DD  >q!#>r.7_0@GO\T>r77_0@GO\T>bOmUT >q!=ETPe>q!>mh@MvmOۺ>mfj@vnOۼ>m?LDSąLYl@MNH3Q,?U 52w}7_0@GO\dYPT5>q!#ASąLM PQ2>r 7_0@GO\#RSąLu#SSąL#1[F '!y!1H   Cۂ!|m`Y[@k H Q`>[0-@$0qmaбE B-(`^l*`[0-` `_ Cׂ!kⵠ A(HEiljICւ!dkAZ Z0@i- qEI(aFk0DՂ!j0q B09L &  x C„!Na0aP0L@& 0a d ;1F PG N;Ec  4M;N0FP(fc.Z.0FMލߥ Aciop0FxҼc$,Y0F@<Aj\`d0 n`n`s `@pd{@99q1bP `OpC,C #`@ È p,0P 7A72g 7AW2gT 7c OaȈ g0K Tp!#`IPiĈA AKpCa0ː( 9Pˈ 9a@G$2bP ` Sm0A20M0bp hNΈ A:8͈A MpC@a0@ ; MD#`D8n0K Tp#@#`}##DD DT M?΂ąL[0tmO߶j0L>m\0tnO\0mO\04nOvj >]f^@t]zӵ}&7_0@GO\t#DD  >q!#f_@tH3Q,O\Ȅ>]q?LD_02>bO׼>]h@tnO`?LD_02u>]߇SC-ąLi@Mۤv}2#DD  >q!}5#DD  >q!}8#DD  >q!~;#DD  >q!S~,7_/a>q!U45 0ąLU45 0ąLT45 0ąLP45 0ąLYp?LD_02>r#DD  >q!#q?LD_02>r #DD  >q!#$SC-ąL DT 5>q!1[<!1, XCق!\l)`[ 0q J,N=`ZP ׂ!k`Z0o-  [ hCZ aBA 95aT0)L & ؃ C!`r0a70L & 0@& 0a  >1F PG N;Ec  4M;N0FP(fc.Z.0FMލߥ Aciop0FxҼc$,Y0F@<Aj\`d0 n`n`s `@pd{@99| #yP^#`" 7_2B0b` d >0K0G p,qp,CqA AJpC`a0` ,2b` d0>3K Tp!#`IPlĈA LpCa0ː( 9ˈ 9@G$2bP `PSp0A20M0bp hNΈ ;I8͈A NpCpa0@ A< D#`D8q0K Tp#@#`? ,4#`4Ht0K@1,5P0 7 p,5bp hO@GT5bP `V0B2`Y0bp hRڈ H]h وA [pC a0˰q AI a ݈ AIb @Gl7bP `0W0B2x_0bp hR,q0K ߈A ^pCP a0b԰0K0#`/EA*@G  }#`~ .,A0bp hPS@ AMD1K`ABp(0ˀi@hg0bP `PZ0C2#`0K#` 1K91bP ^#`u A:,CA0bp h0V0,.0K; 7 p Z H/ Z/@G#` <,A0bp hVCu0bp hV#u0Ps A npCa0`w ,1, Tpw0bP `^0C2#`w5\0K#`wE`0KH 7" | _ \; _;@G#`U I,B0bp hZC,A(!0K (J 7, 0 i fP0bp h`Z,) Tp(0bP =pC=0P L1K` P )L 75 ,,A*B@G 0PكP ȈA pCa0˰  0b` dKH@G DSC-ąLM0Q2 >]Ӷ >}۬!LTP5>q!rӹA >[uӻ >͛vӽ>mO״>]ז^@tm{}2#DD  >q!#j@tH3Q,O\Ȅȍt45Q48 2ȱXTHӿ8]ąL>r>#DD  >q!#Gt?LD_02>b#>m#~mO׻>]h@t~AOߋH3Q,O\#_@tۡ5#DD  >q!S#>r#DD  >q!#>r#DD  >q!#>r#DD  >q!#>r#DD  >q!S#>r#DD  >q!#>r #DD  >q!#>bO۴Y>m׶i@mH3Q,O\#>rJ7OD4U2YfA\H3Q,O\t\45 0ąL\H3Q,O\ȴ5ET 5>q!әLT5>q!1[<!1( Cق!l1`[0-@@%a[0~-(k`Z0y- [ Cւ!VkA_-P @ ZP Ղ"A0qAXG BAՂ!rj0aUa  Lt0@C:uq `@Q@c/ 0FO 1<@G䁠#` L,A0bp hZ,0KBM 78  Al+,A(ĖHK@G #`D0bP `b<1D2(#`@ 0b` dK 1K@ ! (TJ2bP ``bb1E2)#` 3Kp #` 4Kp Q )TpĈA pCa0ˀ (0bp h0^@G uMpǞMpǎ\ 7u A<,A+r@G #`W0bP `cy1E2+#`0K #`1K  + 7a!2Ă,wl~5^0P `B#`iV i,,B0bp hbĐˆ 6KP 1 -id1bP `f1F2-#`& ,.B0P Ba T PPltqpA t;vT HKDĈA pCa0ː 1,.XA,. 
TpD.0bP `@j1G2B/#`; ,/B,0P BA q!#>r)7_0@GO\Ȕ܋M PQ2>#WcSąLȝIT 5>q!}7_TąLo0t}7_0@GO\#>bO۷sSąLu#>rA7_0@GO\T># SąLm >A >uӻ >ӹ1>mO״I >ӵ}7_TąL>bO׷>]h@t͛ӵnO׽>]6h@tq]TP5>q!S|5~J#DD DTW+DO\#SC-ąL&i@MۥvmO۶>mj@vnO۷>mVk@ۮvo`Oߴ>}צn@mM PQ2>rX#DD BD ąL>rc7ODPąL5SąL>rM7_0GO\#M PQ2>#gbSąLȱXT5>q!#>r27_0@GO\M PQ2մ\T5>q!SmM PQ2պ-DT5>q!]TTP5>q!Ue45 0ąLȩQ45 0ąL&x`LyO_DM0Q2M@ >q!1[>-]H!1A pC܂ o @y[0@-0` RmA°g ;[P ق!lf`0[0-  c hC؂!FlAP 0A_ ȯCׂ"0`Z04̢$qbE n- b0qV 0:L& 0& L`02aa  t0@C:uq `0Fy@&n@0rdl@qDqz~ ccֺ7Fmc  \c ;8AYVT q  !0F`.~c;ǿ/ԪA^ڵ c+7F > ¿0Fü8tP1F  k0F}H<cj8z0F與^8c&8c80F c   c`7F ݆܍>0Ffzc:ϸ/p9kkA>mYAe_hȳXY:X:gi> c-/jmi c띿/c|*(`0Fock/*Ӽ0Fj-{cƠ7FZ tP1F He#`u ,,C #`U- #`ŕ- pCP , pP ,C A! t0TȈA _pC a0` AMp Ȉ AM @G1bP `0Z0C2$J0bp hS ˈ N /,ʈA jpC`a04 O /8#`?E<0K Tp#4 T8 T2K Tp@Gh8'l^qqa#`E =,C$#`o52K0,4P ա1bP `[0C2PU0bp hPW0,5bp hPW@,5PP? 7! Հ,A6bp hWD@G\6bP `^$1D2h[0bp hWC3܈ _=p ۈA A{pCa0y i =|#`EL0K TpD'x#` L,A0bp hZõb0bp hZb0P` A }pCPa00d,Aс,A Tpd0bP `_81D2#`5H lb@h0Pf A ApCa0ːj`'1K#`EJ@GX 7e l o ln0bp h[D, Tpm0bP `bh1E2#`%11 x́Lq0Po 741 r`ǞMpA ApC0a00t*,AW*,A Tpt0K`PT 7[ wǖO1KpOR1bP `f}1E2#`@D {0b` dO1KT1bP `pf1F2A#`@'E,T ~0P| A[ppC=,B0K {P1bP `g1F2(#`(&C\ˆ !0P ˆA pC@a0P ؀,),) TpD)0bP `g1F2B*#`264y1K #`2F8z1K  B*o 7  R_ˆ _@G UEx 7 A Xhˆ Ah@G #`v Az,C,B0bp h fCW0bp h fg0P ˆA pCa0@  1K` #`gFj@GЂP #`йG0bP ``k1G2.#`mF h0K #`mVi0K q .@ ,n 7l AlĈA pCa0 m 1K #`uVm@G #` A,8C0bp hgr0bp hgD0P ÈA pC`"a00,A9,A9 Tp80bP `n)2H29#`6H1K#`FL1Ka9ț 70  A xÈ Ay@G#`6# A,;C0bp hjw0bp hjD0P ÈA pC#a01K#`F{@G#`!# ,1bp hs,O TpO=1bP `z2K2X#`b7ň 5&a1P?!ňA pC.a0 cݐ#l2K@#`gGH@G0#`/ ,CYE0bp hv,Y#q2KpQY螽 7 h A cB'j1bp hw,Z TpZiQȈA pC/a0˰lݐ&i2K#`wG@G A nA "o1P na@G_n!_mqV/Q[p6bP `,3L2E\#`72K #`G2K B\!21bP ``23L2]#`G2ň A A2u1Ps!ňA pC3a0`wހ,^,^ Tp]w1bP ` 7> y  ň @G䅠#`%pC06a0} ,_1,_ Tp_}1bP ` 7h  A ƈ A@G#`/pC6a0 ,i,i Tph1bP `@ 7r a  a31bp h~,i TpDi1bP `A 7w   f31bp h,j TpjQAȈA {3M2k#`7ƈ m1P!ƈA 3N2k#`'i3ƈ ́1P!Ɖ7l3 Q ǞpA A3N2l#`$1ƈ  P1P!,Am TpmP1 2mg7[v,m Tpm Qx,n Tpqm@+ ,n Tp~ Bn 7 ,Aa; ,CoF0bP 3K#`a;1P!ƈA A(3N2Gx#`Љb0LjA t1P!p Ap3,CyH0ܐ1`;2Kp`Vy Tp|gzW'l^qqa#`pC ?a0ˀa02K#`k@GuOb`? ,zG0bp h 3K#`q>1P!LjA /3O2{#`v Lj D>1P!LjA 9 4P2ć|#`{ Lj ?1P!LjA A:%4P2G}#` Lj (D?1P!LjA ;*4P2~#` Lj A)?1P!LjA </4P2~#` Lj *D?1P!LjA >44P2#` a?1bp h! 
4KcC ,H0bp h@`#4K "#`g@B"2P "Hi #`#pCFa00"$b0Ȉ .D,A TpĈ$2bP ` 7f &r" /rB,b0.D@Gp"' ;2HC{2ı#`1pCFa0ˠ"+c 0K"#`BA@G"-2P ,Bp+Q0"/qlP/2P .HPR1 2bP ݜP#``pCGa0#2c08Ȉ =, TpD2RȈA Y4R2H#`@  ,`@62P4"H5, A*,ÍH0K#92Pcp#eJ +,H0bp hP1,dp4K##ŽTES:R2H#`upC0Ka0#?dB Ɉ K-, Tp?2K &b2P `ATS:RL7 #`~pCKa00&dd`X,Adp\,A TpĘdRe@V:SL7 #`pC@Na0`&gd`x,dp|, Tpg2bP `9 7 i& Xk2bp h , TpDjRrC:S2 #`pCOa0&nePQ9Ɉ AZ, TpDn2K'&Кf@O =,CI0bP m,AG1K0''T21@'u O2KP'yt" Tp srV0OQ<[p6bP `AI 7' vr' hC>-x2bp hO, TpwR>ňA {+5T2䉞#`O{2bp h`?4K''gS L,I0bp h,f@$!5K''BgPS M,ßJ0bp hА,Af@8%5K*'gS O,J0bp hP ,f@L)5K@*!*¨hS AX,CJ0bp hp,f@`-5Kp*Q*h@V Y,J0bp h,f@t15K**BhV Z,êJ0bp h@45K*#`bPS2P"ʈA n5U2ʫ#` fS2bp hq95K+*«T2bP `] 7v "+ {,g@A@GĊ +#`=pCWa0@+g b5ʈ }s0V, Tprp AY,íJpfűwV;bP ``h 7 + A@ 2bp hП*, Tp2K++ 7B_ +[1K++Ȯ?Qru1bP ` k 7 + ADh 2bp hA!Z, Tp2K.%.)p?LD_02~7_TąL0t}57_0@GO\#>r7_0@GO\T>#SąLH3Q,O\Ȅ>r"7_0@GO\T>#0m O_0tnO6h0n O?LD_02ἏXfh@tMum Ovl@tm X45 0ąL~#DD  >q!#&l@tun O׷*>]i@t[:B_Ouo O4m Ot}7_0@GO\&>]ӷM>]?LD_02ݶH3Q,O\T>r#DD P0O\tPtMW*?LD_02>[;B__@M PQ2պvmO۶>m&k@ۭӶn O۹>m6j@Mn Oa45 0ąL6#G?LD_02麏q@M PQ2պXfAܮM PQ2յظ>}n@M۾}D#DD  >q!S#*?LD_02>R6@߱M PQ2չؼ->}x@ӷmO߼e>}fx@bH3Q,O\Ȕ>r7_0@GO\T>>r#DD  >q!Si TT5>q!dT5>q!Sma+>PSąL#SąL5#<#y@4^Ӹ}.7_0@GO\X8nO㹡>{@4[8oO x45KH +DO\t7_0@GO\T>#3SC-ąL>bO]*صO_M PQ2պa45 0ąL#PtM|@tNym Oa>}@tyn#>m#~m#m#m#>nO缝>~@tyV#DD  >q!SȩTT5>q!SMa45 0ąL7#@Nzm O">֨@!znOA">V@&zoH#U >׷?LD_02鴏\H3Q,O\d>r#DD  >q!ӝ`45 0ąL:#+pSąL#vn Oy">׆@nH3Q,O\ȴ}#>ߦ@􎟲M PQ2պؽQ#>@[,;oO">7-`SąL9`45 0ąL#PSąLu#@4O0|m OXT5>q!S#>b G#6fpD>b~mO)#>3S0 ąL>bO#>&@4M PQ2շ=#>rSąL#>r#DD  >q!'>6P >Ɯ@tOp}}7_0GO\#&>ۆ@t[zm O=&>@tߣ@H3Q,? TLĿ8F2]H3Q,O\T>b OOi}#DD  >q!Na45 0ąL#PSąL#7+SHO\Tח?LD_02ٴ\sM PQ2>>raH3Q,? TLĿ8F2gH3Q,O\Ȥ>rh#DD  >q!Q\T5>q!H3Q,O\TdT5>q!SH_ \T5>q!S}H3Q,O\! PT5>q!#-`SąL1 LT5>q!\dH3Q,O\Ȕ>r7_0@GO\T>r7_0@GO\T>>r7_0@GO\T>>r7_0@GO\T>rcSąLdT5>q!SM\T5>q!SMxMDO\t7_0@GO\T>#W?LD_02ẏwM@ >q!Sm ##>}#DD  >q!#֜PD>PD>Pd>Ӈ`?LD_02ȭa45 0ąL7#?LD_02շ\kH3Q,O\ȴ>rw#DD  >q!TT5>q!S|45@EDc>q!vH3Q,b H>q!)`T5>q!S##?LD_02ٶH3Q,O\T#DD  >q! \T5>q!SM}H3Q,O\t \T5>q!S_M PQ2>r7_0@GO\Ȥ}eH3Q,O\Ȕ>r7_0@GO\T>1[F !y!1N @`,`WCj`W  !F5X0 `t,MCi @ 9Yz4X0 `18,FCh`  @!3XPg `@f`  `!2X0@^ `n, -Ce`  !j2X` FJ,N=RDM0,%Cd  0 Dac`x  0!1X0. `,@Cb`P  !0X0@ `(CIiG Ca` CA!l0X0  X`&, C߂!o`[0- { Cނ!Ro1r( 8 H5ނ!nw`[0@- 0v C b0 Pfq"[0-  o طCۂ!m@ A(@Yzm- `k XCڂ!Dm}`;[0- f C؂!rl @r( 8 H5؂!fl' [0- _ ZP0ׂ0@ 9Yz"i 4k:Q$4IUY@pai赀# @Yz"i* .L2pDa0@O$MTe B(Qfq艤, 0aȴZq 0@4Q& F^ D82ayHʺ:N o Ġ @ ր `A$0qdX(@4QuuxAAāx-  Z C4 @ea"i*2226010ZQ( 0@4Qua\-،(IyHʺ,l@H,Lq! >]oA\Z ~ !1 @ZP Ԅ!a ^M,t0@C;u CB#`4 7E2B0bP 9!ÈA t ; t,,pC,Q#`й3l@<{Vh ^̈A >M A AGO,2bP =e@Gq7A0[0pa0ː(}pVi}^K1,,3P!,YYS4E2lӵ Oo0 DT4D>q!> !1 !tj,J DaԄ! C a  )t0@C;uq `0F   tP1F c $8AA\`d0 n`n`s `@pd{@99 n@ A?C"R #`D 7V2B0bP ]=aÈA t ˆA ?e  DAO,1bP >}@GBQ] A2p H 7d ,#`@4@G$2bP ]:UPS 0M; c0d;24PPD H@28OPC,3P `ռt2,Ct5 7z A4K@S4K@ Tp$L  ԑ0K` TpBQ@'X4K Tp#\@'`v\p0 l#`jU ),#`!5j0Ka,7P)AT AIBP yppplG[A*hG[+huB{ǎA ApG,7P)-@]8@Łt0 7  9, T\x7t^.T;bP `Z0C2#`d5<0K@#`dE@0K@!:L1bP `0[0C2#`j5,Q(g0Pe A npCPa0ˀiՠ%j0bp hVĦ@GU^V> 7 k A] 5K#`uE,A Tpl0bP `0^0D2#`w5A2K#`wE2KAWZAĈA zpCpa0r@,Q, TpDrPjPqw,Wp8b` #Qs\J{& ,CApp J,Ap(x0K`TRB [H@z0P!A pC@a0˰|A7b,AA7b,A Tp|p7 BPT`<k 7l ~,(r[Zh ^0PA8|bSąLȑXT5>q!#>r 7_R!QąL>#wbSąLȩXT5>q!#>r7_TąL[0tmOGQSąL>#]0mO&l0t~4dH3Q,8CKc=t3ąL>bO_0nOSSąL>#F[ DTR29>]6i@tmsQ΄#>Ӷ]@3!D3q8LL>#}37_0@GO\#wQ΄#>?LD_0P 525m#>i04oD9B4[B]>#>mD9B4>ӗQ453HLQ2@T?"5@SąLkp#>mev7_0@GO\#kp#~ 1[r<!1 >[@Bc ȱE Œ-0( 1 ! Z0{-(!|k! D `%A0j3( QՂ!tj}0a=0a q .t0@C;uq `0F   tP1F c $8AA\`d0 n`n`s `@pd{@99 n@ A?C" 2F  R #`t 7Y2B0bP >mÈA t ˆA AHq  DAOa0K`AOb0K` Tp! 
71ˠGPap HT 7g ,#`@4@G$2bP ;PS 0M; c0d;24PPHD H@28OP,3P `t2,Ct5 7} A4K@4K@ Tp$L A 0K` TpBQ@'X4K Tp#\@'`v\p0E l#`m *,#`$5j0Ka,7P*AT JB yppplG[+hG[+huB{ǎA7 ApG,7P)-@a]8@Łt0 7  9, T]x7t_.T;bP `[0C2#`g5<0K@#`gE@0K@!T;L1bP ``[0C2#`m5,Q(g0Pe A npCa0ˀiՠ%j0bp h WĦ@G^? 7 k ^ 5K#`xE,A Tpl0bP ``^!1D2#`z5A2K#`zE2KAZAĈA A{pCa0r@,Q, TpDrPjPqw,Wp8b` V&Qs\K{) ,CApp J,Ap+x0K`TRB [H@šZp Tpr1x0KF Tp,by0PaA pC`0˰|Z 2K 72!2A L ~p A% A\,BP}p,(B0bP `cs1E2B)#`:V ],(B0bp h_<1,)WO3K@ ! (TY2K` 5E@GP 0 ~}pq7 A pC0a0ː A'f,*A'f,* TpD*pv BPT <" 7 A0 ୂw bSąLȑXT5>q!#>r 7_R!QąL>#wbSąLȩXT5>q!#>r7_TąLv[0tmOGQSąL>#Vm0mOGC4538L8O\#]0tnOm0nOSSąL>#n DTR2>]i@tmtQ΄#>]@3!D3}8LL>#mD9B4ߌM PQ2H#^@3!D3->]ߧ?LD_0P 52ٯA?D9Oun_E>]l04D#DD  >q!A?D9_5mu  _@3!D3#>mH>md9@##DD H3QG D4DE4>q!e#DD  _C3ąL>rK7ՏH >q!@#O[fApfAM PQ2@#_1[r<!1# @T[0- !C؂" c [QB[p~-Z 0 Q|`( fIZ0NM`&B„!`a  /t0P1SO c p @Ac 7F HK y@&n@0rdl@qD-AA C? q "c  8 -u[B A T7A2B0bP H,0bP H,0P  7e DAWRi0K`AWRj0K` Tp!e 71ˠGPgp H 7n ,#`@4@G$2bP =PS 0M; c0d;́24PPD H@28OP,3P `Tt2,Ct5U( 7 SA4K@4K@ Tp$L  ! 0K` TpBQ@'X4K Tp#\@'`v\p0 l#`u A,,#`,5j0Ka,7P,AT LB yppplG[,hG[A-huB{ǎA ApG,7P)-@A]8@Łt0 7  ;, Thx7t_E8T;bP `[0C2#`o5<0K@#`oE@0K@!q!#>r 7_R!QąL>#wbSąLȩXT5>q!#>r7_TąLn0t}7_0@GO\#>bOm0}4dH3Q,8CKc=t3ąL>bOF\0nO6^0tK7ՏH >q!}07_0@GO\#>bOAM@ >q!XumO&n i8LL>#>mD9B4_yQ΄#>7cSąL>8LL>#>nOp453BD ąLj@tۤӵmD9_unO״m ءLmOGd?LD_028LL>#mO׸L>mu<LL>O2YH>mH>md9@#e#DD  _C3ąL>rGpH3Q,D0 DTC5?9C!M?QO\tP7ՏH >q!%@#O[fApfAM PQ21@#_1[t<!10 @m1ۂB@0@ @f [0@-(`flAbD! @b !XlH0EgY9 SP€G1 I  LȎ$8#`@3M@G0B3KZ0D>m`A`CZ0>mOSC-ąLW SC-ąL>mO!1 PD i S Щ C8(& a 2I,0F  ?#`0K T4 !4@E R*pC`,052K0 T4e7 2H0,h0PA5@E#A1@Z0D>m`A`CY0>mO!1 (D d`0@a _/ H1F c  } upS1F c  ^c o8 `@PBcGs8<<>9Ѓ>9́A@cq7F0kNzc}ֺ7F oc 07@PeZPȋ?P 1F T D"tP@K5@c /eg #`СSPgHA#`.u A,C #`Щ0K0AR,0P 7kL7ԃ#`5 ,Q#`T#`ŔAňA NpCPa0ˠp0,ÉA$0@G p(d@p܁Vp{xGp,U@ 0K Tp‚(@J%A A RA #`u ,,C#`,U )<#`,e D)<ΈA ^pCP a0E AL *H#`1E$0K Tp$D#` .,D#`@944Ո A,A5P@V/ 7 0`#`@>DP@GXUU-`1bP `Z0C2dZ0bp h0VGl#`cEA1K TpD&h#` ;,#`@k$,7b` dFc@x ]EATA@AT0A AopCa0A0b` dpGra0b` dpGsa0P#` A?,A0b` dGwd0b` dGxd0Pb A DT@T@AT0l=lC@ 0 #` K,CA0bP {,A^31KpQ 7 2Ae i0P h&g0bP `_41D2#`@4,l0b` dPK3KJ1bP `b:1D2#`D,"o0Pm A pCa0qs r0bp h[F@G#`+F Y,A0bp h ^2 u0bp h ^3u0Ps A pCa0`w4 % @!胑 #,?C0K=p,AJ 7 A tg*!1bP ٱ,AH Tp? qj± BL7  "p00$@K#S"10+zB2I br3`0`'@'q! >m1!>}`L#mƄ1!>f~0mO{0nO\04oO ~mo7SHO\#bSąL>r #u]0toO׹>]߶y@tM[5C7_0GO\#5#Fy@tuoȄH]>]V^p0>#>}#DD  >q!#>r>#DD PO\#&SM2>mFSM2յ~Ӷ}S#DD P0XO\dO=`45 0ąL>#&o@}6nO۹1>mj@6oO۽ H#єWaSC-ąLyXTP5>q!#>b`A`HD4>}fl@]_M Q2>#o@7nO9u45S 52>}45?U5D DH>q!XT 5>q!#>b_׺~]׶l@m_M PQ2>#v0L>m95a H#Ѵw"SHO\#@TR2 S4E2]v]TTP5>q!(8>#}$#DD c2IY45 0ąL>b#>m q=_RH3Q,O\Ȅb?LD8V#M4D O\d94H#ts7ՏHHS>q!#aSC-ąLTT 5>q!șITR2>bG#>w'SHO\ȔMTR21[9<!1` @jn@-`[0@-0!|[P ۂB` k E -  g bD! @g HCق!l!@- b ЯCׂ!kAr^ kA`^ Z@[ Z@@Z BaaQ8d y@ ʀ ` ؠ @@ V  Ii.+6 @ ƀ Ղ i :DYr( PR$s C JzJ@0A0JL& 0 0& 0a  010F ^0c 7F c G#1VXϢ5 cd~7F** c7,7F#. 
cG@c朣b8 0F Ȃ c   `0F  c b8AB 62j 7t0 ,0PpC D,A1P 11PEq 70p A aE2K` 7$a02$JP,2P  7d  b0KЌc0K Tp"0#`1u A,#`4 f0K,4P 7n Ш,4bp hN@GD4bP `Sq0A2PU0bp h Oֈ <XA ՈA XpCa0pa = d#`D`s0K Tp%`#`e A,#`4pv0K,7P( 7 z0K!{0K TpD'x#`oU ),A0bp h`R,q0K B 7+ c@A u d0P c`A ]pC a0PfS g0b` d`Ov@@G` |p ,AP}p QC i0P h,-- 7 kA tm0bP ]h,A Tpl00A0o,p, TpF 7@2Aq0Po01 sP 0K0 F@'AR1E 7@2ԁ%,: 7 w AZy0bp hVp,A Tpx0bP `p[0C2#`n5,0K#`nE00K< 7 } \ < \Q<@G#`u A>,(B0bp hWCE0bp hWU0P ˆA ypCa00 ՠ,A),A) Tp(0bP `^!1D2)#`5|0K #`EA?@Gp #`e J,C*B0bp hpZ,*Q!1K  *J 7- k i0bp hZ,+ Tp+PmHD@ sp,+BPtpPuHI@ HAJ@ HE@2p-A2,T{pC0`0 Lp,, tpB,0,, Tp,+@'i B@s?LD_02mu&\@\o_ MDO\du Vh@\۪Ӻ OM4ŀT@>q!9}45O0@+DO\ԟH3Q,O\tHnh0\mOf]0tnO׶]0nOk >]7q?LD_02>bOe\45 0ąLXy}\45 0ąLmO׸>]vi@tqӶ>]]04nO5X45 0ąL>bO`45 0ąLහ?LD_02>n  _  H3Q,O\Tm㇄?LD_02>[fAH3Q,O\TY\45 0ąLH3Q,O\Ȕa45 0ąL'?LD?ůQC,>q!!1 C؂!dl-`[0 (4LFA@Q-Xa  ^ (Cׂ!vk`Z0k-D0 Q`Z@W @0Z@@W *K 2´` 0$Q`a @ "jQ0( 8IUYqy[-8@" hA`PYI@0Q0duQ, 8O- ȉ@Љx ZY p!jx|Av!a~!Qa,ba"a!y0zz6{{H4|>}01~2(03P5`6 7x( 898 :H ;<Ѓ= >0 ?PQX R`H X `x S8TpU`TPVpWZ 0Q95aW0@L &  a +D,tP0@C\pD#}7ՏH >q!H3Q,D ůQC,>q!%-I >]lӆ\pD#> !1 ZPBZaȡ$8IUYa !rjB10a C,151@ ,C @gA8YYY0L> !1 ЩCa LM, t0@C:u B@@?#`t 7I2B0bP 9-ÈA t ;0d0@A @b%X``0rlm B`.`8B!f J" p,U$pC,Zq,2Pq0eK@YZ` >C?C(?))B*)*,+B,-B--.HIIq /D//C889JDK9:KL8 -"@H0bBX`h "jlⲪ6- :02203@4P5`Uh7x98Љ9:;=Ѓ< >?QPQ S0R THU`WpV XYZ\[\Ѕ]^pqrs(sHtPuhv wwzy{P }| ~}(  8 ( @P`pZ@Ɓ$  ȩ C a A ZYa A Ya A ZYa A,tP>#`"T `@Ya A,tP#`"T pC0A8Ya =H, t0@C!:u@y) C?#` 72 A0bP `NpC, #` 7 2 D0bP bN- ; 1Kp 5bP n/U72F0@\`0K` TBqA4F@YF[ 4!1 r a A,PT#` 4а BPT@A8Ya A,tP#`2K1bP n.F Ph KdYYa -Q,Kt0@C! 2:/c    2F?(?vނ3/c= !"mh2FZZ?   /c   ^0F`tڤ_6(8b@@ 7J185@A0 Ye F @ȃ``hIh ,3Pq", 2b` fCA0bp jOw <S0 =SP =Sp A%Ɉ A>A`#`v0bp jO`#`v0bp jOAa#`0bp jON0bP n@:B z#``#e7 88@E4`:B2 !1 @k~-8 b@ FA BׂC(k!m-(@jA B@GSX& pa R,tPRA eQtP B #tPEBsPG$n p`0rll l0l`l` l`0{$`t````l`@r``J`n v@`````ld0{``$`t````0l0`@t``J@n0nPs `` p`l`0{$0n0nPs `l` lJ@nPs ``` r`l``{$n00r`l` l` qJ@n00r` r`l` {$0n0r``0l` l`pJ@n0nPs `` q`l`{$0n0nPs ` l` l` rJ@0r```` s`0l`p```$`t````dl p```J`n v@`````@q` l s{```$;AAB`)@ ε/샒<9CȃCC쁒;؃C8C(I;=8>A8A?쁒;؃8C(I;=8>8A?A쁒C=8A=́(<ԃ88A=́<A88C=88ȃ(<ԃ8<A88;؃<C(I;=8>ȃ(C9A=́<<<<A8A8A8A88888(I;=8>A8A?쁒;؃C8(I;=8>8A?쁒;؃8(I898<A88988CCC(I898A=́<A8898A=́(I;=8>bpCa08 QFp%D\XD@"43b` $8&h66B0 B0܃2m_0@~mV\p<~m>mHD4٧4M7mDC4eY H#tvi DDN1[L-Lqe –$_!1 A[0-(!FkA1pV 0 (CIq(&jBNa I,<tP0@B c     0Fp78ih>芒jc /0Fy ~d,Hcz ]Ajj0F     cz}7)j~y2F x-q?5n,H/c8(1J1 jjB1P#`#UpC,C #``i@E#~L0bP n2BCw0b` v3D0bP b t 1bp jK(Ȓ   -A #`b0bp j`K #`b0bp jKA!#`b0bp jK!#``($#`4R /R@ 8S` 84bP n@6A %T6:A2 D0K@ T4 2PpCa0PA #`qSm^0HmL>m`L!1 @xkn- !GA B(AZpi-@ DWa e Tt0@C!:uoA) C?5 2:/c    2F?(?vނ3/c= !"mh2FZZ?   /c   ^0F`tڤ_6b@@ >H!F @ )@A0Pz @1(18+Wn`e``@@ >` f F``8Y(B3Pшb4#``i#`T )S ˆ ?)T@ˆ AH)T`ˆ H)QT)0bp j`R@ #`' 0bp jRA #`)0bp jR #`+@ЈA A#`# B0K T4JA a,= 7 V> 7 V> 7 ?>4p,6bP n>#1bP n>$Q?pC`0ˀepa`0K T\vAhd pCa0˰q 8H|"!#`v  $185ȫ$dNbA0P LB(b@@ ԠF @ &f:/10qȓxYo:x8@a' T4b^ .P C0b` f`OEQypC0a0@, Ȉ k F$(h #`5$nApR2bp j[!Z Ȉ n R%(h #`5T oVpCa0a,A T4 _AOpCa0 c`u1K0 H@EC0>^pL>#>} hH3Q,!%L 52ąL>r7ՏHHS >q! Op >]vӶ >}[I@T4#ąL S42 OqAXזYm y#DD SD 0!N+D2YA!_A!_H3Q,4HBD >q!> !1 !lQz [@B_ Q Z@^ ZP0ւhAq0"& 0& M@a e Tt0@C!:uoA) C?5 2:/c    2F?(?vނ3/c= !"mh2FZZ?   
/c   ^0F`tڤ_6b@@ >H!F @ )@A0Pz @1(18+Wn`e``@@ >` f F``8Y(B3Pшb4#``i#`T )S ˆ ?)T@ˆ AH)T`ˆ H)QT)0bp j`R@ #`' 0bp jRA #`)0bp jR #`+@ЈA A#`# B0K T4JA a,= 7 V> 7 V> 7 ?>4p,6bP n>#1bP n>$Q?pC`0ˀepa`0K T\vAhd pCa0˰q 8H|"!#`v  $185ȫ$dNbA0P LB(b@@ ԠF @ &f:/10qȓxYo:x8@a' T4b  C\ A`@@ ᆀ,`&Z" BA4A0P`7IP@F @ fkH ,(AA185 v d@@ ' B4A0P`7KP@F @ ~kA0P 8@/` 185/"#Y0h0@/`A 1858&b` *A 0% &$a 2>^pL>#>} hH3Q,!%L 52ąL>r7ՏHHS >q! Op >]vӶ >}I@T4#ąL S42 OqAזYm y#DD SD 0!N+D2YA!_A!_H3Q,4HBD >q!> !1 !lQ{ [@B_ Q Z@^ ZP0ւhAq0"& 0& M@a wS,It0@C!:uoAT 5AP   2F  ?9N-c8a* 2F?(?vނ3/c= !"mh2FZZ?   /c   ^0F`tڤ_64 Kq! O}A!_zA!O۫Ӧj e=|45KH +D2~A!o!1  !XlQ+`[P BZ@B^ @kQ1 b`D![  CA@(@ȩ C<Pa vS,It0@C!:uoAT 5AP   2F  ?9N-c8a* 2F?(?vނ3/c= !"mh2FZZ?   /c   ^0F`tڤ_6q! >}zA!O}A!_k eE|45KH +D2~A!o!1  !dlQ1`[@Bb @kAz- !GA B(AZpl- aZ& a #O,Ot0@C!:u@y) C?5 2:/c   A^s>uK0F+[~)'كc=7-ϒ0F¦|?yܷcl`nA|0Fgyٮ?{چ˛1c\(\Aޢwث0F ?jjjc   3aK0bP `@R`02 A0bP ``Rb02 B0bP `Rd02 D0bP bRhpcа B\pLjA ͰA AʹAg ,Ca ,1P @1Kp T  ,#``)Tk0KN0l@\u0.hr F @ )p=(8b@@ Ԁ7J1850A0 Ye F @C``hIh ,3Pq", RJh w0bp jOp#` w0bp j0Oq#`w0bp jPOq#`w0bp jpOr#`$w0bp jOr#`,w0bp jOs#`4w0bp jOs<2bp jO$ 7H @EB#`#UXA ]% FT IA0bp jRA-1bp jRK0 #`+% K0K T4B1YwS42w wA1OH3Q,4BD >q!-#DD SD 0!N+D2H]0d>mO!1 !k}-%A !f E BZP0Ղd0#& L`0a K,;tP0F  IA?@?J ? A 5   c   ۳!0F@b}6t?{N¯=c*:^vlx<0F=2ʸ?>:9cZZ   AQ0F.+?!8=>P1#`%Ӎ7 xh0A'u!1(7PaĠ@@ ܠeYaAT*# ``T10Y1@Y,":@+ 10;$n f #1(7aĠ@@ g:5 10;"0@0K` T4DQ`D l5TPn Ĉ 8S#`DT@s0bp jN0#` s0bp jN1#`s0bp jN1#`sPQE,1bp j0O%HSM#``~zPB#`U Ĉ A?} TpE%AA( {а#`$B0bP n`:zp72 IpаU A$0,2PPd ]0~m\00>m_0@~mr >mO[fAoA,O!1 @kAr1[ # brk ւ!dkAla K,BtP0F  IA?@?J ? A 5   c   ۳!0F@b}6t?{N¯=c*:^vlx<0F=2ʸ?>:9cZZ   AQ0F.+?!8=>P1@ cd:?,h#`'  7` |!1(7`aĠ@@ eYaAV*# ``V20Y2@Y(,":@+010;$n f #1(7aĠ@@ `gJ6 10;"0E@0K` T4DQ`D n5TPp Ĉ A9S#`DT@u0bp jNP#` u0bp jNQ#`u0bp jOQ#`uPQe(,1bp jPO%HSM#``|PB#`U Ĉ ? TpE%A A(' {а#`&B0bP n:zp72 Ipаu A$0,2PPT 2bp jR*&]P>&\P>[ eE)4T>q!ȁO[p,4!1 D 91`Z@p[ jk`ք!a K,BtP0F  IA?@?J ? A 5   c   ۳!0F@b}6t?{N¯=c*:^vlx<0F=2ʸ?>:9cZZ   AQ0F.+?!8=>P1@ cjo歬#`'  7` |!1(7`aĠ@@ eYaAV*# ``V20Y2@Y(,":@+010;$n f #1(7aĠ@@ `gJ6 10;"0E@0K` T4DQ`D n5TPp Ĉ A9S#`DT@u0bp jNP#` u0bp jNQ#`u0bp jOQ#`uPQe(,1bp jPO%HSM#``|PB#`U Ĉ ? TpE%A A(' {а#`&B0bP n:zp72 Ipаu A$0,2PPT 2bp jR*&]P>&\P>[ eE)4T>q!ȁO[p,4!1 D 91`Z@p[ jk`ք!a ;O,VtPժ9+@ c      2FP%/l?]$c,0y~co0FPz>c?pL2c9k9+8jj0F  ?AP  &APl#P(5P+XP0F#ll?XߥM'crNy0F@<?y~L2c9k9+jj@Q0F.+?!8=>P1#`/sp71 70 A@A@oRq@m0bp jK#` m0bp jK#`m0bp jK#`mP"#`(K|NA #`i# 7r D!br!1(7XaĠ@@ mYbAh8 *!an f %n f%v@@ `o`A3e@`Ġ@@ `oA0pj*`@@ t,A2Pt T%B 0bP b WA#`$0bp jPRAT@uRA2bp jRˆ JB*T0ˆ AKD*TPˆ KF*TpBEI:RĈ MPO e42bp jpS#,dЂP1bp jV-e0K-4BÎUA ;B0˰0B?1K T4(@EC%3K T44!M@i0L>m#~m#>mO[fA|AO[AOA__CM/2wO!1 @ZlAr1_ # bk ׂ!k|- !@A Bj- !PC0@a N,=0F-=Z?  kc   2F<>t-cO*MA }0F~?ɞ*Hcoߖ% ]db0F֠ZZ?hjjc+    @2@Cku5c,"5BA""A#`e؈A A#`mڈ Aڈ A)7bp j`Jd#`و A*C6bp jJRPd#`و A+G6bp jJRd#`(و A,6bP nP3A %qW#``;C#`8pA M} A,C #`0K0AS,0P pppCa0p1P,qfа,1bP np6pǎ 7p HAS,2bP O@G Bro B@\p ,r@#`u#`t#5 Ձ @,M(U2K T4,A3PѸ 0Y^0d} `H3Q,Hӏ/O\dOv]0d>mC] #DD  M?"0ҿL>q! !1 Cւ!TkA! 0Z 9Pjv Q0Ԅ! C a P,;0F  I  k2cs|0F*ӵ6??I4c5Nd=A3s0FP'{ ª?}[,܆vcb1J1 AXjj0FZZ?   c   } upաVtPP0@K5@kP#``e#``d#``m#`'7D#`ۈ A*6bp jJR@l#`ۈ A+6bp jJRl#`$ۈ A,6bp j KL0bP nP3BEU A A;bP `PSa0A2B0bP ]L ÈA t1E •7qA 7j   7l  ;Kp T4PƈA ANµA7AQA1A2 IP` 0K T4#`p pA [pCa0ˠ,A th50#`С1K Tp",; 1B24NPv4bP n7i0bP n7jP)\ !@D! 
BPE @E#4n</ A87454 !1 `Cׂ!VkA@Z (I jw(r`ENM0a P,D0F||:"H  kc   ~"2FȻ2>?d]/c6Z{o  !1 Cׂ!^kAZ @8Dj{(r FNM0a R,AtPȺ@ c ޲ܣ5  &A#0F  ɌKq/cs| c5mu/cOimA?0F_q?g ,Hcs !A#mh0F}?jjjcZZ  @QRP0F  R #`$]qp DZ D 2K0bP n2b0bp jJc߈A F @ Z @ F  F(A0PW F @ `I1 Yf C8`aA0PZ"a F @ l)`@@ b F @ p)(`@@ c F @ t)8`@@ `d F @ x)H`@@ Ԡe F @ |)XƠFF @ )hᆦ `0s@!Q\f#``kAA\A#``nAAR ,Á%0K T4a (,C@E D nP h04,srа@ECЌU+\4p_p A+,5 @E#<3K T4"@Y^PL>&^PL>V\Pd>^0L>mק?LDO\S,2qAO[uAO!1 ZƁ$4Bi-(Q0FNM@a F,=tP@ c ޲ܣ5  &1F  ɌKq/cs|X 185襇 `@@ `5185@ A0Pp (` `a YbQ*\ a@@ Ԡ@XA0Ps 51858XA0Pt 5185HXA0Pu 5185XYA0Pv 5185h@YA0Pw 5A0Px fi0e(?,1PqGTW+ AP W ؁߄BPD#`p`0p A  ppw `0ː(,2PAB #`!P  7 4,3P4/3bP n:0bP n:ˆA  5V 숁 ؁A@Y_PL>&_PL>V]Pd>ӧ?LDO\S,2~ _uAO[yAO!1 !kA`[ ` 8DYF1@`I@@Z0_- !ȩ C(a I,WtP0F(?   1Fj0F   Kqc!o,mAvJ0Fp¿2FrO?-z/c1J1 jj#`)L 7c02GPc WȈA ta\ЀeAR`ƈ -AP#`e0bp j`KP#`e0bp jKAQ#`e0bp jKQ#` e0bp jKAR(2`f A #`D8,1P0AO XkD#` BA#` 91bp jN#` A;QĈ ;BP gPg0P,1PѸCA#`s #`v#PChŽ   #`S >S@ A?S` ?T AHT H*T 0K T4! 6^0D>mCזYF]PP>G`?LD>q!!1 %A !k!@o- @حCւgQA !Jk}`ZQĐS a O,K0@#0F+J"HABBPon 1F+J?5=i/cn{{|o2FPd*l?򵺾6/cj.J:<2Fiꚸ?'g/c5OA5 l2Fek$|?M(/cx^͖q/z2Fpߓ<?zIL2/c 9k9++jj2F+J?#` čܓ7 t,C0 2C0bP n2`0|h0A0bP n2d0bP n2ˆA  5V 숁 A,sdа N2Fpfа O2F0bP n`3m0L7#`9#`<#PhŽ 4#``!i#``$mTA #`$AS 9CS@ 9ES` :GS :IS ;KS ;MS <OS <QS ASPU q ,ÁEB pƈA pC h0ː(A@E#$2bP n70bP n:ˆA  5ЂV 숁 AB7A1,,3P 4 -3K T4 M@ YY_@Lo[~1mD^@L_!1 ( zY9fkA! @ [ GA Ղ"AB 9a G,G0F  ?   "P00F*}cn()2NAH0F࿌{h+c뿫lAJkj0F@4vJ鿌Լ?}c$ 0F6g 쿌y6[=cxo Ox<0F@%2& 9cXZZ(@#`  70A < 1bp jPJЍR#`y ;܀,0@E0P@R`0bp jJ#` `0bp jK#``0bp j K#``0bp j@K#`$`0bp j`K#`,`0bp jK#`4`0bp jK#`<`0bp jK#`D`0bp jKL2bp jNAU 7A 1K` T4Ba#`e#`h#PhŽ dY[@LO[o1~mT6!1 ( zY9jA! @W @jB]-8 aa /L,Ut0@C:us5oۚ coY ˞"h0Fa+|俌DZ .c..0F?ơ,wc!d<~n0F~(9?jjc     @#s<ă<Ѓ<<>9Ѓ>9}@l@tPB c     @ @ #c    hْ0F+J?AB#` wǎ7A2B0bP ? ÈA tD 71@AȈ ,A@#`d0bp j K@#`d0bp j@KAA#`d0bp j`KA#` d0bp jKAB#`(d0bp jKB#`0dPA##`؁,#`dYA0bP n06A ufp0b` v6DP!A$#`CA#`l0bp jNq1bp jNES0#`p 7 4,1PR 7Ts 2(H0`1ˠ$,2A2bp j`Oˆ =*Sˆ A>*Sˆ >*Sˆ A?*S0bp jOAˆ Hb #`!*T@RȈ I( PN80bP n:C ;b` v ;D0K T4@J@ YF^@LO1>}7_TąLv^@L_o1[O!1 Z@p[ !lkA [ ,(Iih" 80q 0Ah-  C@Ԅ!a {N,xtP̙0Feok?e,{+2Fp;*?vܻ8ccpO;\*:*>ߒ0Fp3?%ckgx朳0FZZ?59N-c8a hْ2F      tPB#s<ă<Ѓ<<>9Ѓ>9}@l@0F]q?Zs/cxk)ikhơJ0F#sڎ}?aǦ.|c{8:yoh_<0Fgޱ?mhcl~nO朳朓0FPZZ?BP h٢2F+J"H#`1#` ##`3 7k A0Ȉ A-A#`k0bp jpK#`k0bp jKA#`k0bp jK#` k0bp jKA#`(k0bp jK#`0kPA##`Dph0 A m0b` fCm#``;DAA#``=,1P 04YM0bP n6B\ A0bP n@7A uvp0b` v7DP!A$#`CB#`0bp jOq(1bp jOES0#`P>Q W#``C#`%e [)Tˆ I])Tˆ AJ_)T0bp jRAˆ KbP #`-A)T@0bp jRAˆ LfP #`s /,q#``$ 0K2bp jPSA aph0ː(BÎ( p*2,L0b` fK+81b` f0K0(,A3PѸ!0 Y]@LO1~mDE&i@L1[2!1& Cׂ"a BZPC ׂ"!D0 Qa0 b`QLkZ@B@V @jX- ! 
`Z@BS @vj:a ;^,d0F  ?A" `5   mtP6PRP5@ c   ۳!0F@b}6t?{N¯=c*:^vlx<0F=2ʸ?>:9cZZ   AA c¬n3_@ @#42c?pL2c9k9+8jjg#`1 7h0 0bP n`3ph0 Aj iA0pF  vf%*D`0(8 `@ `@*8 `G  80YF  F1(78F ܆L1  a@@AXQB)D #` ?1bp jO #` <QG PA #` (S ˆ A>(S@ˆ >(S`ˆ A?(TDB KT@ RF##`$Ӷ\0tmCx0D~m 8>#>m#>m#~mO[wAOAOۢA_A1OBM/2sOA5O۳O_M/2i:9cZZ   AQ0F.+?!8=>P1@#s<ă<Ѓ<<>9Ѓ>9}@l@tPժ9+@ cuY)zI0Fұ|?;zce;~d*0F@朳?cjj#`2#`5% 7lp ܌0bP n3ph0 An mA0pF  ~f%*D`0(8a`@b`@*: `G  :0YF  F1(7X`F LQ  a@@AXQD }#` c1bp jPO #` =QG PA #`D (S ˆ A?(S@ˆ ?(T`ˆ AH(TDB KT@ RF##`(\0tm 8>#>m&]P>&^P>Fl 9=-4T>q!AȁO۱A]_[t,YOeq!ei>nφx]ڤ{0F9 :+|?تh봛zcJ۱Ad*0F@蜳朳?jjjtP8@ c¬n3_@ @#0F  ?#` !QB@#`4\; ,C0;@,0uPARK2bp jKr 8BS0 8DSP 9FSpAS,X0K 0bP n6A  ,QE܁AF  F1(7ah* a`}>0YC >@YC ,":@+10;$n f$1(7aĠ@@ oR 10;0"0'@0K T4`G 5UP Ĉ AJ8T#`+DT@+0bp jS #`1 0bp j S #`30bp j@S #`5PQ;,1bp jS%HTU OP #``s PB#`aU Ĉ X TpE%A A4 {а#`jB0bP n>zp72,Lpа5A00,A3PP8 MBSȈA AĈA #`3Z#``CScąL>b r >#>}7տ8FąLV^ #>6^ e_yAO_!1 @dlAr1b # bk` ׂ!k}- !hA Bl- !xCd( Z0@hMa E,%tPP#P42mC[0 ~mD!M>mCצYL]PDӶ[`DD!!~mÀ01[8!1 E k-` ZP0"Z0@^- Z- ЩCa rN, 0@HP0@ #PA@ ȈA 9@E#0Pq   Ax ȈA :#``Y`Ġ@@ ܠ_ &` &* ؁PA=2Pq)2 |h0`, 0(h #` TpC8  7$H2 I0bp jJð %H*ю@c@1@0KPRA#` T@ɝA7A20M0K T4K@fZ0 >mC6\0$~mVZ,@dӦYL\PDV[`DD !~mÀ0!1 j]-(@j]- :`a J,tP0@P(#0F  IAqZ} upStP0F      tP#` T 2H0 @h0 A 72 D0b` v 2GQA$#`"P@a 0Pq% T  r p+,A0†/ f #1(9a1(9b8B0 DP5AF l .A0$(2dĠ@@ :x .h 1(7ؙ`Ġ@@ @g7 10;"185`F @ \*(6b@@  F @ )A0Pq ࠉA0Prj@F @ A0p F  F0 BP F F  F  F1(7= 10;"%HXf 8$p f[`e 8f^0L>mr >}7_,0H3Q2o!1 !dkl-"Zp0ANMa L,$tP0@P(#0F  IAqZ} upStP1Fj5   "c   x<0FPZZ?BR AB#`BKUA W2H0 Th0 A 72 D0b` vp2GQ%#`'P@a 0Pq% T  | .A01YF @f%8F @%8* @Y`@@ `g1(O2@F 103H 103PC(5b@@ @ 185 185# 185('E J1858'% F @ I8103!185P'!xF  &1(7h1B H@0b` v`7F0KT #` f  ; E~pC a0ː(A,,2P ,uʈA ˆA A#`3` Z#``C[`e 8^0L>mr >mw`S45>q!> !1 ȭ  ւB Q Ԅ!a Zx !tPReA @#A #c   )T BR PGaX`E9PE^P4@ c     BP#`  V`i 0\'(D 7\p#` 4#```pC,C0  0bP n 3D a#``6#@G#N l0,#` &  ,1A2H0K TrpЂ1ܐ2$JPB,2Pqph08A >p,C ;K t4B12K T4"8ЁS}0bP n`6A} ,DA NA7AQ1A2HQ0bP bVp|а BPT|A 7 T+A \pC `0`QA \pC `0`UA ]pC `0`]A A[¥B7ApAh#`s #` 7 X 7@*%`,6Pq@ 7 #` z + \thpx 7B/,%|p9B0a@A`@|#``#,/ 7 e0A kpC `0Pd8 7 e`A AjÕC7ApAA ÈA AC A=,A0@@*gpA@ A=,A0@p k, T h0p 7B>,AP:C2U@!L0l0b` v?@E#M, T4|B@Ec t0PspJC0`u 7H vp, T4aFd+S42q0M _u0} 0 0Q>mYYV]LAD~mt ~mt >mO J0DD_PLhPL_PLiPLviPL_0t}7ՏHHS >q!!1 Z`r C (H"S  %H0@L@a u],ztP(#0F  IAC ? A 5   c   }0F뮮^t?z=cׯ|^4.x<0FnI2ʸ?99c   Ajj00FlN1 T D"P0F<>>A 57xrxc     &3F.+!8=>/c<]m꿾J0FP$Hyq?Ļ>~9c~ !۷emh0F (?5jjc1ZZ  @Cq!}>mHD4庩A#DD  M?"0ҿL>q!X4M9o8>&zPLFj DDN[fAfAX4MmHD4- H#lPӗ0454]O[4M7SąL!1/ bhC܂!m궠LFA B(`[[@@k X[0- `D w`8[0- @Cق!Vl&@@0!1 b<@ DaH0B_  ^ @kxM@04aa gz =0FP{8#H7-9c.zA8)+pl2Fe揟̫>8c(o% Ald2FƧm.y=4cu"h)/<2F,=.tzï{wc`³y:̦r2F1࿌؎rccY`aHj;O2F}i'Hmp c79^5\0F9~>`?˃(/cg݃A `~0F$M >{`/c}ڭ0F&M{r?9o쏣/cǡ~Ou?0F5 ߼?>娆/c)/2X0F)Hvm* ce49}ݾ2Fq<=}睻h+crIa}xIhn2F맼ޖ0*\bc=ylα2FƦa*p|c:YgzA+:x2F0-뒹̲g8tc+/Hm󮾗B#1Qa[mAAoABA? -P   2F  ?:(2c2Hfgjo 3F -3N%Hi}*/c;yo2FP?.p,2c${L=.¿!3F6?H*y 2c(qA:l2Ft{j?/cܢ}N(H3o/0F+J?9N-c+#  #Ae@ c8a 2F?(?vނ3/c= !"mh2FZZ?   /cz}7)j~y2F x-q?5n,H/c8(1J1 jjBQACSu)@A!B1P  3aK0q4 3aK0bP b`[0C1A CU?0bp jV#`j 0bp jV#`l0bp jV#`n0bp jV#`p$0bp jW#`r,0bp j0W#`t40bp jPW#`v<0bp jpWD2K U? 
QA?Ĉ _C#`}0bp jW#`0bp jZA#`0bp j Z#` 0bp j@ZA#`(0bp j`Z#`00bp jZA#`80bp jZ#`@PA$x0ܐP2FPjQA?Ĉ lC#`0bp j [#`0bp j@[A#`0bp j`[#` 0bp j[A#`(0bp j[#`00bp j[A#`80bp j[#`@PA$w0bP `c02$G0bP `c02$H0bP `f 12$J0bP bcpа B\p͈A ˆA AB# H, J,3P (1K TP"0 K,C8A BD,Ĉ }JW#`DJ*1bp j_#` *1bp j_#`*1bp j_#` *1bp jb*Ĉ IJXĈ KJXĈ MJXĈ OJXÈA C+!0RKȈA K;@p0.h" F @ d,bA0P bB((b@@ F @ Rnfbex`@@ Af {%*A‡QM UM@ =!%0bP `k|12HQ0bP `k~12Hch 7 h 7 i 7 rѰ B\pوA <=#`Up:2\X0@`3p]p@U6F2h[0b` fP^\xᅌU0l@D0+b@@ J6185/"F @ D- aĠ@@ Ġ`!aA0)%*C6*1K T @E#эWh1bp jj#` 1bp jj#`1bp jk#`1b` f_I2bp j0kƈ hZ0ƈ hZPƈ lZA#`;Pi ؁X AY ~,, T4x WjpCa0b1bP nVE #``l%#`iuRm \ , T4J``0Pa,] 7) f@A ڭpC"`0`e] 7+ fpA ±H7ApAA ^ˆA ^B0 ,A0@@*hpA@ ,A0b` ffoP"2bP n@b1x r$A0P JjG185HD"F @ V.8aĠ@@ a!@a 10q8!whY7h6@؀`@šKk @E#k` o F?\ A`@@ ̠ N`AFV$ B!"4A0P`7P@F @ nH ,&BA185 v d@@ ' B"4A0P`7P@F @ nA0P 8ᆀU` 185;"Y8h8@ޠGt!0e9F @ n6%F$f ` 1"Y;%F@Y;h`*;%ȃ2gxpL>#>} 7ՏHHS >q!}$"OA"OAOӵ" O+" o+" >?LD?EG@ 4BD >q! >}A"_A"OӶ >[*ӆ "OMKH#8O\#>r7ՏHHS >q!AXAXfA?LD/!/8үQC,O\#>b#O_?LD?EG@ 4BD >q!H3Q,HS1ąL{L>#>#>}*7ՏHHS >q!M?LD?EG@ 4BD >q!!1+  !T0Xp @  !o- ! (A[1w aػn bY! "!ط0 Q부@B"s[1,J6aE-(@flt0aa j~ 50F@j= H̶ʷ0c-쳱;Ic*0F0ɂ,>\!+fcv{+w0F|#i?th)_c$80F :XZ?amc  A>,t0F  ?gkm c)^g=-cOj2F%hv>~ .:(cb>!ߺh2Fp¼&'ή-bcƮ$2FdrcӳjN*2F9z+.=c32F&N7 (H2 cg袲kbnmk0F2Țe=/c 080F0ƪy{i-l߶/c韋i~";r k0F0ө=m󡿌4z#/cqʶAާ60Fy.̂꿌5no/c*=x.uNPGalE9PE^@ @P4@ c     0F/h/sH1˂2cdλY)oʲd 3Fpm˷?z/c/-k  0F/l+Ȍt\ñ-c0*m^-h%\!1Fl6`Ȍ}H c An80F01cܢ}N(H3o/0F+J?9N-c+#  #Ae@ c  2F?(?vނ3/c= !"mh2FZZ?   /cz}7)j~y2F x-q?5n,H/c8(1J1 jjBQACSu)@A!B1P  2Fo<3aK0q4 3aK0bP b0[0C1A C>0bp j`V#`g 0bp jV#`i0bp jV#`k0bp jV#`m$0bp jV#`o,0bp jW#`q40bp j W#`s0bp jW#`z 0bp jW#`|0bp jW#`~0bp jW#`$0bp jZ#`,0bp j0Z#`40bp jPZ#`Ĉ jC#`0bp jZ#`0bp jZA#`0bp j[#` 0bp j0[A#`(0bp jP[#`00bp jp[A#`80bp j[@2Kp#`6 >,Cr#`8 >,C#`: ?,C#` 3  ; 1KЌ)Q)T:2,L0@0@"\p@%3D2ԁX+IBW Ĉ A|A0bp j _ATȈ }D#`(1bp j`_#`(1bp j_A#`(1bp j_#`IWĈ AIWĈ IXМĈ AIXĈ IX ÈA C)!4RJȈA J:@p0.h F @ Z,pbA0P ^B(b@@ F @ Hnbbex`@@ ġe z%*AQM UDM@ ;!%0bP `pk{12HQ0bP `k}12Hc_ 7 Zh 7 h 7 rѰ B\pوA A;=#`U`:2\X0@`2p]p@U6F2h[0b` f^xمU0l@D0tb@@ *6185س`/"F @ , aĠ@@ `!aA0)%*C61K T @E#эWZh1bp jpj#` 1bp jj#`1bp jj#`1b` f_I2bp jjƈ BhZ0ƈ ADhZPƈ ZA#`6PBi AO AY ~,, T4x WjpCa0b1bP n0VE #``g%#`duRl Z , T4J``0Pa, 7( f@A ٩pC"`0`e 7* fpA A­H7ApAA ]ˆA A]B/ ,A0@@*hpA@ ,A0b` ff0+oP2bP n_Px p$A0P FJG185 D"F @ L.(aĠ@@ a!0a 10q!wXY7h6@؀`@ŁKk @E#k` j F?\ A`@@ ᆠN`!FT$ B"4A0P`7ȈP@F @ xnH ,%BA185 v d@@ @' B"4A0P`7ԊP@F @ nA0P 88`U` 185`;"Y8h8@ޠ`Gt!x0e9F @ n%F$f ` 1"Y;%@Y;h`*;%ȃƄbeY=؃`EY=h=0`AG|e>fFFf `؃ ivxpL>#>} 7ՏHHS >q!i$OA"O[AOӵ" O)" o[*" ?LD?EG@ 4BD >q! >}+5$e$}" >ӹ">m_ӶOMKH#8O\#>r7ՏHHS >q!XAAXfA?LD/!/8үQC,O\#>b#O_?LD?EG@ 4BD >q!H3Q,HS1ąL6{L>#>#>}*7ՏHHS >q!A?LD?EG@ 4BD >q!!1-  !J0Xp   !o- !h (A[1pw ai bY! h!4̢$a @-  ⶀұM㺟0F0| '俌+nc  سio ؖB@ c   HC C C`A P  0F   =)c$.wiI$0F`n^?*m6cmc*sΚsΊ0F@ZZ?   tP8@ c¬n3_@ @#0F0ۡ?9cnM}˦tg}2Fc5peY/c++kʛoɂm0F3ۆ-i`n/c;q^5A]0Fprʋ,Ϻh->1/c82ث0F@ަrL<-/c$;X%X{ػ0FҤZΓo_- cr4\m*0F0l+c o>~c3  l)2F<>,I/{.c;=~xn2F0}zvh?Iz޶cwΛ:((}q}2Feϸ?qM7ct9aoʰ>˟2F0 8o?c c}]ۮA8n2F@joq\-Oe/cy.Z \zX>0Fpg.?フ)⪷/c$)~zt{0F7shia9/c N홆l0F銇7p˺d/cz*+ p| 0F@ۼPGaZjE9PE^P4@ c5J Ҫ虚!1F@>*5Ȍux cqZbη}?0F}뿌c"ɫhk"1F|y/qȌ/+c8{ނ8w0FPN 9#ݞZ%cA,h(0F+J?9N-c+#  #Ae@ c8a 2F?(?vނ3/c= !"mh2FZZ?   
/cz}7)j~y2F x-q?5n,H/c8(1J1 jjBEPd8<šVZB AB)5   /c8kth%N0F0?:⩞?>9 +/cmȦ%Ayin2F){^?lLi/c}j|k2Fym?//cxxnҫ-Aٿl2Fbګ3?Ȍ4ia)c(3+z' 1Fҡj*-ȌƷO˱cy*-Z0F,ᅩ.j62P3aK0q4 3aK0q4 3aK0q4 #`   A)%2bp jf Lj Y0Lj YPLj YpLj TQD ԓ3bP nOp "h0 AE""&"!A0p}F  i%*DP`0(8  *F`a83e `84e("EA0B!E`8A0pF  Rjy@A0"S@ECP1PqV#QCA A%#BZ Ȉ A0bp jPkAHTȈ AHp##`72bp jkp##`72bp jkAq##`72bp jkq#%Q8H#`PB=Ȉ P=񈁁 A[=2b` vV?#%1bp jnP&B[&BWTĈ c,rdҰW& ;bp j@oy #`vppC&,C& [y,2P , uJI #` ow A L&QAĈ AȵI&#`"l2bp j0r&#`$l2bp jPrA&#`&l2bp jpr&#`( l2bp jrA&#`*(l2bp jr&#`,0l2bp jrA&#`.8l2bp jr&#`0@lRA$0PI1F @ T2 U%FD fGZ Y>`@@ %ug*>%&@b@f Ba *f DaHYQh|Da*bQ( %Yn)fLډ j'F @ v2vF @ x2185 hg@@ 'A0P vF @ 2 185 8hg@@ @,A0P vF @ 2185( Xhg@@ ,A0P vF @ 2185H xhg@@ @-A0P vF @ 2$185h hg@@ -A0P vF @ 2,185 hLf aeje@Tej'*185ˠv185 hg@@ /`A0P vF @ 2 185 0hg@@ Ԁ/ࠝA0P vF @ 2185 Phg@@ 4`A0P vF @ D3185 phg@@ Ԁ4࡝A0P vF @ L3"1858 hg@@ 5`A0P vF @ T3*185X hg@@ Ԁ5࢝ 0%bAV!vb@@ 6 hg@@ @6@A0P vF @ h3185 (hg@@ 6A0P vF @ p3185 Hhg@@ @7@A0P vF @ x3185 hhg@@ 7A0P v*dF  x4ᆀ`X!1(0ZYWh`Ġ@@ L\nl f\A0 Bta f daĠ@@ 1(7ht*%n f`!` *V/W`ႃ%r! Y}`Ġ@@ ĀLשz185 j@@ > F @ 3@F @ 3p}F @ 3185 g@@ Ԡ?hA0P }F @ 3185 8g@@ DAg@@ @D aA0P }F @ H4185( `g@@ DaA0P }F @ P4185H g@@ @E@"A06 ʧt )¯dĠ@@ &`؀X`h} VĈ ;%E[ChPUĈ < #` 8 A< P A3KP @ , I/h"BJ')!A0^ jj!)0e[F  ~5ᆠ`!1(0 Y]`Ġ@@ `\nt ft!A0r zj!)0eЅ]F @ |5XvA bF  1xF  1*+n fx`*VT]xႃ% Y_`@@ <̥f ơ2A0p a :4XW#`: 1bp jASBX#`<%k0 AO  ߕ@E#DFc`*1~AF ~A00 `@@ ԀU ~j@@ ԠU@A0PW F @ ^5185 (~j@@ VA0PY cF  @4HA0PZ *A0PZ F @ l5185 j@@ WA0P\ íF @ t5 hĠ@@ ܀'`Ġ@@ 'A0 }`!%(#g*`JS+n, fA0p A0p zG 2A0p A0p ŒG 2Yshsa!+Y|1(0P@YuH`Ġ@@ mn fAA0 k! /0eXvF @ 6Cn%xA0p paĠ@@ `/\@t!50ehwRYwX Ah b*hwnR- f!A007 f adĠ@@ ܠD`؀`H VĈ xC#`*%Ԫ1bp jAP^ˆA 00 8`4K0 H<;Yz8:a*1cL;Twj Bo#``Fv t|W"(h #`? Fd(h #`b I(h #`e L(h #`h O((h #`k Rj(h #`n U0bp jAqű7 ` D6o0PC~pCoa0pm0׊@E#b0P9Pl,>@EÁ>0Kݨ0P*0K 0PѸ)0܀s4F2DHV ;K?d?LD/8DL>q!M#O\d G#>>#>F>#>Ӷ >#>ӦL>ROӵ2 O[+2 o+2 ߜ?LD?EG@ 4BD >q!2 B"OB"_ӷ2 >rXT4#ąL>#>b _*{P0>&0n#>m O&y@>#>#u',ԺO~`@>r. >mj"">]ƚ>#>#>}57ՏHHS >q!غ&,_!Á >w‚HOn"m#DD SD 0!N+D2ع+>]F0~g7/!H>q!'>]ӖYfy@>#>#vyP@>#VL>#>R>m #O[u}y#DD #NQH2M'O\d #>#>Vyp@>#>ro7ՏHHS >q!5.O?LD?EG@ 4BD >q!E&4Q$شM&,OBO[B_BoۡBBM/2#DD # 52ąL>#>r^7ՏHHS >q!\?LD?EG@ 4BD >q!aHT?"!M1ąL>b #>#>#GS42?LD/!/8үQC,O\#>rblH3Q,!%L 52ąL>#>rUhH3Q,!%L 52ąL>!1m  !V8Xp   !7X@B@ n(A  FA Bm Bm bY! `! `~ D0 Q,  P`! 5X10 !3X@ e,GA Bd1 b&0Qd$8͒(0a: !v1XPbrfQ0 b, `  dAނ0 b@ F ނ!Po!@- @C[@Bpw ,%A0Igap 7aEC-(@hnA0s [0@NM &   C,a : , 0F H3o/2F`o?4#2cˮ+> c0FdM"|l^6c)_uX0Fo>8|Z:c򭻦. Ax l2Fpʶ?>!L`9/c;nʯ޾*2F.>?Ȇvc9dʗ}>9hx<2Foi>jjcoe؆z[A9  2F?   cYq[AᆴY 1F`Ӷiz ڽϦ/cm! g0F0n8フ0!/c+˶-0Fk7h8g=/c3~oz(}8y0F㾚ʸnjfi/c銺Kϸc4z0Fo.}n) c̣{~/xڭ2F`9|>>n'/c)fYw%|i0F0(->j^b/c;-zAb߭0FιZq?tjhO|/c$}/Aܧ0FP馪M?g /cx[J.c@#1Q[oAAoABA? -P   2F  ?z?ڋc~j̲knf 1Fj*Y'Ȍ}}۸cާzx)`0F: ?0ۊ82c%plo[$"3Fp:8x sH1[ :2c~o-| 3Fп:-?cg/cy_A,h(0F0ۡ?芒jc xqfKʠ,0F  HAPon 30F@,꾌iO/cLAz{ςt0FpȮpڵjjjc   ^0F`tڤ_6:9cZZ   A8@ c¬n3_@#0F0w-ڣX"H2~,coI/ɫ2FPnk=2?t~g/c=Mn2j2Fsu??)޷/c(O.#y,y2Fh´p?9=/c $^ 82F:t?l8o/cg컾;n\2F7㲭?.=j/c+*ڰ?2F}-迌1/c jªd 1F싞%HXc,{鈻|͊N0Fp޵,w'Ja/c>2*[(2Fv\>̚'Nb/c,˵9+hp>2Fֺ&.?zmcn? 2F`c?g9c͚9o~2F/^?<ޢ? c: Z[Au+. 
1F@h>(H"#cc6ki <=è=k2Fp]ڼ)y+-cd|JA]1 ߮2Fz635cy %}2F>ÿ릿"ڦ#c5Jnꮏm3k2F乆俌?9*c}JA+!3F}䭎7Hڇ 2c;{A_2Fj[?r  c<%lrX>ޚ="1FP߯ΦȌ1(`!c|[?nӹ= 1Fc; Ȍ}c0FZ#ȐB"3aK0q4 3aK0q4 3aK0q4 #`  ~ ~4K` ~ *E2bp j@fLj A~Y0Lj ~YPLj A~YpGZ3G1@ 釐#`o 1bp jg#`q1bp j g#`s1bp j@g#`u$1bp j`g#`w,1bp jg#`y41bp jg#`{<1bp jgD2K ~ ԤQ~Ĉ AG#`1bp j0j#`1bp jPjA#`1bp jpj#` 1bp jjA#`(1bp jj#`01bp jjA#`81bp jj@2K e,eGК~1bp jPk#` 1bp jpk#`1bp jk#`1bp jk#`$1bp jk#`,1bp jk#`41bp jn#`\pm0Pq@i pC3a0o]8,8\0l@G0 b@@ `@dΡrfA0P jg185x aĠ@@ @vnJfe8F @Vc *78r0Pqؚq0Pшq`:Rӗ@(n%103H=(#t!0nx vP  d@@ @ v  d@@ Ԡ ! v  d@@ v  d@@ ` v  d@@ A v@  d@@  F @ 08W@u`a 185ÀBg*a9(0n\' fA0P< QY;h:@`*(-%ȃY<hx ;%Ѓn!f `CЃY>hL`AL䆘e?u%FpgY?Dd* ?1(7@941(7X4n fBAZ A A #`07@E/@a  ( TL(( 7 c p p!>h0@ Tq6XDuV6;b` vsH0 2)#`='Pk6W#``cGf La!J"(Jp}`Ġ@@ Ā&GA0Ps A~F @ 1`@@  *d@@ Ԁ f@@ Ԡ@țA0Pw !oF @ 1185 (f@@ țA0Py #o*J&%F @ 1(@hd@@ w210;tF BwB F @ 1D*b@@ $t $A0P `@ΆnaG @ N2!1(7H==c fP!㡆 b *S%P2!*)*!R@ ]%(zbZ!({vb@@ & hg@@ '@A0P vF @ t2185 (hg@@ Ԁ'A0P vF @ |2185 Hhg@@ ,@A0P vF @ 2185 hhg@@ Ԁ,A0P vF @ 2 1858 hg@@ -@A0P vF @ 2(185X hg@@ Ԁ-A0P v*d$`vYWxbvډA0P j'A0P vF @ 2185 hg@@ .A0P vF @ 2185 @hg@@ @/ A0P vF @ 2185 `hg@@ /A0P vF @ @3185 hg@@ @4 A0P vF @ H3&185( hJf FbmjeX*mj'*185P͠v185X hg@@ Ԁ5`A0P vF @ \3 185x 0hg@@ 6ࠝA0P vF @ d3185 Phg@@ Ԁ6`A0P vF @ l3185 phg@@ 7࡝ %A0 i!x'0eYF  n4`f!1(0`xYZ`Ġ@@ @F9yj!.8Y\1(78àtF  00 I{Y[`'0%UpϠ[ f pahA@`$r!1(1uj^'F @ 3}185 A0P  A0P 'hA0P }F @ 3185 g@@ >hA0P }F @ 3185Ϡ}185 Hg@@ Ԁ?@aA0P }F @ 3185 hg@@ DaA0P }F @ D4 185 gĠ@@ ĠLF'1(7H 6 'ZqAN1bp jA0RI#`.$%O1bp jAPRˆA pC`R,.B0b` rP@E# t? `JJ`Ġ@@ Vn fzA0\ j!+0ewF  t5ᆀ`|!1(0Yp`Ġ@@ Wn f!A0[ n% A0pv A0pu JYq`-0%U q f aA`!103YtldĠ@@ ܠ.`؀` VĈ [yeZCiPUĈ A\ #` pC W,9C0b` we F0P瀘%Hnrar0a@@ > 185(ՠ1850 j@@ T`A0PT F @ R5 185P 0j@@ `UอA00 d@@ ԠU j@@ U@A0PW F @ `5185 (j@@ @VA0PY 1(7 1(7@(!x@@ '!(t!x0ePuf aIQ0HJ Yvh`Ġ@@ 'bĠ@@ @,A0 `Ġ@@ `,cĠ@@ ,A0 `*uhvaF  6ᆠ`!!1(0(࿆Yx`Ġ@@ ln fAA0 h@a f aĠ@@ ܠ.\1(7 * 1nH f`*VAyႃ% Y{`@@ Mp`Y}(q/1(7 6 )qAU1bp jA_P/6Ĉ A~ cEl0bP b; 752C>#` !;@E#rNc@*QAȇF |"@Av_ QpCfa0Pb/ Ȉ / Ȉ Ca/ Ȉ / Ȉ A!/ Ȉ / Ȉ Cmp\q ,>C0bp j,? T4> P`b:Z2?#`o (f0P,? T44?[2K!1PѠr?@Ec7 ,H T4 H@EC ,AI T4 I@EA p̩Y1p+ 7ܚ (ZV1KWV ;K BJ@E#cw?LD/8DL>q!M#O\d G#>Ӗ>#>ּ>#>F >#>VL>ROa‚H}2 O)2 o*2 _?LD?EG@ 4BD >q!2 ,2 ӷ1/$& >_MKH#8O\#>#& 2>m #>#>b’Hm. >]tBOum O֚0tnOƊ0n #>#~m#>#>6P>6P>v>#>#>m#>#>ׂS42][m#>#>745Ob1P„8ͯQC,O\#>b&p #>m #>@tmQT4#ąL>#p@>b>m #>#>b OyP@>#L>#>R>m` _״# >2H#>?LD/8DL>q!.,O_M#O\#>b#>#x@>#>RfpL>#>} nH3Q,!%L 52ąL>#>#ƋL>#>#>拰 >#>#>m E#~}J7տ8O\d E#>m r >&PD>VPD>㆛PD>gS45KH +D2&YT?"!M1ąL>#>r1nH3Q,!%L 52ąL>#>b!S421,M#O\#>#>r@7՟>q!#DD # 52ąL>#45Ob1P„8ͯQC,O\#>#45Ob1P„8ͯQC,O\#!1r  !H8Xp  !7X@Bк n(A  FA Bm Bm bY! ``! `z D0 Q,  `!@ ~5X@0 b71,`7, @`, !`j  !\2Xp @K !1X@Bp[ b(AT  FA Bb ? Ba bY! `!  `[=N,JFA BB[@~ oY5Ee@Ɓ$4sNI -.@fz[P0܂" .ahE-(@Vn!@NM&  P Ca %j 0F H3o/2F`o?4#2cˮ+> c0FdM"|l^6c)_uX0Fo>8|Z:c򭻦. Ax l2Fpʶ?>!L`9/c;nʯ޾*2F.>?Ȇvc9dʗ}>9hx<2Foi>jjcoe؆z[A9  2F?   cYq[AᆴY 1F`Ӷiz ڽϦ/cm! g0F0n8フ0!/c+˶-0Fk7h8g=/c3~oz(}8y0F㾚ʸnjfi/c銺Kϸc4z0Fo.}n) c̣{~/xڭ2F`9|>>n'/c)fYw%|i0F0(->j^b/c;-zAb߭0FιZq?tjhO|/c$}/Aܧ0FP馪M?g /cx[J.c@#1Q[GoAAoABA? -P   2F  ?z?ڋc~j̲knf 1Fj*Y'Ȍ}}۸cާzx)`0F: ?0ۊ82c%plo[$"3Fp:8x sH1[ :2c~o-| 3Fп:-?cg/cy_A,h(0F0ۡ?芒jc xqfKʠ,0F  HAPon 30F@,꾌iO/cLAz{ςt0FpȮpڵjjjc   ^0F`tڤ_6:9cZZ   A8@ c¬n3_@#0F0w-ڣX"H2~,coI/ɫ2FPnk=2?t~g/c=Mn2j2Fsu??)޷/c(O.#y,y2Fh´p?9=/c $^ 82F:t?l8o/cg컾;n\2F7㲭?.=j/c+*ڰ?2F}-迌1/c jªd 1F싞%HXc,{鈻|͊N0Fp޵,w'Ja/c>2*[(2Fv\>̚'Nb/c,˵9+hp>2Fֺ&.?zmcn? 2F`c?g9c͚9o~2F/^?<ޢ? c: Z[Au+. 
1F@h>(H"#cc6ki <=è=k2Fp]ڼ)y+-cd|JA]1 ߮2Fz635cy %}2F>ÿ릿"ڦ#c5Jnꮏm3k2F乆俌?9*c}JA+!3F}䭎7Hڇ 2c;{A_2Fj[?r  c<%lrX>ޚ="1FP߯ΦȌ1(`!c|[?nӹ= 1Fc; Ȍ}c0FZ#ȐB"Pz~0FPâc>h8h8/cͯoqL;y/2F𖮍{>?#J>/c4~:# )|d2F`zg? `Ϧc  (c[A܏ 1Fꦛ޴ HlJy'/c8\ñ _%K?~2F2>,c?mؒ 2FPX,?9c+2Fp}ޡ?. *cm0/tނ2F@c;a?eYc M<?. 1FP+.}L%Hhzcζ2>pA真0F0?,q v/c-̃YoΒ4'0Fۃ-Yo6/cާ7knA>4m0Fp缩?'w/c>]$L}0FH꿌 /cc`[h< 1Fwڵ(Hxh/cd|wAǵ\2Fpw%>%ac'}*z[2F@¨r-?'hLco|q8.2FOo?dٞicix|;K2F2˞? cͻ^Jchy!3F j9zfHоz 2cr<$^Aֿ2Fmnk?j/c+/"3F@4ڣH"_,}2coˊzA?ί; 3F+=?譓(Hζ/cHj{2F+Joe!c_-8舧zZ0F0$8i|"cͦq.⺝zY0F 19翌:'c~h=!)0Fp(?}Ice#.Ni4k0F -Jg c)\Z2h0F1:cy,A-&; 0F7Ny1<+c,~J)&0F.k%Ht fc &~:7jz2Fpڦe:tk`.cd9q%2Fmͣtﯛ0/c:<y/(c?}2Fk>"l«/c_<,Aܯ뺯2F'm+5?{/c.rOo*zI2F@'.ע?=mrOc$A sm 2Fpn?n4c+ꚢkA{8h 1F0'Hvdޫ/c./gA꫻'2F`i⽾0:c(Nf(2Fw$,8 ;O:c|Hsig~ 2FP<Ψ.c5-o7+dzM2F3忌/9c#ly*m!1F0.5ȌM. u c $"}?0FP,/뿌c ~&- ڸx"3FphH Τί2cƮH骬 3F>-? ߫Lζ/c~j 2@K53aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 3aK0q4 #` ;pC:,C :p:,ð A3bp j~:#` 3bp j:#`3bp j0:#`S##` h4  D3bp j :#`3bp jA!:#`3bp j!:#` 3bp jA":#` `,3bp j AΈ A#:#`$`<3bp j`ANp4:,q5NT0:1bp j3bp jA Έ A  :#`.`3bp jA`Έ A !:#`2`$3bp j@AΈ A ":#`6`43bp jAΈ A#:@2K e0:,eN`0:`3bp jA0Έ A!:#`ba3bp j@ApΈ A":#`fa(3bp jAΈ A#:#`ja83bp jANA A:320J0bP ` 7 c`: ,#` N7ApA@#`& #`6 7  7@*\#8,4Pq@ 7 x#` B+Ĉ (:#`A0bp jPA@2bp jpA3bp jA Έ A*;#`b3bp jA`Έ A+;#`:#`b(3bp jAΈ ,:#`b83bp jPAΈ -:#`R8c ĎTH##`  j}185 gA0Pq }185 aĠ@@ &g!ȝaAA0Db`h $F4T%A Z#42TT0bP `A 7# #`npCpB`0ˀYA \)42`W0bP ` 7) Xe 5p,7bP nzψA A7.  76@*` 1K T&l ,C#``s`3K` e ꆂ185` !rhA0P j185 !aĠ@@ `41nJhe 10q`2xY0h(.,A Tt T4b`0b` fPAdB d'4bp jA0Ј OqB#`= d'4bp jApш A+I2bp jA (4bp j A Ј XB#`d e(4bp j`A`Ј Y_0bP n~B ,#``'e7 b0, T48a0P baPZ pC`Ja0@e1bP n`E 6#``'#`wR񈁁 ؁ ,A T4 `c0Pd,gK ,,CA0bP `, 7 iA ~42#` R7ApAA A 0bP nA.TpK:2 /, T\j 1K 7 n ?GrPpJ2bP n A0l@G0+p b@@ Ԡ6`ΡiA0P V185 h%aĠ@@ ĀEvn\ie8F @jc *78r0PqgB T4bF_ F) R F  f202FL~ \`7*Q@F @ 3`7*Q@F @ 3`7 +Q@F @ 3`7+Q@F @ 3`7,Q@F @ 3`7,Q@F @ 3`@@ Ԁ?1nf* fA0P Lf `q` 185`إy%Ff `2"Y<%ȃ=<`@*A<%؃w Af `1!B؃1'b (b* >nr"f2ՒYPh?0`Ġ@@ @41(7 MtBYP$ B0bP np0bP n0K     ( TF(@0Pq0( 7 qe p pVh0@ TOXDuēVO;b` vp 7d ` A 9ˆA 9C>]c@f La!J"(J`Ġ@@ WJA0P8 aF @ 4`@@ @N *d@@ N Dj@@ N@A0P< AF @ 4185 (Dj@@ `OA0P> C*JF*Z%F @ @5(@hd@@ `Tʬ210;P HA0 JJb@@ U`-!F @ T5ZJJ185hxf T~av@@ V<A0p nj*e@T+YThT`Na@*TT+^ j)t `$ŧYVhŧA0Pp |*A0Pq F @ 5185 zj@@ Ԡ\A0Ps F @ 5185@ @zj@@ ] A0Pu F @ 5185` `zj@@ Ԡ]A0Pw F @ 5185 zj@@ ^ A0Py F @ 5185 zj@@ Ԡ^A0P{ F @ 5Yn*f\ |*F @ 5𩠧F @ 5185 zj@@ _A0P F @ B6185 8zj@@ `dA0P F @ J61850 Xzj@@ dA0P F @ R6185P xzj@@ `eA0P F @ Z6185p zj@@ e *%bb!b@@ Ԁf zj@@ Ԡf@A0P F @ n6185 (zj@@ gA0P F @ v6185 Hzj@@ Ԡg@A0P F @ ~6185 hzj@@ lA0P *d@$F  ~7`d!1(0@φ<YZ`Ġ@@ `|>n fhA0 Ba f paĠ@@ >1(7t*3n fl`*VZlႃ%U Yȅ`Ġ@@ |)R{185 Hm@@ @o F @ 6@F @ 6F @ 6185 j@@ tA0P ®F @ D7 185 8j@@ ԀtBj@@ Ԡt A0P F @ N7185@ `j@@ uA0P F @ V7185` j@@ Ԡu@"A0 ʧS )dĠ@@ `V`؀X`h} VĈ %5nCnPVĈ  #` -8 -P@E# t?Y ՂrMJ`Ġ@@ |n fzA01m!70ewF  8ᆠ`|!1(00߆Yp`Ġ@@ n f!A00n% A0p; A0p: HNYq``90%UP q f aA`!103HcYtdĠ@@ _`؀` V-Ĉ yrCpP1WĈ   #` ypCo,9C0b`  A,: T49 @dt0f athĠ t103 F @ ^8F @ `8~185 m@@ @A0PF @ h8~185 8n@@ `t$185185 m@@ `A0PF @ v8 185 0Xn@@ ԠF  ~5F  5J910;0@@ YuX``*Othut(t!0e`vF  5 F  5(J`910;@F  58F  5@J910;@f aQ[`lf aĠ@@ nL fA0wPo!8=0exF  9P`!!1(1P.vA bF  5A0p rᆠ`a!n j f abx>a.8Yz8xa!Ƚ0e{F  7%ЇA0p aB q:4X^#`w{CrPWĈ + #` "pC@z,>C0b` Ⱊ;,A> T4> @t0f a|h|0aT@Z"@Avw A QF>:_L7A(2bp j `A@@2bp jPAXD2bp jApH2bp jAL2bp j AP2bp jP#`WK:`  A=D,? T4> P`{:`  A>C,? T4B? 
0K M,AH T4"1P ,!1K0︂ $1P+#1KP *@&1PѸi%1ܰ 'p/2DJWAÖ AĹ[@EC+1P*ĈA _ 7V2l%2bp jҁ6bp jA ۈ Ko#`0Dt6bp j A`ۈ Lo#`4HUoQAE nn 7 -AE'A0p A0p YhP8Xb0Zbi Xba f^b&B4e?,":O+10;p 3Y`Ġ@@ }aĠ@@ ܠ}F 7 SX3,L T4DL@01Pq1D ~D A AyAb Ĉ ] ^AuDT@uA _ _ h AhA h hAEI ,KvPB=Ȉ j#Tp`0b` v 1Ĉ k Tb Ĉ Al BWTĈ m AęcаwAÎvB0bP nÁ\ ,MDp4lc,AM T4M031K` %BM-xp  7A |]wwލw{7bp jpA0݈ zw#`ލw{7bp jAp݈ {w#`ލw({7bp jA݈ |w#`ލw8{7bp j0A݈ }wD2K  9AY|]wwލw{7bp jA0݈ w#`ލw{7bp jAp݈ w#`!ލx({7bp j0A݈ w#`%ލx8{7bp jpA݈ wD2K iw,NDPjwQň A|݈ w#`/ލx{7bp jAP݈ w#`3ލx {7bp jPA݈ w#`7ލx0{7bp jA݈ w#`;ލx@{WA$1bP `\ \`0=z@pC02O#`e0e ?A pu4p,Xsp #`BypC2X Dh`0K @XaqA@  Aia0˰d{ A,Ĉ  AyDyP{y7bp jA0ވ A{#`{y7bp jApވ ވ A2{#`z,7bp j Aވ A3{#`z<7bp j`AÈA PhR#Ef #`D@v0.h A0PW `A0PX ͠A0PY A05a,"10qxYh@Ȃ`T^PE A A 7,ZE0bP `} |`0ˀy|`pC`2EZ#`?g0g kA A 7,ZE0bP br4p,[uuUП7,C[E0@hn1P +,[ TZn1h p XytQY{2bP nA0l@\0s+ 185 + 185 @( 185 @1(1`  Y`@@ a2 Yh@`@A\s1Pш\s1b` f@A|pA` A` ` a Aa Aa  I2bp j~}#`1}#`2}#`3}#`4}#`5F#`64bP nA0bP nہP~027ᆀ]0n8T`7ŝ4A0P#  d@@ ! vi@A185`` 8}P@F @ ?`7џ4A0P)  d@@ @ F @ ?88t! fL4A0P,_Yh@J VnL6YH`@@ @AUq f RcP9Yhh/(d5f Xcama5՘%hnvB`Yh|''֘%xnBpґ@\%PyAɠe j^P2 P2A0PP2N2185A!N2185A!N2185A! N2185A!(N2185 A!0N2185(A!8N21850A!@N21858A!HN2185@A!PN2185HA!XN2185PA!`N2185XA!hN2185`A!pN2185hA!xN2185pA!N2185xA!N2185A!N2185A!N2185A!N2185A!N2185A!N2185A!N2Ynh Y٠\%%185A%$A0P$A0P$A0P$A0P$A0P$A0P0$A0P0$A0P1$A0P1$A0P2$A0P2$A0P3$A0P3$A0P4$A0P4$A0P5$A0P5$A0P6$A0P6$A0P7$A0P7$A0P8$ 0%@Aɠe *mP2 P2A0P:P2N2185C!N2185C!N2185C! N2185C!(N2185C!0N2185C!8N2185C!@N2185C!HN2185C!PN2185C!XN2185E!`N2185E!hN2185E!pN2185E!xN2Y1(0I!w w0eȍF  DR`! fr7A0z7nv7YЍ`Ġ@@ Ġ9 vA b7F  d>(A0pL t! fv7}0%U}A7ݸ f zcrCᆀ}0e(F @ PRp*ub@@ @@(`@@ ` F @ xQ@F @ |Q `@@ @`@@ ``@@ `@@ @`@@ ``@@ Ԁ`@@ Ԡ F @ Q F @ Q F @ Q F @ Q F @ Q F @ Q F @ Q F @ Q F @ Q$1(1I|*@@*h dĠ@@ ܀`؀ `\`%A#`($E#`(eA#`(#` n)8 ,oG0b` pē,x T4o A> 7   F  BS`! f#<A0x8nt8Y`Ġ@@ 4`<#1(08M0e(F  RS`! f<A0 a f cĠ@@ gĠ@@ 2t!( f=0%@U`< f cC@ 0ePF  TQZ8%pj8A0p0a/`/qAApCp Ep 0 ,{G0b` 0Af0Kzjc=*ChF hA00^`@@ .F @ RF @ RF @ RF @ R F @ R F @ RPF  QHA0P8~8185K!~8185K!~8185K! ~8185K!(~8185K!08185K!A0pA0p8A0t! f>YhpcA ܣ44t! fC>A0pA0p8A0`Ġ@@ cĠ@@ 8x@@ !%F >*">0% A09n9Y`Ġ@@ Ftt`>#1(0Qa 0eF @ VTSa f cĠ@@ \1(7A!*p@w`?#n 9YXC b@?*>n9Y`@@ `&p`?J|9A0pta"E`(EqA=Asp#`)A#`)#` c*pCP08{3@E#T%Fc0@PAj"@AvA0b` fa74t;A(~90np(h #`&* Ie Ȉ A )瀂2bp jA90Hnt(h #`2* U0bp j0Bqy Za0"!ب@da@E#pCA2Ȉ#`:* t0K0" "H@EC!HE &2K`" %", T4vS@E;!,A T46@EJ!p~3 +"pа,2P+bp{,CL0bP b ,JP #`j*pp0bp jB p0bp jB0p0bp jЦB@p0bp jBPp0bp jB`p0bp jBpp0bp jBpP Q QC #`(8#`(< BKh0"0TҁXK,`Ġ@@ `aĠ@@ aF*EE*E 8\d`^Lnj;0Y`x fdF*h:"10;XI `aFh$1(7XIA1(7PIa(x: 10;ILI42P3@A&#%v5v#` "+ہ#`*o#`*A0bp j`B@uRA2bp jYY#`*Y#`*Y#`*Y#`*Y#`*Y#`*Y%QgR,1bp j@B@ RF##`*Pdb@@ >b@@ @>(b@@ ԀaĠ@@ 1nb>fNA0Y:h:AR98v2Pq 'B T4b 'FĮ#`e,||0bp j`B |0bp jpB0|0bp jB@|0bp jBP|0bp jB`|0bp jBp}0b` fBPA@  @  @  A A AA  !  ԫ  eѢ a0p'x@E݉c' r'``'| a0ː'z@P  #`#+#`&+ %}#``*+, T4J`x2Py,pC2 #`%-~0~ ~'A I  7,I0bP b- ; 1K *#`5+ˆA  P: *p@*'\p2Pq@~"pCA2 #``(,R0 '@' ?b@@ s(185g!@(185(g!@1(1iYR1`@@ !<w`T*TR, T H`*a*& TRP{  A00 t!HA! ? 
BA d@@ v~@A185gaH ,P@F @ Y`74A0P' B d@@ ԀA v@~@A1858iaA0P+,n~PYTI`@@ `"Aa U*US)0p@1`AUV%185xia8YUhUAReU*aU-%pYWhLhUf ^e\eV*Uf beA`t$"f XfA^<(ffV22b@@ Ԁ22F @ zZ2F @ |Z2F @ ~Z2F @ Z 2F @ Z 2F @ Z2F @ Z2F @ Z2F @ Z2F @ Z2F @ Z2F @ Z2F @ Z2F @ Z2F @ Z 2F @ Z"2F @ Z$2F @ Z&2F @ Z(2F @ Z*2F @ Z,2F @ Z.2*d`eZ̃bVl%() *( F @ Z( `@@ @ `@@ ` `@@ @ `@@ ` `@@ Ԁ `@@ Ԡ `@@ !`@@ ௅ !`@@ @!`@@ `!`@@ @!`@@ `!`@@ Ԁ!`@@ Ԡ!`@@ "`@@ അ "`@@ @"`@@ `"`@@ @"`@@ `"`@@ Ԁ"`@@ Ԡ"Lf Tfm<(fnWJ22b@@ @22F @ f[2F @ h[2F @ j[2F @ l[ 2F @ n[ 2F @ p[2F @ r[2F @ t[2F @ v[2F @ x[2F @ z[2F @ |[2F @ ~[2F @ [2*d eF  z\Ga!G fv%WA0QnQY]ѕ`Ġ@@ }|`aWx%1(1q!t.n%A0p7JgĠ@@ өnQY^`(I f |ebW.8Y_8]|e!HI! fN\A02\DRx185o!DR185o!A0P A0P<<185o!<185o!<185o! <185o!(<185q!0<185q!8<185qσA0PσA0PσA0PσA0PσA0PσA0PσA0PσA0PσA0P|F @ \( JR )$1(7i!6 <<84XĈ  J I0Ĉ  J BPĈ  K ˆA  pCВ0.8Ьy0K ..P*"&(&)!A0_RnRYr`Ġ@@ ܅`\%1(0wa/`/0e8r F  ]Ka!K f\A0sRnRYsA`Ġ@@ ׅRh!.8Yu1(7g1(7g@4nDSYtQ`PM f ebL@!].8Yu8se!pM! fإ]A000f 2dĠ@@ ܠ`؀ `\`B#`.v)x՚#`.w)B#`.x)#` .pC0.8P@E# .%o vev1e@@ 185hu/`/A0PWa/A0PWa/A0PXb/A0PXb/A0PYc/A0PY7A00d@@ ԠօF @ l]F @ n]F @ p]F @ r] F @ t] F @ v] hĠ@@ ܠ`Ġ@@ x@@ 쀬!(_nSYx`@^*/^xhxx)4nSYy`Ġ@@ bĠ@@ @x@@ !1(7Xk1(7pk)510;kAf eQL^f LfĠ@@ ``_%1(0h{DᆠD0e{ F  ^@Qa!0Q f%_A0hNTh!.8Y~1(7kF  Zȅ VT`E0eЗ} ``_*V|傃%a_`_%103s!S%dĠ@@ ą`؀` b`\`EB#`)/)C٨0Ĉ   BPĈ A A ˆA  002 #` ), 3P }p0f Bfe_hĠ_ 1!nT } .P `eܭ A a0B n (h #``/ F? Ȉ  CaQ2bp j`ApDT0+ 4S3  >+=+о =+Ծ=+ؾ=+ܾ=+=+ =+$=+(=+,=+0=+4=+8=+<=+@=+T, 7H>+ 6s3AM>+D>+Ĉ  >+=+=+ =+=+=+=+=+ =+$=+(=+,=+Ŀ0=+ȿ4=+̿8=+п<=+T, 7 ,LPfBTBBгˆ  гˆ  гˆ  ѳˆ A Aѳˆ  ѳˆ  ѳˆ ҳˆ AAҳˆ ҳˆ ҳˆ  ӳˆ A Aӳˆ  ӳˆ  ӳBΈA )=+ 7=+,L0bP `` `03;pC2L#` 8а B\p`3bP n`(Ժ0   `03? 7l+, T=#1K63 7p+,M0bP b*T #`>8B #`?8A0bp jC@2bp j  #`c8 #`d8 #`e8 #`f8 #`g8 #`h8 #`i80bp jC0bp jC0bp jC0bp jІC0bp jCШ0bp jCਭ0bp jC0bp jC0bp j C ÈA -P RA#E #`x/D9 P9 +185![!)185![A(185![A1(1nVfflA0wf f G`Tp^PE% %0bP `P A `0P6dpC2#`)90 hc6A J+ 7+,ڜM0bP `В  `0ˀ6i\ ; 1K6#`;/<#`:/@ A `0ˠ6k 7+, Ti#1K66 7+,C۸M0b` f,AT 2)@2) Wb@@ wWb@@ wW(b@@ `waĠ@@ '1nWfnA0ʀ'YhA܆|R8q3Pq`6 T4b6܌#`80bp jPC 0bp j`C00bp jpC@0bp jCP0bp jC`0bp jCp0b` fC@B0ˆ A?0ˆ ?0ˆ ?1ˆ HA1ˆ AHˆ H p e A a0 7s@Eɍ7 "7`7 A a0@7u@ľPԾ #`/#`/ %ԯ#``/,A T4 `s3Pt,pC2#`90 y7A mo, 7m,,CM0bP b- ; 1K7#`/ˆA  P :ȱ {7p@*c7\p}3Pq@y#pCB2#``8 S Ƃ )@) Xb@@ .s>185!8cA?185!@cA1(18ρ4Y `@@ w0Oa t*t!, T>Y::& r[Pa! A00w t!e! X`c! B1 d@@ 4v@A185PaH ,g,P@F @ Zs`7N 4A0P' B d@@ `6A v@@A185aA0P+nYY!`@@ @7"+gat*at)0cAy`tL'185acYhAHgt*t-%HYhȲ@tf TgRg`u*߱Auf Xg1!VgALlY fZugAYh1XgG@Y f^v2dv*u 1(7hMF  pH4e jMHY#ea 0@EĎ/@a;; TLd; K f00; 7&-,NP* XDu#``8 pʂ2Ԏ#`8#`8 , W#``8)`.`v*v 8h`Ġ@@ OlZb@@ `E@ia@@ ԀE F @ Zt@*@F @ bte!ea@@ @F@ea@@ `F`ea@@ ԀFea@@ ԠFea@@ Fea@@ Fe$ʗ@ʬ%F @ vt(@hd@@ G*G qY10;0arJb@@ `LZb@@ ԠL+* b@@ MCf tZhkaG @ t!1(7pA.`wr'8Zb w*wۙ%Н!w*)JZ%Yn|Y靠% 185!A0P>A0P?A0P?A0PPA0PPA0PQA0PQA0PRA0PRA0PSA0PSA0PTA0PTA0PUA0PUA0PVA0PV A0PW A0PW A0PX A0PX A0PY 0%ge *fY YA0P[YY185!Y185!Y185! Y185!(Y185!0Y185!8Y185!@Y185!HY185!PY185!XY185!`Y185!hY185 !pY185(!xY1850!Y1858!Y185@!Y185H!Y185P!YYn@|Y & 185x!A0PxA0PxA0PyA0PyA0PzA0PzA0P{A0P{A0P|A0P|A0P}A0P}A0P~A0P~ %A0f\nf\Y`Ġ@@ o`|'1(0ƅᆠ0e F @ vqa f gĠ@@ ܀51(7Xt*qA`|'n \YX;.G0 b}*|n\YJ`Ġ@@ @tש^'F @ hvF @ jv`@@ f *d@@ gF @ rvF @ tvF @ vvF @ xvF @ zvF @ |vF @ ~viia@@ l ia@@ l@ia@@ @l`ia@@ `lia@@ Ԁlia@@ Ԡlia@@ lia@@ lia@@ m@"A0ʧ1FsAF  t   V|.1bp jCpBR~. 
triton-2.0.0/python/triton/tools/000077500000000000000000000000001440023377100170315ustar00rootroot00000000000000triton-2.0.0/python/triton/tools/__init__.py000066400000000000000000000000001440023377100211300ustar00rootroot00000000000000triton-2.0.0/python/triton/tools/aot.py000066400000000000000000000041551440023377100201730ustar00rootroot00000000000000import argparse
import sys

import triton
import triton._C.libtriton.triton as libtriton

if __name__ == '__main__':

    # valid source and target formats
    VALID_FORMATS = ['triton-ir', 'triton-gpu-ir', 'llvm-ir', 'ptx']

    # set up the argument parser
    # TODO: conditional requirements
    parser = argparse.ArgumentParser()
    parser.add_argument('src', help="Source file to compile")
    parser.add_argument('--target', required=True, help="Target format, one of: " + ', '.join(VALID_FORMATS))
    parser.add_argument('--sm', type=int, help="Compute capability to compile for")
    parser.add_argument('--ptx-version', type=int, help="PTX version to compile for")

    # parse the args
    args = parser.parse_args()

    # TODO: clean-up and re-use triton.compiler primitive functions
    # check for validity of format arguments
    if args.target not in VALID_FORMATS:
        print("Invalid target format: " + args.target)
        sys.exit(0)

    # parse source file to MLIR module
    context = libtriton.ir.context()
    module = libtriton.ir.parse_mlir_module(args.src, context)
    module.context = context

    # optimizer triton-ir
    module = triton.compiler.optimize_triton_ir(module)
    if args.target == 'triton-ir':
        print(module.str())
        sys.exit(0)

    if not args.sm:
        raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation")

    # triton-ir -> triton-gpu-ir
    module = triton.compiler.ttir_to_ttgir(module, num_warps=4, num_stages=3, compute_capability=args.sm)
    if args.target == 'triton-gpu-ir':
        print(module.str())
        sys.exit(0)

    # triton-gpu-ir -> llvm-ir
    module = triton.compiler.ttgir_to_llir(module, extern_libs=None, compute_capability=args.sm)
    if args.target == 'llvm-ir':
        print(module)
        sys.exit(0)

    if not args.ptx_version:
        raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation")

    # llvm-ir -> ptx
    module = triton.compiler.llir_to_ptx(module, compute_capability=args.sm, ptx_version=args.ptx_version)
    assert args.target == 'ptx'
    print(module)
triton-2.0.0/python/triton/tools/build_extern.py000066400000000000000000000333401440023377100220720ustar00rootroot00000000000000import argparse
import subprocess
from abc import ABC, abstractmethod
from typing import Dict, List, Optional


class Symbol:
    _name: str
    _op_name: str
    _ret_type: str
    _arg_names: List[str]
    _arg_types: List[str]

    def __init__(
        self,
        name: str,
        op_name: str,
        ret_type: str,
        arg_names: List[str],
        arg_types: List[str],
    ) -> None:
        '''
        A symbol is a function declaration.
:param name: name of the symbol :param op_name: name of the operation :param ret_type: return type of the operation :param arg_names: names of the arguments :param arg_types: types of the arguments ''' self._name = name self._op_name = op_name self._ret_type = ret_type self._arg_names = list(arg_names) self._arg_types = list(arg_types) @property def name(self) -> str: return self._name @property def op_name(self) -> str: return self._op_name @property def ret_type(self) -> str: return self._ret_type @property def arg_names(self) -> List[str]: return self._arg_names @property def arg_types(self) -> List[str]: return self._arg_types def convert_type(type_str) -> Optional[str]: if type_str == "i32": return "int32" elif type_str == "u32": return "uint32" elif type_str == "i64": return "int64" elif type_str == "u64": return "uint64" elif type_str == "float": return "fp32" elif type_str == "double": return "fp64" else: # ignore other types, such as pointer types return None def to_unsigned(type_str) -> str: if type_str == "int32": return "uint32" elif type_str == "int64": return "uint64" else: return type_str class ExternLibrary(ABC): _name: str _path: str _symbols: Dict[str, Symbol] _format: bool _grouping: bool def __init__( self, name: str, path: str, format: bool = True, grouping: bool = True, ) -> None: ''' Abstract class for extern library. :param name: name of the library :param path: path of the library :param format: whether to format the generated stub file ''' self._name = name self._path = path self._symbols = {} self._format = format self._grouping = grouping @property def name(self) -> str: return self._name @property def path(self) -> str: return self._path @property def symbols(self) -> Dict[str, Symbol]: return self._symbols @property def grouping(self) -> bool: return self._grouping @abstractmethod def parse_symbols(self, input_file) -> None: pass @abstractmethod def _output_stubs(self) -> str: pass def generate_stub_file(self, output_dir) -> None: file_str = self._output_stubs() if file_str is None or len(file_str) == 0: raise Exception("file_str is empty") output_file = f"{output_dir}/{self._name}.py" with open(output_file, "w") as f: f.write(file_str) f.close() if self._format: subprocess.Popen(["autopep8", "-a", "-r", "-i", output_file], stdout=subprocess.PIPE).communicate() subprocess.Popen(["isort", output_file], stdout=subprocess.PIPE).communicate() class Libdevice(ExternLibrary): _symbol_groups: Dict[str, List[Symbol]] def __init__(self, path) -> None: ''' Constructor for Libdevice. 
:param path: path of the libdevice library ''' super().__init__("libdevice", path) self._symbol_groups = {} @staticmethod def _extract_symbol(line) -> Optional[Symbol]: # Extract symbols from line in the following format: # "define [internal] @(,)" entries = line.split("@") ret_str = entries[0] func_str = entries[1] # Get ret_type, skip internal symbols ret_strs = ret_str.split() if ret_strs[1] == "internal": return None ret_type = convert_type(ret_strs[1]) if ret_type is None: return None # Get function name func_strs = func_str.split("(") func_name = func_strs[0].replace("@", "") op_name = func_name.replace("__nv_", "") if 'ieee' in op_name: return None # Get arg_types arg_strs = func_strs[1].split(",") arg_types = [] arg_names = [] for i, arg_str in enumerate(arg_strs): arg_type = convert_type(arg_str.split()[0]) if arg_type is None: return None arg_name = 'arg' + str(i) arg_types.append(arg_type) arg_names.append(arg_name) if op_name == "sad": # Special case for sad, where the last argument is an unsigned int arg_types[-1] = to_unsigned(arg_types[-1]) elif op_name.startswith("u"): # LLVM does not differentiate between signed and unsigned integer type. # We have to convert the types to unsigned ret_type = to_unsigned(ret_type) for i, arg_type in enumerate(arg_types): arg_types[i] = to_unsigned(arg_type) return Symbol(func_name, op_name, ret_type, arg_names, arg_types) def _group_symbols(self) -> None: symbol_set = {} for symbol in self._symbols.values(): op_name = symbol.op_name symbol_set[op_name] = symbol # Group functions together by renaming. renaming = { 'llabs': 'abs', 'acosf': 'acos', 'acoshf': 'acosh', 'dadd_rd': 'add_rd', 'fadd_rd': 'add_rd', 'dadd_rn': 'add_rn', 'fadd_rn': 'add_rn', 'dadd_ru': 'add_ru', 'fadd_ru': 'add_ru', 'dadd_rz': 'add_rz', 'fadd_rz': 'add_rz', 'asinf': 'asin', 'asinhf': 'asinh', 'atanf': 'atan', 'atan2f': 'atan2', 'atanhf': 'atanh', 'brevll': 'brev', 'cbrtf': 'cbrt', 'ceilf': 'ceil', 'clzll': 'clz', 'copysignf': 'copysign', 'cosf': 'cos', 'coshf': 'cosh', 'cospif': 'cospi', 'cyl_bessel_i0f': 'cyl_bessel_i0', 'cyl_bessel_i1f': 'cyl_bessel_i1', 'fdiv_rd': 'div_rd', 'ddiv_rd': 'div_rd', 'fdiv_rn': 'div_rn', 'ddiv_rn': 'div_rn', 'fdiv_ru': 'div_ru', 'ddiv_ru': 'div_ru', 'fdiv_rz': 'div_rz', 'ddiv_rz': 'div_rz', 'erff': 'erf', 'erfcf': 'erfc', 'erfcinvf': 'erfcinv', 'erfcxf': 'erfcx', 'erfinvf': 'erfinv', 'expf': 'exp', 'exp10f': 'exp10', 'exp2f': 'exp2', 'expm1f': 'expm1', 'fabsf': 'abs', 'fabs': 'abs', 'fast_fdividef': 'fast_dividef', 'fdimf': 'fdim', 'ffsll': 'ffs', 'floorf': 'floor', 'fmaf': 'fma', 'fmaf_rd': 'fma_rd', 'fmaf_rn': 'fma_rn', 'fmaf_ru': 'fma_ru', 'fmaf_rz': 'fma_rz', 'fmodf': 'fmod', 'uhadd': 'hadd', 'hypotf': 'hypot', 'ilogbf': 'ilogb', 'isinff': 'isinf', 'isinfd': 'isinf', 'isnanf': 'isnan', 'isnand': 'isnan', 'j0f': 'j0', 'j1f': 'j1', 'jnf': 'jn', 'ldexpf': 'ldexp', 'lgammaf': 'lgamma', 'llrintf': 'llrint', 'llroundf': 'llround', 'logf': 'log', 'log10f': 'log10', 'log1pf': 'log1p', 'log2f': 'log2', 'logbf': 'logb', 'umax': 'max', 'llmax': 'max', 'ullmax': 'max', 'fmaxf': 'max', 'fmax': 'max', 'umin': 'min', 'llmin': 'min', 'ullmin': 'min', 'fminf': 'min', 'fmin': 'min', 'dmul_rd': 'mul_rd', 'fmul_rd': 'mul_rd', 'dmul_rn': 'mul_rn', 'fmul_rn': 'mul_rn', 'dmul_ru': 'mul_ru', 'fmul_ru': 'mul_ru', 'dmul_rz': 'mul_rz', 'fmul_rz': 'mul_rz', 'umul24': 'mul24', 'umulhi': 'mulhi', 'mul64hi': 'mulhi', 'umul64hi': 'mulhi', 'nearbyintf': 'nearbyint', 'nextafterf': 'nextafter', 'norm3df': 'norm3d', 'norm4df': 'norm4d', 'normcdff': 'normcdf', 
'normcdfinvf': 'normcdfinv', 'popcll': 'popc', 'powif': 'pow', 'powi': 'pow', 'powf': 'pow', 'rcbrtf': 'rcbrt', 'frcp_rd': 'rcp_rd', 'drcp_rd': 'rcp_rd', 'frcp_rn': 'rcp_rn', 'drcp_rn': 'rcp_rn', 'frcp_ru': 'rcp_ru', 'drcp_ru': 'rcp_ru', 'frcp_rz': 'rcp_rz', 'drcp_rz': 'rcp_rz', 'remainderf': 'remainder', 'urhadd': 'rhadd', 'rhypotf': 'rhypot', 'rintf': 'rint', 'rnorm3df': 'rnorm3d', 'rnorm4df': 'rnorm4d', 'roundf': 'round', 'rsqrtf': 'rsqrt', 'frsqrt_rn': 'rsqrt_rn', 'usad': 'sad', 'scalbnf': 'scalbn', 'signbitf': 'signbit', 'signbitd': 'signbit', 'sinf': 'sin', 'sinhf': 'sinh', 'sinpif': 'sinpi', 'sqrtf': 'sqrt', 'fsqrt_rd': 'sqrt_rd', 'dsqrt_rd': 'sqrt_rd', 'fsqrt_rn': 'sqrt_rn', 'dsqrt_rn': 'sqrt_rn', 'fsqrt_ru': 'sqrt_ru', 'dsqrt_ru': 'sqrt_ru', 'fsqrt_rz': 'sqrt_rz', 'dsqrt_rz': 'sqrt_rz', 'fsub_rd': 'sub_rd', 'dsub_rd': 'sub_rd', 'fsub_rn': 'sub_rn', 'dsub_rn': 'sub_rn', 'fsub_ru': 'sub_ru', 'dsub_ru': 'sub_ru', 'fsub_rz': 'sub_rz', 'dsub_rz': 'sub_rz', 'tanf': 'tan', 'tanhf': 'tanh', 'tgammaf': 'tgamma', 'truncf': 'trunc', 'y0f': 'y0', 'y1f': 'y1', 'ynf': 'yn' } for symbol in self._symbols.values(): op_name = symbol.op_name if op_name in renaming: op_name = renaming[op_name] symbol._op_name = op_name if op_name in self._symbol_groups: self._symbol_groups[op_name].append(symbol) else: self._symbol_groups[op_name] = [symbol] def parse_symbols(self, input_file) -> None: if len(self.symbols) > 0: return output = subprocess.check_output(["grep", "define", input_file]).decode().splitlines() for line in output: symbol = self._extract_symbol(line) if symbol is None: continue self._symbols[symbol.name] = symbol self._group_symbols() def _output_stubs(self) -> str: # Generate python functions in the following format: # @extern.extern # def (, _builder=None): # arg_type_symbol_dict = {[arg_type]: {(symbol, ret_type)}} # return extern.dispatch("libdevice", , , , _builder) import_str = "from . import core, extern\n" import_str += "import os\n" header_str = "LIBDEVICE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), \"..\", \"third_party\", \"cuda\", \"lib\", \"libdevice.10.bc\")" func_str = "" for symbols in self._symbol_groups.values(): func_str += "@extern.extern\n" func_name_str = f"def {symbols[0].op_name}(" for arg_name in symbols[0].arg_names: func_name_str += f"{arg_name}, " func_name_str += "_builder=None):\n" return_str = f"\treturn extern.elementwise(\"{self._name}\", LIBDEVICE_PATH, [" for arg_name in symbols[0].arg_names: return_str += f"{arg_name}, " return_str += "], \n" arg_type_symbol_dict_str = "{" for symbol in symbols: arg_type_symbol_dict_str += "(" for arg_type in symbol.arg_types: arg_type_symbol_dict_str += f'core.dtype("{arg_type}"),' ret_type = f'core.dtype("{symbol.ret_type}")' arg_type_symbol_dict_str += "): (\"" + symbol.name + "\", " + ret_type + "),\n" arg_type_symbol_dict_str += "}" return_str += arg_type_symbol_dict_str return_str += ", _builder)\n" func_str += func_name_str + return_str + "\n" file_str = import_str + header_str + func_str return file_str class LLVMDisassembler: _path: str _ll_file: str def __init__(self, path) -> None: ''' Invoke llvm-dis to disassemble the given file. 
:param path: path to llvm-dis ''' self._path = path self._ll_file = "/tmp/extern_lib.ll" def disasm(self, lib_path: str) -> None: subprocess.Popen([self._path, lib_path, "-o", self.ll_file], stdout=subprocess.PIPE).communicate() @property def ll_file(self) -> str: return self._ll_file @property def path(self) -> str: return self._path extern_libs = ["libdevice"] def build( llvm_dis_path: str, lib_path: str, lib_name: str, output_dir: str, ) -> None: ''' Interface function to build the library file. :param llvm_dis_path: path to the llvm-dis binary :param lib_path: path to the external library file :param lib_name: name of the library :param output_dir: path to the output directory ''' if lib_name == "libdevice": extern_lib = Libdevice(lib_path) else: raise Exception(f"Unknown extern library: {lib_name}") llvm_disassembler = LLVMDisassembler(llvm_dis_path) llvm_disassembler.disasm(lib_path) extern_lib.parse_symbols(llvm_disassembler.ll_file) extern_lib.generate_stub_file(output_dir) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--llvm-dis", dest="llvm_dis_path", help="Path to llvm-dis", default="llvm-dis") parser.add_argument("--lib-path", dest="lib_path", help="Path to the extern library") parser.add_argument("--lib-name", dest="lib_name", help="Name of the extern library") parser.add_argument("--output", dest="output_dir", help="Output file path", default="/tmp/") args = parser.parse_args() build(args.llvm_dis_path, args.lib_path, args.lib_name, args.output_dir) triton-2.0.0/python/triton/tools/disasm.py000066400000000000000000000107611440023377100206700ustar00rootroot00000000000000# MIT License # Copyright (c) 2020 Da Yan @ HKUST # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import re import subprocess FLINE_RE = re.compile(r'\s*/\*\w{4}\*/\s*([^;]*;)\s*/\* 0x(\w{16}) \*/\s*') SLINE_RE = re.compile(r'\s*/\* 0x(\w{16}) \*/\s*') FNAME_RE = re.compile(r'\s*Function : (\w+)\s*') BRA_RE = re.compile(r'(.*BRA(?:\.U)? 
)(0x\w+);') def parseCtrl(sline): enc = int(SLINE_RE.match(sline).group(1), 16) stall = (enc >> 41) & 0xf yld = (enc >> 45) & 0x1 wrtdb = (enc >> 46) & 0x7 readb = (enc >> 49) & 0x7 watdb = (enc >> 52) & 0x3f yld_str = 'Y' if yld == 0 else '-' wrtdb_str = '-' if wrtdb == 7 else str(wrtdb) readb_str = '-' if readb == 7 else str(readb) watdb_str = '--' if watdb == 0 else f'{watdb:02d}' return f'{watdb_str}:{readb_str}:{wrtdb_str}:{yld_str}:{stall:x}' def processSassLines(fline, sline, labels): asm = FLINE_RE.match(fline).group(1) # Remove tailing space if asm.endswith(" ;"): asm = asm[:-2] + ";" ctrl = parseCtrl(sline) # BRA target address if BRA_RE.match(asm) is not None: target = int(BRA_RE.match(asm).group(2), 16) if target in labels: pass else: labels[target] = len(labels) return (f'{ctrl}', f'{asm}') def extract(file_path, fun): if fun is None: sass_str = subprocess.check_output(["cuobjdump", "-sass", file_path]) else: sass_str = subprocess.check_output(["cuobjdump", "-fun", fun, "-sass", file_path]) sass_lines = sass_str.splitlines() line_idx = 0 while line_idx < len(sass_lines): line = sass_lines[line_idx].decode() # format: # function : # .headerflags: ... # /*0000*/ asmstr /*0x...*/ # /*0x...*/ # Looking for new function header (function: ) while FNAME_RE.match(line) is None: line_idx += 1 if line_idx < len(sass_lines): line = sass_lines[line_idx].decode() else: return fname = FNAME_RE.match(line).group(1) ret = '' ret += f'Function:{fname}\n' line_idx += 2 # bypass .headerflags line = sass_lines[line_idx].decode() # Remapping address to label labels = {} # address -> label_idx # store sass asm in buffer and them print them (for labels) # (ctrl, asm) asm_buffer = [] while FLINE_RE.match(line) is not None: # First line (Offset ASM Encoding) fline = sass_lines[line_idx].decode() line_idx += 1 # Second line (Encoding) sline = sass_lines[line_idx].decode() line_idx += 1 asm_buffer.append(processSassLines(fline, sline, labels)) # peek the next line line = sass_lines[line_idx].decode() # Print sass # label naming convention: LBB#i for idx, (ctrl, asm) in enumerate(asm_buffer): # Print label if this is BRA target offset = idx * 16 if offset in labels: label_name = f'LBB{labels[offset]}' ret += f'{label_name}:\n' ret += ctrl + '\t' # if this is BRA, remap offset to label if BRA_RE.match(asm): target = int(BRA_RE.match(asm).group(2), 16) target_name = f'LBB{labels[target]}' asm = BRA_RE.sub(rf'\1{target_name};', asm) ret += asm + '\n' ret += '\n' return ret triton-2.0.0/python/triton/utils.py000066400000000000000000000031731440023377100174070ustar00rootroot00000000000000from __future__ import annotations import torch def cdiv(x, y): return (x + y - 1) // y def next_power_of_2(n): """Return the smallest power of 2 greater than or equal to n""" n -= 1 n |= n >> 1 n |= n >> 2 n |= n >> 4 n |= n >> 8 n |= n >> 16 n += 1 return n class MockTensor: """ Can be used in place of real tensors when calling: kernel.warmup(MockTensor(torch.float32), ...) 
""" @staticmethod def wrap_dtype(arg): if isinstance(arg, torch.dtype): return MockTensor(arg) return arg def __init__(self, dtype): self.dtype = dtype @staticmethod def data_ptr(): return 0 # optimistically assumes multiple of 16 class TensorWrapper: def __init__(self, base, dtype): self.dtype = dtype self.base = base self.is_cuda = base.is_cuda self.device = base.device def data_ptr(self): return self.base.data_ptr() def __str__(self) -> str: return f'TensorWrapper[{self.dtype}]({self.base})' def reinterpret(tensor, dtype): if isinstance(tensor, TensorWrapper): if dtype == tensor.base.dtype: # Reinterpreting to the original interpretation; return the base. return tensor.base else: # Reinterpreting a wrapped tensor to a different type. return TensorWrapper(tensor.base, dtype) elif isinstance(tensor, torch.Tensor): # A new wrapper is needed around an unwrapped tensor. return TensorWrapper(tensor, dtype) else: raise TypeError(f'Cannot reinterpret a {type(tensor)}.') triton-2.0.0/python/tutorials/000077500000000000000000000000001440023377100164005ustar00rootroot00000000000000triton-2.0.0/python/tutorials/01-vector-add.py000066400000000000000000000124451440023377100212260ustar00rootroot00000000000000""" Vector Addition ================= In this tutorial, you will write a simple vector addition using Triton and learn about: - The basic programming model of Triton. - The `triton.jit` decorator, which is used to define Triton kernels. - The best practices for validating and benchmarking your custom ops against native reference implementations. """ # %% # Compute Kernel # -------------------------- import torch import triton import triton.language as tl @triton.jit def add_kernel( x_ptr, # *Pointer* to first input vector. y_ptr, # *Pointer* to second input vector. output_ptr, # *Pointer* to output vector. n_elements, # Size of the vector. BLOCK_SIZE: tl.constexpr, # Number of elements each program should process. # NOTE: `constexpr` so it can be used as a shape value. ): # There are multiple 'programs' processing different data. We identify which program # we are here: pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0. # This program will process inputs that are offset from the initial data. # For instance, if you had a vector of length 256 and block_size of 64, the programs # would each access the elements [0:64, 64:128, 128:192, 192:256]. # Note that offsets is a list of pointers: block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) # Create a mask to guard memory operations against out-of-bounds accesses. mask = offsets < n_elements # Load x and y from DRAM, masking out any extra elements in case the input is not a # multiple of the block size. x = tl.load(x_ptr + offsets, mask=mask) y = tl.load(y_ptr + offsets, mask=mask) output = x + y # Write x + y back to DRAM. tl.store(output_ptr + offsets, output, mask=mask) # %% # Let's also declare a helper function to (1) allocate the `z` tensor # and (2) enqueue the above kernel with appropriate grid/block sizes: def add(x: torch.Tensor, y: torch.Tensor): # We need to preallocate the output. output = torch.empty_like(x) assert x.is_cuda and y.is_cuda and output.is_cuda n_elements = output.numel() # The SPMD launch grid denotes the number of kernel instances that run in parallel. # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]. 
# In this case, we use a 1D grid where the size is the number of blocks: grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) # NOTE: # - Each torch.tensor object is implicitly converted into a pointer to its first element. # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel. # - Don't forget to pass meta-parameters as keywords arguments. add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still # running asynchronously at this point. return output # %% # We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness: torch.manual_seed(0) size = 98432 x = torch.rand(size, device='cuda') y = torch.rand(size, device='cuda') output_torch = x + y output_triton = add(x, y) print(output_torch) print(output_triton) print( f'The maximum difference between torch and triton is ' f'{torch.max(torch.abs(output_torch - output_triton))}' ) # %% # Seems like we're good to go! # %% # Benchmark # ----------- # We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch. # To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops. # for different problem sizes. @triton.testing.perf_report( triton.testing.Benchmark( x_names=['size'], # Argument names to use as an x-axis for the plot. x_vals=[ 2 ** i for i in range(12, 28, 1) ], # Different possible values for `x_name`. x_log=True, # x axis is logarithmic. line_arg='provider', # Argument name whose value corresponds to a different line in the plot. line_vals=['triton', 'torch'], # Possible values for `line_arg`. line_names=['Triton', 'Torch'], # Label name for the lines. styles=[('blue', '-'), ('green', '-')], # Line styles. ylabel='GB/s', # Label name for the y-axis. plot_name='vector-add-performance', # Name for the plot. Used also as a file name for saving the plot. args={}, # Values for function arguments not in `x_names` and `y_name`. ) ) def benchmark(size, provider): x = torch.rand(size, device='cuda', dtype=torch.float32) y = torch.rand(size, device='cuda', dtype=torch.float32) if provider == 'torch': ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y) if provider == 'triton': ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y)) gbps = lambda ms: 12 * size / ms * 1e-6 return gbps(ms), gbps(max_ms), gbps(min_ms) # %% # We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or # `save_path='/path/to/results/' to save them to disk along with raw CSV data: benchmark.run(print_data=True, show_plots=True) triton-2.0.0/python/tutorials/02-fused-softmax.py000066400000000000000000000165451440023377100217710ustar00rootroot00000000000000""" Fused Softmax ================= In this tutorial, you will write a fused softmax operation that is significantly faster than PyTorch's native op for a particular class of matrices: those whose rows can fit in the GPU's SRAM. You will learn about: - The benefits of kernel fusion for bandwidth-bound operations. - Reduction operators in Triton. """ # %% # Motivations # ------------ # Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice. 
# Let us consider instead the case of a simple (numerically stabilized) softmax operation: import torch import triton import triton.language as tl @torch.jit.script def naive_softmax(x): """Compute row-wise softmax of X using native pytorch We subtract the maximum element in order to avoid overflows. Softmax is invariant to this shift. """ # read MN elements ; write M elements x_max = x.max(dim=1)[0] # read MN + M elements ; write MN elements z = x - x_max[:, None] # read MN elements ; write MN elements numerator = torch.exp(z) # read MN elements ; write M elements denominator = numerator.sum(dim=1) # read MN + M elements ; write MN elements ret = numerator / denominator[:, None] # in total: read 5MN + 2M elements ; wrote 3MN + 2M elements return ret # %% # When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}` # requires reading :math:`5MN + 2M` elements from DRAM and writing back :math:`3MN + 2M` elements. # This is obviously wasteful; we'd prefer to have a custom "fused" kernel that only reads # X once and does all the necessary computations on-chip. # Doing so would require reading and writing back only :math:`MN` bytes, so we could # expect a theoretical speed-up of ~4x (i.e., :math:`(8MN + 4M) / 2MN`). # The `torch.jit.script` flags aims to perform this kind of "kernel fusion" automatically # but, as we will see later, it is still far from ideal. # %% # Compute Kernel # ---------------- # Our softmax kernel works as follows: each program loads a row of the input matrix X, # normalizes it and writes back the result to the output Y. # Note that one important limitation of Triton is that each block must have a # power-of-two number of elements, so we need to internally "pad" each row and guard the # memory operations properly if we want to handle any possible input shapes: @triton.jit def softmax_kernel( output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, BLOCK_SIZE: tl.constexpr ): # The rows of the softmax are independent, so we parallelize across those row_idx = tl.program_id(0) # The stride represents how much we need to increase the pointer to advance 1 row row_start_ptr = input_ptr + row_idx * input_row_stride # The block size is the next power of two greater than n_cols, so we can fit each # row in a single block col_offsets = tl.arange(0, BLOCK_SIZE) input_ptrs = row_start_ptr + col_offsets # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf')) # Subtract maximum for numerical stability row_minus_max = row - tl.max(row, axis=0) # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA) numerator = tl.exp(row_minus_max) denominator = tl.sum(numerator, axis=0) softmax_output = numerator / denominator # Write back output to DRAM output_row_start_ptr = output_ptr + row_idx * output_row_stride output_ptrs = output_row_start_ptr + col_offsets tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols) # %% # We can create a helper function that enqueues the kernel and its (meta-)arguments for any given input tensor. def softmax(x): n_rows, n_cols = x.shape # The block size is the smallest power of two greater than the number of columns in `x` BLOCK_SIZE = triton.next_power_of_2(n_cols) # Another trick we can use is to ask the compiler to use more threads per row by # increasing the number of warps (`num_warps`) over which each row is distributed. 
# You will see in the next tutorial how to auto-tune this value in a more natural # way so you don't have to come up with manual heuristics yourself. num_warps = 4 if BLOCK_SIZE >= 2048: num_warps = 8 if BLOCK_SIZE >= 4096: num_warps = 16 # Allocate output y = torch.empty_like(x) # Enqueue kernel. The 1D launch grid is simple: we have one kernel instance per row o # f the input matrix softmax_kernel[(n_rows,)]( y, x, x.stride(0), y.stride(0), n_cols, num_warps=num_warps, BLOCK_SIZE=BLOCK_SIZE, ) return y # %% # Unit Test # ---------- # %% # We make sure that we test our kernel on a matrix with an irregular number of rows and columns. # This will allow us to verify that our padding mechanism works. torch.manual_seed(0) x = torch.randn(1823, 781, device='cuda') y_triton = softmax(x) y_torch = torch.softmax(x, axis=1) assert torch.allclose(y_triton, y_torch), (y_triton, y_torch) # %% # As expected, the results are identical. # %% # Benchmark # ------------- # Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows. # We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above. @triton.testing.perf_report( triton.testing.Benchmark( x_names=['N'], # argument names to use as an x-axis for the plot x_vals=[ 128 * i for i in range(2, 100) ], # different possible values for `x_name` line_arg='provider', # argument name whose value corresponds to a different line in the plot line_vals=[ 'triton', 'torch-native', 'torch-jit', ], # possible values for `line_arg`` line_names=[ "Triton", "Torch (native)", "Torch (jit)", ], # label name for the lines styles=[('blue', '-'), ('green', '-'), ('green', '--')], # line styles ylabel="GB/s", # label name for the y-axis plot_name="softmax-performance", # name for the plot. Used also as a file name for saving the plot. args={'M': 4096}, # values for function arguments not in `x_names` and `y_name` ) ) def benchmark(M, N, provider): x = torch.randn(M, N, device='cuda', dtype=torch.float32) if provider == 'torch-native': ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1)) if provider == 'triton': ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x)) if provider == 'torch-jit': ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x)) gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3) return gbps(ms), gbps(max_ms), gbps(min_ms) benchmark.run(show_plots=True, print_data=True) # %% # In the above plot, we can see that: # # - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here. # - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**. # Note however that the PyTorch `softmax` operation is more general and will work on tensors of any shape. triton-2.0.0/python/tutorials/03-matrix-multiplication.py000066400000000000000000000323031440023377100235320ustar00rootroot00000000000000""" Matrix Multiplication ====================== In this tutorial, you will write a 25-lines high-performance FP16 matrix multiplication kernel that achieves performance on par with cuBLAS. 
You will specifically learn about: - Block-level matrix multiplications - Multi-dimensional pointer arithmetic - Program re-ordering for improved L2 cache hit rate - Automatic performance tuning """ # %% # Motivations # ------------- # Matrix multiplications are a key building block of most modern high-performance computing systems. # They are notoriously hard to optimize, hence their implementation is generally done by # hardware vendors themselves as part of so-called "kernel libraries" (e.g., cuBLAS). # Unfortunately, these libraries are often proprietary and cannot be easily customized # to accommodate the needs of modern deep learning workloads (e.g., fused activation functions). # In this tutorial, you will learn how to implement efficient matrix multiplications by # yourself with Triton, in a way that is easy to customize and extend. # # Roughly speaking, the kernel that we will write will implement the following blocked # algorithm to multiply a (M, K) by a (K, N) matrix: # # .. code-block:: python # # # do in parallel # for m in range(0, M, BLOCK_SIZE_M): # # do in parallel # for n in range(0, N, BLOCK_SIZE_N): # acc = zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=float32) # for k in range(0, K, BLOCK_SIZE_K): # a = A[m : m+BLOCK_SIZE_M, k : k+BLOCK_SIZE_K] # b = B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N] # acc += dot(a, b) # C[m : m+BLOCK_SIZE_M, n : n+BLOCK_SIZE_N] = acc; # # where each iteration of the doubly-nested for-loop is performed by a dedicated Triton program instance. # %% # Compute Kernel # ---------------- # # The above algorithm is, actually, fairly straightforward to implement in Triton. # The main difficulty comes from the computation of the memory locations at which blocks # of :code:`A` and :code:`B` must be read in the inner loop. For that, we need # multi-dimensional pointer arithmetic. # # Pointer Arithmetic # ~~~~~~~~~~~~~~~~~~~~ # # For a row-major 2D tensor :code:`X`, the memory location of :code:`X[i, j]` is given by # :code:`&X[i, j] = X + i*stride_xi + j*stride_xj`. # Therefore, blocks of pointers for :code:`A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K]` and # :code:`B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]` can be defined in pseudo-code as: # # .. code-block:: python # # &A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K] = a_ptr + (m : m+BLOCK_SIZE_M)[:, None]*A.stride(0) + (k : k+BLOCK_SIZE_K)[None, :]*A.stride(1); # &B[k : k+BLOCK_SIZE_K, n:n+BLOCK_SIZE_N] = b_ptr + (k : k+BLOCK_SIZE_K)[:, None]*B.stride(0) + (n : n+BLOCK_SIZE_N)[None, :]*B.stride(1); # # This means that pointers for blocks of A and B can be initialized (i.e., :code:`k=0`) in Triton as: # # .. code-block:: python # # offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) # offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) # offs_k = tl.arange(0, BLOCK_SIZE_K) # a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak) # b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn) # # And then updated in the inner loop as follows: # # .. code-block:: python # # a_ptrs += BLOCK_SIZE_K * stride_ak; # b_ptrs += BLOCK_SIZE_K * stride_bk; # # # L2 Cache Optimizations # ~~~~~~~~~~~~~~~~~~~~~~~~ # # As mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]` # block of :code:`C`. # It is important to remember that the order in which these blocks are computed does # matter, since it affects the L2 cache hit rate of our program, and unfortunately, # a simple row-major ordering # # ..
code-block:: Python # # pid = triton.program_id(0); # grid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M; # grid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N; # pid_m = pid / grid_n; # pid_n = pid % grid_n; # # is just not going to cut it. # # One possible solution is to launch blocks in an order that promotes data reuse. # This can be done by 'super-grouping' blocks in groups of :code:`GROUP_M` rows before # switching to the next column: # # .. code-block:: python # # # program ID # pid = tl.program_id(axis=0) # # number of program ids along the M axis # num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) # # number of programs ids along the N axis # num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) # # number of programs in group # num_pid_in_group = GROUP_SIZE_M * num_pid_n # # id of the group this program is in # group_id = pid // num_pid_in_group # # row-id of the first program in the group # first_pid_m = group_id * GROUP_SIZE_M # # if `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller # group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) # # *within groups*, programs are ordered in a column-major order # # row-id of the program in the *launch grid* # pid_m = first_pid_m + (pid % group_size_m) # # col-id of the program in the *launch grid* # pid_n = (pid % num_pid_in_group) // group_size_m # # For example, in the following matmul where each matrix is 9 blocks by 9 blocks, # we can see that if we compute the output in row-major ordering, we need to load 90 # blocks into SRAM to compute the first 9 output blocks, but if we do it in grouped # ordering, we only need to load 54 blocks. # .. image:: grouped_vs_row_major_ordering.png # # In practice, this can improve the performance of our matrix multiplication kernel by # more than 10\% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100). # # %% # Final Result # ------------- # import torch import triton import triton.language as tl # % # :code:`triton.jit`'ed functions can be auto-tuned by using the `triton.autotune` # decorator, which consumes: # - A list of :code:`triton.Config` objects that define different configurations of # meta-parameters (e.g., BLOCK_SIZE_M) and compilation options (e.g., num_warps) to try # - An autotuning *key* whose change in values will trigger evaluation of all the # provided configs @triton.autotune( configs=[ triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8), ], key=['M', 'N', 'K'], ) @triton.jit def matmul_kernel( # Pointers to matrices a_ptr, b_ptr, c_ptr, # Matrix dimensions M, N, K, # The stride variables represent how much to increase the ptr by when moving by 1 # element in a particular dimension. E.g. stride_am is how much to increase a_ptr # by to get the element one row down (A has M rows) stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn, # Meta-parameters BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, ACTIVATION: tl.constexpr, ): """Kernel for computing the matmul C = A x B. A has shape (M, K), B has shape (K, N) and C has shape (M, N) """ # ----------------------------------------------------------- # Map program ids `pid` to the block of C it should compute. 
# This is done in a grouped ordering to promote L2 data reuse # See above `L2 Cache Optimizations` section for details pid = tl.program_id(axis=0) num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) num_pid_in_group = GROUP_SIZE_M * num_pid_n group_id = pid // num_pid_in_group first_pid_m = group_id * GROUP_SIZE_M group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) pid_m = first_pid_m + (pid % group_size_m) pid_n = (pid % num_pid_in_group) // group_size_m # ---------------------------------------------------------- # Create pointers for the first blocks of A and B. # We will advance this pointer as we move in the K direction # and accumulate # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers # see above `Pointer Arithmetics` section for details offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) offs_k = tl.arange(0, BLOCK_SIZE_K) a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) # ----------------------------------------------------------- # Iterate to compute a block of the C matrix # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block # of fp32 values for higher accuracy. # `accumulator` will be converted back to fp16 after the loop accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k in range(0, K, BLOCK_SIZE_K): # Note that for simplicity, we don't apply a mask here. # This means that if K is not a multiple of BLOCK_SIZE_K, # this will access out-of-bounds memory and produce an # error or (worse!) incorrect results. a = tl.load(a_ptrs) b = tl.load(b_ptrs) # We accumulate along the K dimension accumulator += tl.dot(a, b) # Advance the ptrs to the next K block a_ptrs += BLOCK_SIZE_K * stride_ak b_ptrs += BLOCK_SIZE_K * stride_bk # you can fuse arbitrary activation functions here # while the accumulator is still in FP32! if ACTIVATION: accumulator = ACTIVATION(accumulator) c = accumulator.to(tl.float16) # ----------------------------------------------------------- # Write back the block of the output matrix C offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) tl.store(c_ptrs, c, mask=c_mask) # we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul` @triton.jit def leaky_relu(x): return tl.where(x >= 0, x, 0.01 * x) # %% # We can now create a convenience wrapper function that only takes two input tensors # and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel def matmul(a, b, activation=None): # checks constraints assert a.shape[1] == b.shape[0], "incompatible dimensions" assert a.is_contiguous(), "matrix A must be contiguous" assert b.is_contiguous(), "matrix B must be contiguous" M, K = a.shape K, N = b.shape assert ( K % 32 == 0 ), "We don't check memory-out-of-bounds with K so K must be divisible by BLOCK_SIZE_K" # allocates output c = torch.empty((M, N), device=a.device, dtype=a.dtype) # 1D launch kernel where each block gets its own program. 
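# The grid callable receives the kernel's meta-parameters (including the autotuned `BLOCK_SIZE_M` and `BLOCK_SIZE_N`), so the launch grid automatically matches whichever configuration the autotuner selects.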
grid = lambda META: ( triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), ) matmul_kernel[grid]( a, b, c, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), ACTIVATION=activation, ) return c # %% # Unit Test # ----------- # # We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS) torch.manual_seed(0) a = torch.randn((512, 512), device='cuda', dtype=torch.float16) b = torch.randn((512, 512), device='cuda', dtype=torch.float16) triton_output = matmul(a, b, activation=None) torch_output = torch.matmul(a, b) print(f"triton_output={triton_output}") print(f"torch_output={torch_output}") if triton.testing.allclose(triton_output, torch_output): print("✅ Triton and Torch match") else: print("❌ Triton and Torch differ") # %% # Benchmark # -------------- # # Square Matrix Performance # ~~~~~~~~~~~~~~~~~~~~~~~~~~ # We can now compare the performance of our kernel against that of cuBLAS. Here we focus on square matrices, but feel free to arrange this script as you wish to benchmark any other matrix shape. @triton.testing.perf_report( triton.testing.Benchmark( x_names=['M', 'N', 'K'], # argument names to use as an x-axis for the plot x_vals=[ 8192 ], # different possible values for `x_name` line_arg='provider', # argument name whose value corresponds to a different line in the plot # possible values for `line_arg`` line_vals=['cublas', 'triton'], # label name for the lines line_names=["cuBLAS", "Triton"], # line styles styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')], ylabel="TFLOPS", # label name for the y-axis plot_name="matmul-performance", # name for the plot. Used also as a file name for saving the plot. args={}, ) ) def benchmark(M, N, K, provider): a = torch.randn((M, K), device='cuda', dtype=torch.float16) b = torch.randn((K, N), device='cuda', dtype=torch.float16) if provider == 'cublas': ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), rep=100) if provider == 'triton': ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b), rep=100) perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3) return perf(ms), perf(max_ms), perf(min_ms) benchmark.run(show_plots=True, print_data=True) triton-2.0.0/python/tutorials/04-low-memory-dropout.py000066400000000000000000000143401440023377100227760ustar00rootroot00000000000000""" Low-Memory Dropout ================= In this tutorial, you will write a memory-efficient implementation of dropout whose state will be composed of a single int32 seed. This differs from more traditional implementations of dropout, whose state is generally composed of a bit mask tensor of the same shape as the input. You will learn about: - The limitations of naive implementations of Dropout with PyTorch - Parallel pseudo-random number generation in Triton """ # %% # Baseline # ------------- # The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance # of deep neural networks in low-data regime (i.e. regularization). # # It takes a vector as input and produces a vector of the same shape as output. Each scalar in the # output has a probability :math:`p` of being changed to zero and otherwise it is copied from the input. # This forces the network to perform well even when only :math:`1 - p` scalars from the input are available. # # At evaluation time we want to use the full power of the network so we set :math:`p=0`. 
Naively this would # increase the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease # in the output softmax temperature). To prevent this we multiply the output by :math:`\frac{1}{1 - p}`, which # keeps the norm consistent regardless of the dropout probability. # # Let's first take a look at the baseline implementation. import tabulate import torch import triton import triton.language as tl @triton.jit def _dropout( x_ptr, # pointer to the input x_keep_ptr, # pointer to a mask of 0s and 1s output_ptr, # pointer to the output n_elements, # number of elements in the `x` tensor p, # probability that an element of `x` is changed to zero BLOCK_SIZE: tl.constexpr, ): pid = tl.program_id(axis=0) block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements # Load data x = tl.load(x_ptr + offsets, mask=mask) x_keep = tl.load(x_keep_ptr + offsets, mask=mask) # The line below is the crucial part, described in the paragraph above! output = tl.where(x_keep, x / (1 - p), 0.0) # Write-back output tl.store(output_ptr + offsets, output, mask=mask) def dropout(x, x_keep, p): output = torch.empty_like(x) assert x.is_contiguous() n_elements = x.numel() grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024) return output # Input tensor x = torch.randn(size=(10,)).cuda() # Dropout mask p = 0.5 x_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda() # output = dropout(x, x_keep=x_keep, p=p) print(tabulate.tabulate([ ["input"] + x.tolist(), ["keep mask"] + x_keep.tolist(), ["output"] + output.tolist() ])) # %% # Seeded dropout # ------------- # Above implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly # we need to store the dropout mask for backpropagation. Secondly, dropout state management can get # very tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in # https://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation # that (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management # of persisting randomness across multiple invocations of the kernel. # # Pseudorandom number generation in Triton is simple! In this tutorial we will use the # :code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32` # values in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides # other :ref:`random number generation strategies `. # # .. note:: # Triton's implementation of PRNG is based on the Philox algorithm (described on [SALMON2011]_). # # Let's put it all together. 
@triton.jit def _seeded_dropout( x_ptr, output_ptr, n_elements, p, seed, BLOCK_SIZE: tl.constexpr, ): # compute memory offsets of elements handled by this instance pid = tl.program_id(axis=0) block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) # load data from x mask = offsets < n_elements x = tl.load(x_ptr + offsets, mask=mask) # randomly prune it random = tl.rand(seed, offsets) x_keep = random > p # write-back output = tl.where(x_keep, x / (1 - p), 0.0) tl.store(output_ptr + offsets, output, mask=mask) def seeded_dropout(x, p, seed): output = torch.empty_like(x) assert x.is_contiguous() n_elements = x.numel() grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) _seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024) return output x = torch.randn(size=(10,)).cuda() # Compare this to the baseline - dropout mask is never instantiated! output = seeded_dropout(x, p=0.5, seed=123) output2 = seeded_dropout(x, p=0.5, seed=123) output3 = seeded_dropout(x, p=0.5, seed=512) print(tabulate.tabulate([ ["input"] + x.tolist(), ["output (seed = 123)"] + output.tolist(), ["output (seed = 123)"] + output2.tolist(), ["output (seed = 512)"] + output3.tolist() ])) # %% # Et Voilà! We have a triton kernel that applies the same dropout mask provided the seed is the same! # If you'd like explore further applications of pseudorandomness in GPU programming, we encourage you # to explore the `triton/language/random` folder! # %% # Exercises # ------------- # 1. Extend the kernel to operate over a matrix and use a vector of seeds - one per row. # 2. Add support for striding. # 3. (challenge) Implement a kernel for sparse Johnson-Lindenstrauss transform which generates the projection matrix one the fly each time using a seed. # %% # References # -------------- # # .. [SALMON2011] John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, "Parallel Random Numbers: As Easy as 1, 2, 3", 2011 # .. [SRIVASTAVA2014] Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, "Dropout: A Simple Way to Prevent Neural Networks from Overfitting", JMLR 2014 triton-2.0.0/python/tutorials/05-layer-norm.py000066400000000000000000000247661440023377100213000ustar00rootroot00000000000000""" Layer Normalization ==================== """ import torch import triton import triton.language as tl try: # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it # should not be added to extras_require in setup.py. import apex HAS_APEX = True except ModuleNotFoundError: HAS_APEX = False @triton.jit def _layer_norm_fwd_fused( A, Out, Weight, Bias, Mean, Rstd, stride, N, eps, BLOCK_SIZE: tl.constexpr, ): # position of elements processed by this program row = tl.program_id(0) Out += row * stride A += row * stride # compute mean mean = 0 _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32) for off in range(0, N, BLOCK_SIZE): cols = off + tl.arange(0, BLOCK_SIZE) a = tl.load(A + cols, mask=cols < N, other=0.).to(tl.float32) _mean += a mean = tl.sum(_mean, axis=0) / N # compute variance _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32) for off in range(0, N, BLOCK_SIZE): cols = off + tl.arange(0, BLOCK_SIZE) a = tl.load(A + cols, mask=cols < N, other=0.).to(tl.float32) a = tl.where(cols < N, a - mean, 0.) 
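# columns at or beyond N were zeroed by the mask above, so they contribute nothing to the variance accumulator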
_var += a * a var = tl.sum(_var, axis=0) / N rstd = 1 / tl.sqrt(var + eps) # write-back mean/rstd tl.store(Mean + row, mean) tl.store(Rstd + row, rstd) # multiply by weight and add bias for off in range(0, N, BLOCK_SIZE): cols = off + tl.arange(0, BLOCK_SIZE) mask = cols < N weight = tl.load(Weight + cols, mask=mask) bias = tl.load(Bias + cols, mask=mask) a = tl.load(A + cols, mask=mask, other=0.).to(tl.float32) a_hat = (a - mean) * rstd out = a_hat * weight + bias # # write-back tl.store(Out + cols, out, mask=mask) # Backward pass (DX + partial DW + partial DB) @triton.jit def _layer_norm_bwd_dx_fused(DX, DY, DW, DB, X, W, B, M, V, Lock, stride, N, eps, GROUP_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr): # position of elements processed by this program row = tl.program_id(0) cols = tl.arange(0, BLOCK_SIZE_N) mask = cols < N # offset data pointers to start at the row of interest X += row * stride DY += row * stride DX += row * stride # offset locks and weight/bias gradient pointer # each kernel instance accumulates partial sums for # DW and DB into one of GROUP_SIZE_M independent buffers # these buffers stay in the L2, which allow this kernel # to be fast lock_id = row % GROUP_SIZE_M Lock += lock_id Count = Lock + GROUP_SIZE_M DW = DW + lock_id * N + cols DB = DB + lock_id * N + cols # load data to SRAM x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) w = tl.load(W + cols, mask=mask).to(tl.float32) mean = tl.load(M + row) rstd = tl.load(V + row) # compute dx xhat = (x - mean) * rstd wdy = w * dy xhat = tl.where(mask, xhat, 0.) wdy = tl.where(mask, wdy, 0.) mean1 = tl.sum(xhat * wdy, axis=0) / N mean2 = tl.sum(wdy, axis=0) / N dx = (wdy - (xhat * mean1 + mean2)) * rstd # write-back dx tl.store(DX + cols, dx, mask=mask) # accumulate partial sums for dw/db partial_dw = (dy * xhat).to(w.dtype) partial_db = (dy).to(w.dtype) while tl.atomic_cas(Lock, 0, 1) == 1: pass count = tl.load(Count) # first store doesn't accumulate if count == 0: tl.atomic_xchg(Count, 1) else: partial_dw += tl.load(DW, mask=mask) partial_db += tl.load(DB, mask=mask) tl.store(DW, partial_dw, mask=mask) tl.store(DB, partial_db, mask=mask) # release lock tl.atomic_xchg(Lock, 0) # Backward pass (total DW + total DB) @triton.jit def _layer_norm_bwd_dwdb(DW, DB, FINAL_DW, FINAL_DB, M, N, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr): pid = tl.program_id(0) cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for i in range(0, M, BLOCK_SIZE_M): rows = i + tl.arange(0, BLOCK_SIZE_M) mask = (rows[:, None] < M) & (cols[None, :] < N) offs = rows[:, None] * N + cols[None, :] dw += tl.load(DW + offs, mask=mask, other=0.) db += tl.load(DB + offs, mask=mask, other=0.) 
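# reduce the [BLOCK_SIZE_M, BLOCK_SIZE_N] partial accumulators along the row axis to get one sum per column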
sum_dw = tl.sum(dw, axis=0) sum_db = tl.sum(db, axis=0) tl.store(FINAL_DW + cols, sum_dw, mask=cols < N) tl.store(FINAL_DB + cols, sum_db, mask=cols < N) class LayerNorm(torch.autograd.Function): @staticmethod def forward(ctx, x, normalized_shape, weight, bias, eps): # allocate output y = torch.empty_like(x) # reshape input data into 2D tensor x_arg = x.reshape(-1, x.shape[-1]) M, N = x_arg.shape mean = torch.empty((M, ), dtype=torch.float32, device='cuda') rstd = torch.empty((M, ), dtype=torch.float32, device='cuda') # Less than 64KB per feature: enqueue fused kernel MAX_FUSED_SIZE = 65536 // x.element_size() BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) if N > BLOCK_SIZE: raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") # heuristics for number of warps num_warps = min(max(BLOCK_SIZE // 256, 1), 8) # enqueue kernel _layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd, x_arg.stride(0), N, eps, BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps) ctx.save_for_backward(x, weight, bias, mean, rstd) ctx.BLOCK_SIZE = BLOCK_SIZE ctx.num_warps = num_warps ctx.eps = eps return y @staticmethod def backward(ctx, dy): x, w, b, m, v = ctx.saved_tensors # heuristics for amount of parallel reduction stream for DG/DB N = w.shape[0] GROUP_SIZE_M = 64 if N <= 8192: GROUP_SIZE_M = 96 if N <= 4096: GROUP_SIZE_M = 128 if N <= 1024: GROUP_SIZE_M = 256 # allocate output locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device='cuda') _dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device) _db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device) dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device) db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device) dx = torch.empty_like(dy) # enqueue kernel using forward pass heuristics # also compute partial sums for DW and DB x_arg = x.reshape(-1, x.shape[-1]) M, N = x_arg.shape _layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks, x_arg.stride(0), N, ctx.eps, BLOCK_SIZE_N=ctx.BLOCK_SIZE, GROUP_SIZE_M=GROUP_SIZE_M, num_warps=ctx.num_warps) grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])] # accumulate partial sums in separate kernel _layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N, BLOCK_SIZE_M=32, BLOCK_SIZE_N=128) return dx, None, dw, db, None layer_norm = LayerNorm.apply def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'): # create data x_shape = (M, N) w_shape = (x_shape[-1], ) weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') dy = .1 * torch.randn_like(x) x.requires_grad_(True) # forward pass y_tri = layer_norm(x, w_shape, weight, bias, eps) y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype) # backward pass (triton) y_tri.backward(dy, retain_graph=True) dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]] x.grad, weight.grad, bias.grad = None, None, None # backward pass (torch) y_ref.backward(dy, retain_graph=True) dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]] # compare triton.testing.assert_almost_equal(y_tri, y_ref) triton.testing.assert_almost_equal(dx_tri, dx_ref) triton.testing.assert_almost_equal(db_tri, db_ref, decimal=1) triton.testing.assert_almost_equal(dw_tri, dw_ref, decimal=1) @triton.testing.perf_report( triton.testing.Benchmark( x_names=['N'], x_vals=[512 * i for i in 
range(2, 32)], line_arg='provider', line_vals=['triton', 'torch'] + (['apex'] if HAS_APEX else []), line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []), styles=[('blue', '-'), ('green', '-'), ('orange', '-')], ylabel='GB/s', plot_name='layer-norm-backward', args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'} ) ) def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'): # create data x_shape = (M, N) w_shape = (x_shape[-1], ) weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True) x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') dy = .1 * torch.randn_like(x) x.requires_grad_(True) # utility functions if provider == 'triton': y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps) if provider == 'torch': y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps) if provider == 'apex': apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype) y_fwd = lambda: apex_layer_norm(x) # forward pass if mode == 'forward': gbps = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6 ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, rep=500) # backward pass if mode == 'backward': gbps = lambda ms: 3 * x.numel() * x.element_size() / ms * 1e-6 y = y_fwd() ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True), grad_to_none=[x], rep=500) return gbps(ms), gbps(max_ms), gbps(min_ms) test_layer_norm(1151, 8192, torch.float16) # bench_layer_norm.run(save_path='.', print_data=True) triton-2.0.0/python/tutorials/06-fused-attention.py000066400000000000000000000325131440023377100223120ustar00rootroot00000000000000""" Fused Attention =============== This is a Triton implementation of the Flash Attention algorithm (see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) """ import pytest import torch import triton import triton.language as tl @triton.jit def _fwd_kernel( Q, K, V, sm_scale, L, M, Out, stride_qz, stride_qh, stride_qm, stride_qk, stride_kz, stride_kh, stride_kn, stride_kk, stride_vz, stride_vh, stride_vk, stride_vn, stride_oz, stride_oh, stride_om, stride_on, Z, H, N_CTX, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, ): start_m = tl.program_id(0) off_hz = tl.program_id(1) # initialize offsets offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) offs_n = tl.arange(0, BLOCK_N) offs_d = tl.arange(0, BLOCK_DMODEL) off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk off_k = off_hz * stride_qh + offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kk off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk # Initialize pointers to Q, K, V q_ptrs = Q + off_q k_ptrs = K + off_k v_ptrs = V + off_v # initialize pointer to m and l m_prev = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_prev = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) # load q: it will stay in SRAM throughout q = tl.load(q_ptrs) # loop over k, v and update accumulator for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): # -- compute qk ---- k = tl.load(k_ptrs) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) qk += tl.dot(q, k) qk *= sm_scale qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) # compute new m m_curr = tl.maximum(tl.max(qk, 1), m_prev) # 
correct old l l_prev *= tl.exp(m_prev - m_curr) # attention weights p = tl.exp(qk - m_curr[:, None]) l_curr = tl.sum(p, 1) + l_prev # rescale operands of matmuls l_rcp = 1. / l_curr p *= l_rcp acc *= (l_prev * l_rcp)[:, None] # update acc p = p.to(tl.float16) v = tl.load(v_ptrs) acc += tl.dot(p, v) # update m_i and l_i l_prev = l_curr m_prev = m_curr # update pointers k_ptrs += BLOCK_N * stride_kn v_ptrs += BLOCK_N * stride_vk # rematerialize offsets to save registers start_m = tl.program_id(0) offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) # write back l and m l_ptrs = L + off_hz * N_CTX + offs_m m_ptrs = M + off_hz * N_CTX + offs_m tl.store(l_ptrs, l_prev) tl.store(m_ptrs, m_prev) # initialize pointers to output offs_n = tl.arange(0, BLOCK_DMODEL) off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on out_ptrs = Out + off_o tl.store(out_ptrs, acc) @triton.jit def _bwd_preprocess( Out, DO, L, NewDO, Delta, BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr, ): off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) off_n = tl.arange(0, D_HEAD) # load o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) denom = tl.load(L + off_m).to(tl.float32) # compute do = do / denom[:, None] delta = tl.sum(o * do, axis=1) # write-back tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) tl.store(Delta + off_m, delta) @triton.jit def _bwd_kernel( Q, K, V, sm_scale, Out, DO, DQ, DK, DV, L, M, D, stride_qz, stride_qh, stride_qm, stride_qk, stride_kz, stride_kh, stride_kn, stride_kk, stride_vz, stride_vh, stride_vk, stride_vn, Z, H, N_CTX, num_block, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, ): off_hz = tl.program_id(0) off_z = off_hz // H off_h = off_hz % H # offset pointers for batch/head Q += off_z * stride_qz + off_h * stride_qh K += off_z * stride_qz + off_h * stride_qh V += off_z * stride_qz + off_h * stride_qh DO += off_z * stride_qz + off_h * stride_qh DQ += off_z * stride_qz + off_h * stride_qh DK += off_z * stride_qz + off_h * stride_qh DV += off_z * stride_qz + off_h * stride_qh for start_n in range(0, num_block): lo = start_n * BLOCK_M # initialize row/col offsets offs_qm = lo + tl.arange(0, BLOCK_M) offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) offs_m = tl.arange(0, BLOCK_N) offs_k = tl.arange(0, BLOCK_DMODEL) # initialize pointers to value-like data q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) # pointer to row-wise quantities in value-like data D_ptrs = D + off_hz * N_CTX m_ptrs = M + off_hz * N_CTX # initialize dv amd dk dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) # k and v stay in SRAM throughout k = tl.load(k_ptrs) v = tl.load(v_ptrs) # loop over rows for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): offs_m_curr = start_m + offs_m # load q, k, v, do on-chip q = tl.load(q_ptrs) # recompute p = softmax(qk, dim=-1).T # NOTE: `do` is pre-divided by `l`; no normalization here qk = tl.dot(q, tl.trans(k)) qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) m = tl.load(m_ptrs + offs_m_curr) p = tl.exp(qk * 
sm_scale - m[:, None]) # compute dv do = tl.load(do_ptrs) dv += tl.dot(tl.trans(p.to(tl.float16)), do) # compute dp = dot(v, do) Di = tl.load(D_ptrs + offs_m_curr) dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] dp += tl.dot(do, tl.trans(v)) # compute ds = p * (dp - delta[:, None]) ds = p * dp * sm_scale # compute dk = dot(ds.T, q) dk += tl.dot(tl.trans(ds.to(tl.float16)), q) # compute dq dq = tl.load(dq_ptrs) dq += tl.dot(ds.to(tl.float16), k) tl.store(dq_ptrs, dq) # increment pointers dq_ptrs += BLOCK_M * stride_qm q_ptrs += BLOCK_M * stride_qm do_ptrs += BLOCK_M * stride_qm # write-back dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) tl.store(dv_ptrs, dv) tl.store(dk_ptrs, dk) empty = torch.empty(128, device="cuda") class _attention(torch.autograd.Function): @staticmethod def forward(ctx, q, k, v, sm_scale): BLOCK = 128 # shape constraints Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] assert Lq == Lk and Lk == Lv assert Lk in {16, 32, 64, 128} o = torch.empty_like(q) grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1], 1) L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) num_warps = 4 if Lk <= 64 else 8 _fwd_kernel[grid]( q, k, v, sm_scale, L, m, o, q.stride(0), q.stride(1), q.stride(2), q.stride(3), k.stride(0), k.stride(1), k.stride(2), k.stride(3), v.stride(0), v.stride(1), v.stride(2), v.stride(3), o.stride(0), o.stride(1), o.stride(2), o.stride(3), q.shape[0], q.shape[1], q.shape[2], BLOCK_M=BLOCK, BLOCK_N=BLOCK, BLOCK_DMODEL=Lk, num_warps=num_warps, num_stages=2, ) ctx.save_for_backward(q, k, v, o, L, m) ctx.grid = grid ctx.sm_scale = sm_scale ctx.BLOCK_DMODEL = Lk return o @staticmethod def backward(ctx, do): BLOCK = 128 q, k, v, o, l, m = ctx.saved_tensors do = do.contiguous() dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.empty_like(k) dv = torch.empty_like(v) do_scaled = torch.empty_like(do) delta = torch.empty_like(l) _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )]( o, do, l, do_scaled, delta, BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL, ) _bwd_kernel[(ctx.grid[1],)]( q, k, v, ctx.sm_scale, o, do_scaled, dq, dk, dv, l, m, delta, q.stride(0), q.stride(1), q.stride(2), q.stride(3), k.stride(0), k.stride(1), k.stride(2), k.stride(3), v.stride(0), v.stride(1), v.stride(2), v.stride(3), q.shape[0], q.shape[1], q.shape[2], ctx.grid[0], BLOCK_M=BLOCK, BLOCK_N=BLOCK, BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8, num_stages=1, ) return dq, dk, dv, None attention = _attention.apply @pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(4, 48, 1024, 64)]) def test_op(Z, H, N_CTX, D_HEAD, dtype=torch.float16): torch.manual_seed(20) q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_() k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_() v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_() sm_scale = 0.2 dout = torch.randn_like(q) # reference implementation M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) p = torch.matmul(q, k.transpose(2, 3)) * sm_scale for z in range(Z): for h in range(H): p[:, :, M == 0] = float("-inf") p = torch.softmax(p.float(), dim=-1).half() # p = torch.exp(p) ref_out = torch.matmul(p, v) ref_out.backward(dout) ref_dv, 
v.grad = v.grad.clone(), None ref_dk, k.grad = k.grad.clone(), None ref_dq, q.grad = q.grad.clone(), None # # triton implementation tri_out = attention(q, k, v, sm_scale) # print(ref_out) # print(tri_out) tri_out.backward(dout) tri_dv, v.grad = v.grad.clone(), None tri_dk, k.grad = k.grad.clone(), None tri_dq, q.grad = q.grad.clone(), None # compare triton.testing.assert_almost_equal(ref_out, tri_out) triton.testing.assert_almost_equal(ref_dv, tri_dv) triton.testing.assert_almost_equal(ref_dk, tri_dk) triton.testing.assert_almost_equal(ref_dq, tri_dq) try: from flash_attn.flash_attn_interface import flash_attn_func HAS_FLASH = True except BaseException: HAS_FLASH = False BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64 # vary seq length for fixed head and batch=4 configs = [triton.testing.Benchmark( x_names=['N_CTX'], x_vals=[2**i for i in range(10, 14)], line_arg='provider', line_vals=['triton'] + (['flash'] if HAS_FLASH else []), line_names=['Triton'] + (['Flash'] if HAS_FLASH else []), styles=[('red', '-'), ('blue', '-')], ylabel='ms', plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-{mode}', args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': torch.float16, 'mode': mode} ) for mode in ['fwd', 'bwd']] @triton.testing.perf_report(configs) def bench_flash_attention(BATCH, H, N_CTX, D_HEAD, mode, provider, dtype=torch.float16, device="cuda"): assert mode in ['fwd', 'bwd'] warmup = 25 rep = 100 if provider == "triton": q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True) sm_scale = 1.3 fn = lambda: attention(q, k, v, sm_scale) if mode == 'bwd': o = fn() do = torch.randn_like(o) fn = lambda: o.backward(do, retain_graph=True) ms = triton.testing.do_bench(fn, percentiles=None, warmup=warmup, rep=rep) return ms if provider == "flash": lengths = torch.full((BATCH,), fill_value=N_CTX, device=device) cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32) cu_seqlens[1:] = lengths.cumsum(0) qkv = torch.randn((BATCH * N_CTX, 3, H, D_HEAD), dtype=dtype, device=device, requires_grad=True) fn = lambda: flash_attn_func(qkv, cu_seqlens, 0., N_CTX, causal=True) if mode == 'bwd': o = fn() do = torch.randn_like(o) fn = lambda: o.backward(do, retain_graph=True) ms = triton.testing.do_bench(fn, percentiles=None, warmup=warmup, rep=rep) return ms # only works on post-Ampere GPUs right now bench_flash_attention.run(save_path='.', print_data=True) triton-2.0.0/python/tutorials/README.rst000066400000000000000000000005161440023377100200710ustar00rootroot00000000000000Tutorials ================== Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one. To install the dependencies for the tutorials: .. 
code-block:: bash cd triton pip install -e './python[tutorials]' triton-2.0.0/test/000077500000000000000000000000001440023377100140105ustar00rootroot00000000000000triton-2.0.0/test/Analysis/000077500000000000000000000000001440023377100155735ustar00rootroot00000000000000triton-2.0.0/test/Analysis/test-alias.mlir000066400000000000000000000253561440023377100205410ustar00rootroot00000000000000// RUN: triton-opt %s --mlir-disable-threading -test-print-alias -split-input-file 2>&1 | FileCheck %s #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #A_SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #A_SHARED_T = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [0, 1]}> #B_SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #C = #triton_gpu.mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> #A_DOT = #triton_gpu.dot_op<{opIdx = 0, parent = #C}> #B_DOT = #triton_gpu.dot_op<{opIdx = 1, parent = #C}> // CHECK-LABEL: matmul_loop // There shouldn't be any aliasing with the dot op encoding. func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_ptr_init = tt.broadcast %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %b_ptr_init = tt.broadcast %B : (!tt.ptr) -> tensor<32x128x!tt.ptr, #BL> %a_mask = arith.constant dense : tensor<128x32xi1, #AL> %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> %b_mask = arith.constant dense : tensor<32x128xi1, #BL> %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { %a_ = tt.load %a_ptr, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_DOT> %b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> %b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B_DOT> %c = tt.dot %a, %b, %prev_c {transA = false, transB = false, allowTF32 = true} : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> } return } // CHECK-LABEL: alloc func @alloc(%A : !tt.ptr) { // CHECK: %cst -> %cst %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK: %0 -> %0 %cst2 = triton_gpu.alloc_tensor : tensor<16x16xf16, #A_SHARED> return } // CHECK-LABEL: convert func @convert(%A : !tt.ptr) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> // CHECK: %0 -> %0 %cst1 = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #AL>) -> 
tensor<16x16xf16, #A_SHARED> return } // CHECK-LABEL: trans func @trans(%A : !tt.ptr) { // CHECK: %cst -> %cst %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED> // CHECK: %0 -> %cst %b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED_T> return } // CHECK-LABEL: insert_slice_async func @insert_slice_async(%A : !tt.ptr, %i1 : i1) { %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<16x16x!tt.ptr, #AL> %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL> %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> // CHECK: %cst_0 -> %cst_0 %tensor = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : i32 // CHECK: %2 -> %cst_0 %a = triton_gpu.insert_slice_async %a_ptr, %tensor, %index, %mask, %other {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr, #AL> -> tensor<1x16x16xf16, #A_SHARED> return } // CHECK-LABEL: insert_slice func @insert_slice(%A : !tt.ptr, %i1 : i1) { %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<16x16x!tt.ptr, #AL> %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL> %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> // CHECK: %cst_0 -> %cst_0 %tensor = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : index %a = tt.load %a_ptr, %mask, %other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf16, #AL> // CHECK: %3 -> %cst_0 %b = tensor.insert_slice %a into %tensor[%index, 0, 0][1, 16, 16][1, 1, 1]: tensor<16x16xf16, #AL> into tensor<1x16x16xf16, #A_SHARED> return } // CHECK-LABEL: extract_slice func @extract_slice(%A : !tt.ptr) { // CHECK: %cst -> %cst %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : index // CHECK-NEXT: %0 -> %cst %cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED> return } // CHECK-LABEL: if_cat func @if_cat(%i1 : i1) { // CHECK: %cst -> %cst %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK: %cst_0 -> %cst_0 %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK: %0 -> %1,%1 %cst2 = scf.if %i1 -> tensor<32x16xf16, #A_SHARED> { // CHECK: %1 -> %1 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield %a : tensor<32x16xf16, #A_SHARED> } else { // CHECK: %1 -> %1 %b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield %b : tensor<32x16xf16, #A_SHARED> } return } // CHECK-LABEL: if_alias func @if_alias(%i1 : i1) { // CHECK: %cst -> %cst %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: %cst_0 -> %cst_0 %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: %0 -> %cst,%cst_0 %cst2 = scf.if %i1 -> tensor<16x16xf16, #A_SHARED> { scf.yield %cst0 : tensor<16x16xf16, #A_SHARED> } else { scf.yield %cst1 : tensor<16x16xf16, #A_SHARED> } return } // CHECK-LABEL: for func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { // CHECK: %cst -> %cst %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %cst_0 -> %cst_0 %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %cst_1 -> %cst_1 
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %arg6 -> %cst // CHECK-NEXT: %arg7 -> %cst_0 // CHECK-NEXT: %arg8 -> %cst_1 // CHECK-NEXT: %0#0 -> %cst,%cst_0 // CHECK-NEXT: %0#1 -> %cst,%cst_0 // CHECK-NEXT: %0#2 -> %cst,%cst_0 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } return } // CHECK-LABEL: for_if func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: %cst -> %cst %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %cst_0 -> %cst_0 %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %cst_1 -> %cst_1 %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %arg7 -> %cst // CHECK-NEXT: %arg8 -> %cst_0 // CHECK-NEXT: %arg9 -> %cst_1 // CHECK-NEXT: %0#0 -> %cst,%cst_0 // CHECK-NEXT: %0#1 -> %cst,%cst_0 // CHECK-NEXT: %0#2 -> %cst,%cst_0 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { scf.if %i1 { %index = arith.constant 8 : index // CHECK-NEXT: %1 -> %cst,%cst_0 %cst0 = tensor.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED> scf.yield } scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } return } // CHECK-LABEL: for_if_for func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: %cst -> %cst %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %cst_0 -> %cst_0 %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %cst_1 -> %cst_1 %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: %arg7 -> %cst // CHECK-NEXT: %arg8 -> %cst_0 // CHECK-NEXT: %arg9 -> %cst_1 // CHECK-NEXT: %0#0 -> %cst // CHECK-NEXT: %0#1 -> %cst_0 // CHECK-NEXT: %0#2 -> %cst_2,%cst_2 %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { // CHECK-NEXT: %arg11 -> %cst_1,%cst_2,%cst_2 // CHECK-NEXT: %1 -> %cst_2,%cst_2 %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (tensor<128x32xf16, #A_SHARED>) { // CHECK-NEXT: %2 -> %cst_2,%cst_2 %c_shared_next_next = scf.if %i1 -> tensor<128x32xf16, #A_SHARED> { // CHECK-NEXT: %cst_2 -> %cst_2 %cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> scf.yield %cst0 : tensor<128x32xf16, #A_SHARED> } else { // CHECK-NEXT: %cst_2 -> %cst_2 %cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> scf.yield %cst0 : tensor<128x32xf16, #A_SHARED> } scf.yield %c_shared_next_next : tensor<128x32xf16, #A_SHARED> } scf.yield 
%a_shared, %b_shared, %c_shared_next : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } return } triton-2.0.0/test/Analysis/test-alignment.mlir000066400000000000000000000613711440023377100214230ustar00rootroot00000000000000// RUN: triton-opt %s -test-print-alignment -split-input-file 2>&1 | FileCheck %s // CHECK-LABEL: cast func @cast() { // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [1] %cst = arith.constant 1 : i32 // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [1] %0 = arith.extsi %cst : i32 to i64 // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1] %cst_tensor = arith.constant dense<1> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1] %1 = tt.bitcast %cst_tensor : tensor<128xi32> -> tensor<128xi64> return } // ----- // CHECK-LABEL: add func @add() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1] %1 = arith.constant dense<1> : tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %2 = arith.addi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [127] %3 = arith.constant dense<127> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128] %4 = arith.addi %1, %3 : tensor<128xi32> return } // ----- // CHECK-LABEL: sub func @sub() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1] %1 = arith.constant dense<1> : tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %2 = arith.subi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [129] %3 = arith.constant dense<129> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128] %4 = arith.subi %3, %1 : tensor<128xi32> return } // ----- // CHECK-LABEL: mul func @mul() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1] %1 = arith.constant dense<1> : tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %2 = arith.muli %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128] %3 = arith.constant dense<128> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128] %4 = arith.muli %3, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [2] ; Constancy: [128] ; ConstantValue: [2] %5 = arith.constant dense<2> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [256] ; Constancy: [128] ; ConstantValue: [256] %6 = arith.muli %4, %5 : tensor<128xi32> return } // ----- // 
CHECK-LABEL: div func @div() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1] %1 = arith.constant dense<1> : tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %2 = arith.divsi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %3 = arith.divui %1, %0 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64] %4 = arith.constant dense<64> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16777216] ; Constancy: [64] ; ConstantValue: [None] %5 = arith.divsi %0, %4 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %6 = arith.divsi %4, %0 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64] %7 = arith.divsi %4, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [2] ; Constancy: [128] ; ConstantValue: [66] %8 = arith.constant dense<66> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [2] ; ConstantValue: [None] %9 = arith.divui %0, %8 : tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [8192] ; Constancy: [1] ; ConstantValue: [None] %10 = tt.make_range {end = 8320 : i32, start = 8192 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [64] ; ConstantValue: [None] %11 = arith.divsi %10, %4 : tensor<128xi32> return } // ----- // CHECK-LABEL: rem func @rem() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1] %1 = arith.constant dense<1> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0] %2 = arith.remsi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %3 = arith.remui %1, %0 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64] %4 = arith.constant dense<64> : tensor<128xi32> // CHECK-NEXT: Contiguity: [64] ; Divisibility: [64] ; Constancy: [1] ; ConstantValue: [None] %5 = arith.remsi %0, %4 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [1] ; ConstantValue: [None] %6 = arith.remsi %4, %0 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [2] ; Constancy: [128] ; ConstantValue: [66] %7 = arith.constant dense<66> : tensor<128xi32> // CHECK-NEXT: Contiguity: [2] ; Divisibility: [2] ; Constancy: [1] ; ConstantValue: [None] %8 = arith.remui %0, %7 : tensor<128xi32> return } // ----- // CHECK-LABEL: broadcast func @broadcast() { // CHECK: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64] %0 = arith.constant dense<64> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [64, 1] ; Constancy: [128, 1] ; ConstantValue: [64] %1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<128xi32>) -> tensor<128x1xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [64, 1] ; Constancy: [128, 128] ; 
ConstantValue: [64] %2 = tt.broadcast %1 : (tensor<128x1xi32>) -> tensor<128x128xi32> return } // ----- // CHECK-LABEL: splat func @splat(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) { // CHECK: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 128] ; ConstantValue: [None] %0 = tt.splat %arg0 : (!tt.ptr) -> tensor<128x128x!tt.ptr> return } // ----- // CHECK-LABEL: cmp func @cmp() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0] %1 = arith.constant dense<0> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None] %2 = arith.cmpi eq, %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None] %3 = arith.cmpi slt, %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %4 = arith.cmpi sle, %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None] %5 = arith.cmpi sge, %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8] %6 = arith.constant dense<8> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None] %7 = arith.cmpi sgt, %0, %6 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [0] %8 = arith.cmpi sgt, %1, %6 : tensor<128xi32> return } // ----- // CHECK-LABEL: logic func @logic() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64] %1 = arith.constant dense<64> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16777216] ; Constancy: [64] ; ConstantValue: [None] %2 = arith.divsi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8] %3 = arith.constant dense<8> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [134217728] ; Constancy: [8] ; ConstantValue: [None] %4 = arith.divsi %0, %3 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %5 = arith.andi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %6 = arith.ori %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %7 = arith.xori %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None] %8 = arith.andi %2, %4 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None] %9 = arith.ori %2, %4 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None] %10 = arith.xori %2, %4 : tensor<128xi32> return } // ----- // CHECK-LABEL: select func @select() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: 
[4611686018427387904] ; Constancy: [128] ; ConstantValue: [0] %1 = arith.constant dense<0> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None] %2 = arith.cmpi eq, %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None] %3 = arith.cmpi slt, %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [1] ; ConstantValue: [0] %4 = arith.constant 0 : i1 // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0] %7 = tt.splat %4 : (i1) -> tensor<128xi1> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0] %5 = select %4, %3, %7 : tensor<128xi1> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None] %8 = "triton_gpu.select"(%7, %3, %2) : (tensor<128xi1>, tensor<128xi1>, tensor<128xi1>) -> tensor<128xi1> return } // ----- func @shift() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8] %1 = arith.constant dense<8> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4] ; Constancy: [128] ; ConstantValue: [4] %2 = arith.constant dense<4> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [274877906944] ; Constancy: [1] ; ConstantValue: [None] %3 = arith.shli %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [67108864] ; Constancy: [1] ; ConstantValue: [None] %4 = arith.shrsi %0, %2 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128] %5 = arith.shli %1, %2 : tensor<128xi32> return } // ----- func @max_min() { // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [64] ; Constancy: [1] ; ConstantValue: [None] %1 = tt.make_range {end = 192 : i32, start = 64 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %2 = arith.maxsi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %3 = arith.minsi %0, %1 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8] %4 = arith.constant dense<8> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4] ; Constancy: [128] ; ConstantValue: [4] %5 = arith.constant dense<4> : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [8] %6 = arith.maxsi %4, %5 : tensor<128xi32> return } // ----- // CHECK-LABEL: for func @for() { // CHECK: Contiguity: [1, 1] ; Divisibility: [4611686018427387904, 4611686018427387904] ; Constancy: [128, 32] ; ConstantValue: [0] %a_init = arith.constant dense<0> : tensor<128x32xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 32] ; ConstantValue: [1] %b_init = arith.constant dense<1> : tensor<128x32xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [4, 4] ; Constancy: [128, 32] ; ConstantValue: [4] %c_init = arith.constant dense<4> : tensor<128x32xi32> // CHECK-NEXT: Contiguity: [1] 
; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [128] %ub = arith.constant 128 : index // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [1] ; ConstantValue: [0] %lb = arith.constant 0 : index // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [16] %step = arith.constant 16 : index %a, %b, %c = scf.for %iv = %lb to %ub step %step iter_args(%a = %a_init, %b = %b_init, %c = %c_init) -> (tensor<128x32xi32>, tensor<128x32xi32>, tensor<128x32xi32>) { // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [None] %t = arith.index_cast %iv : index to i32 // CHECK: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 32] ; ConstantValue: [None] // CHECK: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 32] ; ConstantValue: [None] // CHECK: Contiguity: [1, 1] ; Divisibility: [4, 4] ; Constancy: [128, 32] ; ConstantValue: [4] scf.yield %b, %a, %c : tensor<128x32xi32>, tensor<128x32xi32>, tensor<128x32xi32> } return } // ----- // CHECK-LABEL: permute_2d func @permute_2d(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) { // CHECK: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 128] ; ConstantValue: [1] %cst = arith.constant dense : tensor<128x128xi1> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [1, 1] ; ConstantValue: [None] %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [1073741824, 1] ; Constancy: [1, 1] ; ConstantValue: [None] %2 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<128xi32>) -> tensor<128x1xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1] ; ConstantValue: [None] %3 = tt.splat %arg1 : (i32) -> tensor<128x1xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [17179869184, 16] ; Constancy: [1, 1] ; ConstantValue: [None] %4 = arith.muli %2, %3 : tensor<128x1xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1] ; ConstantValue: [None] %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<128x1x!tt.ptr> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 1] ; ConstantValue: [None] %6 = tt.addptr %5, %4 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 1073741824] ; Constancy: [1, 1] ; ConstantValue: [None] %7 = tt.expand_dims %1 {axis = 0 : i32}: (tensor<128xi32>) -> tensor<1x128xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 128] ; ConstantValue: [None] %8 = tt.broadcast %6 : (tensor<128x1x!tt.ptr>) -> tensor<128x128x!tt.ptr> // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 1073741824] ; Constancy: [128, 1] ; ConstantValue: [None] %9 = tt.broadcast %7 : (tensor<1x128xi32>) -> tensor<128x128xi32> // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 16] ; Constancy: [1, 1] ; ConstantValue: [None] %10 = tt.addptr %8, %9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: 
[1073741824, 1] ; Constancy: [1, 1] ; ConstantValue: [None] %11 = tt.expand_dims %0 {axis = 1 : i32}: (tensor<128xi32>) -> tensor<128x1xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1] ; ConstantValue: [None] %12 = tt.splat %arg2 : (!tt.ptr) -> tensor<128x1x!tt.ptr> // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [16, 1] ; Constancy: [1, 1] ; ConstantValue: [None] %13 = tt.addptr %12, %11 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 1073741824] ; Constancy: [1, 1] ; ConstantValue: [None] %14 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 128] ; ConstantValue: [None] %15 = tt.splat %arg3 : (i32) -> tensor<1x128xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 17179869184] ; Constancy: [1, 1] ; ConstantValue: [None] %16 = arith.muli %14, %15 : tensor<1x128xi32> // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [16, 1] ; Constancy: [1, 128] ; ConstantValue: [None] %17 = tt.broadcast %13 : (tensor<128x1x!tt.ptr>) -> tensor<128x128x!tt.ptr> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 17179869184] ; Constancy: [128, 1] ; ConstantValue: [None] %18 = tt.broadcast %16 : (tensor<1x128xi32>) -> tensor<128x128xi32> // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [16, 1] ; Constancy: [1, 1] ; ConstantValue: [None] %19 = tt.addptr %17, %18 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [1, 1] ; ConstantValue: [None] %20 = tt.load %10, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x128xf32> tt.store %19, %20, %cst : tensor<128x128xf32> return } // ----- module { // This is a tiny test for verifying StoreOp-related alignment, It simply store a constant to a buffer. 
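// The sketch below is an illustrative addition, not part of the checked
// output of this file; the real, FileCheck-verified test (store_constant_align)
// follows it. It restates the expected propagation under the same assumptions:
// a pointer argument hinted with tt.divisibility = 16, splatted across a tile
// and offset by a contiguous range, should stay 16-byte aligned while becoming
// fully contiguous, which is what makes the final store vectorizable. The
// function and value names here are hypothetical.
func @alignment_sketch(%base: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
  // start = 0, so the range is contiguous across all 128 elements.
  %offs = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
  // Splatting preserves the divisibility hint of %base for every element.
  %ptrs = tt.splat %base : (!tt.ptr<f32>) -> tensor<128x!tt.ptr<f32>>
  // Adding a contiguous offset is expected to keep divisibility 16 and yield
  // contiguity 128, mirroring the addptr results checked below.
  %vec = tt.addptr %ptrs, %offs : tensor<128x!tt.ptr<f32>>, tensor<128xi32>
  %cst = arith.constant dense<0.0> : tensor<128xf32>
  tt.store %vec, %cst : tensor<128xf32>
  return
}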
// CHECK-LABEL: store_constant_align func @store_constant_align(%addr: !tt.ptr {tt.divisibility = 16 : i32}, %n: i32 {tt.divisibility = 16 : i32}) { // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %pid = tt.get_program_id {axis = 0 : i32} : i32 // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [128] %c128_i32 = arith.constant 128 : i32 // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [None] %1 = arith.muli %pid, %c128_i32 : i32 // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None] %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [None] %3 = tt.splat %1 : (i32) -> tensor<128xi32> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [None] %4 = arith.addi %3, %2 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [128] ; ConstantValue: [None] %5 = tt.splat %addr : (!tt.ptr) -> tensor<128x!tt.ptr> // CHECK-NEXT: Contiguity: [128] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [None] %6 = tt.addptr %5, %4 : tensor<128x!tt.ptr>, tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [128] ; ConstantValue: [None] %9 = tt.splat %n : (i32) -> tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [16] ; ConstantValue: [None] %mask = arith.cmpi slt, %4, %9 : tensor<128xi32> // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] %cst = arith.constant dense<0.0> : tensor<128xf32> tt.store %5, %cst, %mask : tensor<128xf32> return } } // ----- // This IR is dumped from vecadd test. // Note, the hint {tt.divisibility = 16 : i32} for %n_elements affects the alignment of mask. // CHECK-LABEL: vecadd_mask_align_16 func @vecadd_mask_align_16(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %n_elements: i32 {tt.divisibility = 16 : i32}) { %c64_i32 = arith.constant 64 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c64_i32 : i32 %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> %3 = tt.splat %1 : (i32) -> tensor<64xi32> %4 = arith.addi %3, %2 : tensor<64xi32> %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x!tt.ptr> %6 = tt.addptr %5, %4 : tensor<64x!tt.ptr>, tensor<64xi32> %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x!tt.ptr> %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr>, tensor<64xi32> %9 = tt.splat %n_elements : (i32) -> tensor<64xi32> // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [16] ; ConstantValue: [None] ( %{{.*}} = arith.cmpi slt, %{{.*}}, %{{.*}} : tensor<64xi32> ) %mask = arith.cmpi slt, %4, %9 : tensor<64xi32> %11 = tt.load %6, %mask {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32> %12 = tt.load %8, %mask {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32> %13 = arith.addf %11, %12 : tensor<64xf32> %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x!tt.ptr> // CHECK: Contiguity: [64] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [None] ( %{{.*}} = tt.addptr %{{.*}}, %{{.*}} : tensor<64x!tt.ptr>, tensor<64xi32> ) %15 = tt.addptr %14, %4 : tensor<64x!tt.ptr>, tensor<64xi32> tt.store %15, %13, %mask : tensor<64xf32> return } // ----- // This IR is dumped from vecadd test. 
// Note, there is no divisibility hint for %n_elements, Triton should assume its divisibility to be 1 by default. // CHECK-LABEL: vecadd_mask_align_1 func @vecadd_mask_align_1(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %n_elements: i32) { %c64_i32 = arith.constant 64 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c64_i32 : i32 %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> %3 = tt.splat %1 : (i32) -> tensor<64xi32> %4 = arith.addi %3, %2 : tensor<64xi32> %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x!tt.ptr> %6 = tt.addptr %5, %4 : tensor<64x!tt.ptr>, tensor<64xi32> %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x!tt.ptr> %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr>, tensor<64xi32> %9 = tt.splat %n_elements : (i32) -> tensor<64xi32> // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] ( %{{.*}} = arith.cmpi slt, %{{.*}}, %{{.*}} : tensor<64xi32> ) %10 = arith.cmpi slt, %4, %9 : tensor<64xi32> %11 = tt.load %6, %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32> %12 = tt.load %8, %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32> %13 = arith.addf %11, %12 : tensor<64xf32> %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x!tt.ptr> %15 = tt.addptr %14, %4 : tensor<64x!tt.ptr>, tensor<64xi32> tt.store %15, %13, %10 : tensor<64xf32> return } triton-2.0.0/test/Analysis/test-allocation.mlir000066400000000000000000000462501440023377100215710ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file --mlir-disable-threading -test-print-allocation 2>&1 | FileCheck %s #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #sliceAd0 = #triton_gpu.slice<{dim = 0, parent = #AL}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #A_SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #A_SHARED_T = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [0, 1]}> #B_SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #C = #triton_gpu.mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> #A_DOT = #triton_gpu.dot_op<{opIdx = 0, parent = #C}> #B_DOT = #triton_gpu.dot_op<{opIdx = 1, parent = #C}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: matmul_loop func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_ptr_init = tt.broadcast %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %b_ptr_init = tt.broadcast %B : (!tt.ptr) -> tensor<32x128x!tt.ptr, #BL> %a_mask = arith.constant dense : tensor<128x32xi1, #AL> %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> %b_mask = arith.constant dense : tensor<32x128xi1, #BL> %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { %a_ = tt.load %a_ptr, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> // CHECK: offset = 0, size = 4608 %a = 
triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_DOT> %b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> // CHECK-NEXT: offset = 0, size = 4224 %b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B_DOT> %c = tt.dot %a, %b, %prev_c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> } return // CHECK-NEXT: size = 4608 } // Shared memory is available after a tensor's liveness range ends // CHECK-LABEL: reusable func @reusable(%A : !tt.ptr) { %cst1 = arith.constant dense : tensor<128x32xi1, #AL> %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %cst3 = arith.constant dense : tensor<32x128xi1, #AL> %cst4 = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #AL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %b_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<32x128x!tt.ptr, #AL> %a1_ = tt.load %a_ptr, %cst1, %cst2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> // CHECK-NEXT: offset = 0, size = 4608 %a1 = triton_gpu.convert_layout %a1_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_DOT> %a2_ = tt.load %b_ptr, %cst3, %cst4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #AL> // CHECK-NEXT: offset = 0, size = 1152 %a2 = triton_gpu.convert_layout %a2_ : (tensor<32x128xf16, #AL>) -> tensor<32x128xf16, #B_DOT> %a3_ = tt.load %a_ptr, %cst1, %cst2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> // CHECK-NEXT: offset = 0, size = 4608 %a3 = triton_gpu.convert_layout %a3_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_DOT> %c = tt.dot %a1, %a2, %c_init {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C> %a4_ = tt.load %b_ptr, %cst3, %cst4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #AL> // CHECK-NEXT: offset = 0, size = 1152 %a4 = triton_gpu.convert_layout %a4_ : (tensor<32x128xf16, #AL>) -> tensor<32x128xf16, #B_DOT> %c1 = tt.dot %a3, %a4, %c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C> return // CHECK-NEXT: size = 4608 } // A tensor's shared memory offset is larger than it needs to accommodate further tensors // %cst0->%c // %cst1->%cst4 // %cst3->%g->%h->%i // CHECK-LABEL: preallocate func @preallocate(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 512 %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1536, size = 512 %cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 2048, size = 1024 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 3072, size = 1024 %b = 
tt.cat %cst0, %cst2 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 0, size = 1024 %c = tt.cat %cst1, %cst2 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 1024 %cst4 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 6144, size = 2048 %e = tt.cat %a, %cst4 {axis = 0} : (tensor<32x16xf16, #A_SHARED>, tensor<32x16xf16, #A_SHARED>) -> tensor<64x16xf16, #A_SHARED> // CHECK-NEXT: offset = 8192, size = 2048 %d = tt.cat %b, %cst4 {axis = 0} : (tensor<32x16xf16, #A_SHARED>, tensor<32x16xf16, #A_SHARED>) -> tensor<64x16xf16, #A_SHARED> // CHECK-NEXT: offset = 10240, size = 2048 %f = tt.cat %c, %cst4 {axis = 0} : (tensor<32x16xf16, #A_SHARED>, tensor<32x16xf16, #A_SHARED>) -> tensor<64x16xf16, #A_SHARED> // CHECK-NEXT: offset = 0, size = 2048 %cst5 = arith.constant dense<0.000000e+00> : tensor<64x16xf16, #A_SHARED> // CHECK-NEXT: offset = 2048, size = 4096 %g = tt.cat %e, %cst5 {axis = 0} : (tensor<64x16xf16, #A_SHARED>, tensor<64x16xf16, #A_SHARED>) -> tensor<128x16xf16, #A_SHARED> // CHECK-NEXT: offset = 2048, size = 4096 %h = tt.cat %d, %cst5 {axis = 0} : (tensor<64x16xf16, #A_SHARED>, tensor<64x16xf16, #A_SHARED>) -> tensor<128x16xf16, #A_SHARED> // CHECK-NEXT: offset = 2048, size = 4096 %i = tt.cat %f, %cst5 {axis = 0} : (tensor<64x16xf16, #A_SHARED>, tensor<64x16xf16, #A_SHARED>) -> tensor<128x16xf16, #A_SHARED> return // CHECK-NEXT: size = 12288 } // Unused tensors are immediately released // CHECK-LABEL: unused func @unused(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 %cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 0, size = 512 %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 512, size = 512 %cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 1024 %a = tt.cat %cst1, %cst2 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> return // CHECK: size = 2048 } // cst0 is alive through the entire function, it cannot be released before the end of the function // CHECK-LABEL: longlive func @longlive(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 512, size = 512 %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 512 %cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1536, size = 1024 %a = tt.cat %cst1, %cst2 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 512, size = 512 %cst3 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 512 %cst4 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1536, size = 1024 %b = tt.cat %cst3, %cst4 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1536, size = 512 %cst5 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1536, size = 512 %cst6 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1536, size = 
1024 %c = tt.cat %cst3, %cst4 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 512, size = 1024 %d = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> return // CHECK-NEXT: size = 2560 } // CHECK-LABEL: alloc func @alloc(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL> // CHECK-NEXT: offset = 0, size = 512 %cst2 = triton_gpu.alloc_tensor : tensor<16x16xf16, #A_SHARED> return // CHECK-NEXT: size = 512 } // CHECK-LABEL: scratch func @scratch() { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> // CHECK: scratch offset = 0, size = 512 %b = tt.reduce %cst0 {redOp = 1 : i32, axis = 0 : i32} : tensor<16x16xf16, #AL> -> tensor<16xf16, #sliceAd0> return // CHECK-NEXT: size = 512 } // CHECK-LABEL: trans func @trans(%A : !tt.ptr) { // CHECK: offset = 0, size = 1024 %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED> %b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED_T> return } // CHECK-LABEL: insert_slice_async func @insert_slice_async(%A : !tt.ptr, %i1 : i1) { %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<16x16x!tt.ptr, #AL> %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL> %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> // CHECK: offset = 0, size = 512 %tensor = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : i32 %a = triton_gpu.insert_slice_async %a_ptr, %tensor, %index, %mask, %other {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr, #AL> -> tensor<1x16x16xf16, #A_SHARED> return // CHECK-NEXT: size = 512 } // CHECK-LABEL: extract_slice func @extract_slice(%A : !tt.ptr) { // CHECK: offset = 0, size = 512 %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : index %cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1,1,1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED> return // CHECK-NEXT: size = 512 } // B0 -> (B1) -> B0 // Memory used by B1 can be reused by B0. // CHECK-LABEL: if func @if(%i1 : i1) { // CHECK: offset = 0, size = 512 %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 512, size = 512 %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> scf.if %i1 { // CHECK-NEXT: offset = 1024, size = 1024 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 1024 %b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> } // CHECK-NEXT: offset = 0, size = 512 %cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 512, size = 512 %cst3 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 1024 %a = tt.cat %cst2, %cst3 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> return // CHECK-NEXT: size = 2048 } // B0 -> (B1) -> (B2) -> B0 // Memory used by B0 cannot be reused by B1 or B2. 
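// As an illustrative, unchecked sketch of the rule above (hypothetical names,
// reusing this file's #A_SHARED alias): %live is created before the scf.if and
// read after it, so it stays live across both branches and the branch-local
// buffer %tmp cannot be given its offset. The FileCheck-verified version of
// this scenario is the if_else test that follows.
func @if_else_sketch(%i1 : i1) {
  %live = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
  scf.if %i1 {
    // %tmp must be placed after %live, because %live is still needed below.
    %tmp = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
  }
  // This use keeps %live alive across the whole scf.if region.
  %use = tt.cat %live, %live {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
  return
}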
// CHECK-LABEL: if_else func @if_else(%i1 : i1) { // CHECK: offset = 0, size = 512 %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 512, size = 512 %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> scf.if %i1 { // CHECK-NEXT: offset = 1024, size = 1024 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1024, size = 1024 %b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> } else { // CHECK-NEXT: offset = 1024, size = 512 %cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 1536, size = 512 %cst3 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: offset = 2048, size = 1024 %a = tt.cat %cst2, %cst3 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> } // CHECK-NEXT: offset = 1024, size = 1024 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> return // CHECK-NEXT: size = 3072 } // Block arguments and yields are memory aliases that do not trigger a new // allocation. // CHECK-LABEL: for func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { // CHECK: offset = 0, size = 8192 %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 8192, size = 8192 %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 16384, size = 8192 %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } return // CHECK-NEXT: size = 24576 } // CHECK-LABEL: for_if_slice func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 8192, size = 8192 %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 16384, size = 8192 %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { scf.if %i1 { %index = arith.constant 8 : index %cst0 = tensor.extract_slice %a_shared[%index, 0][1, 32][1, 1] : tensor<128x32xf16, #A_SHARED> to tensor<32xf16, #A_SHARED> scf.yield } scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } return // CHECK-NEXT: size = 24576 } // c0 cannot be released in the loop // CHECK-LABEL: for_use_ancestor func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 
8192 %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 8192, size = 8192 %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 16384, size = 8192 %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { %c0 = tt.trans %c_shared_init : (tensor<128x32xf16, #A_SHARED>) -> tensor<32x128xf16, #A_SHARED_T> // CHECK-NEXT: offset = 24576, size = 8192 %c1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> scf.yield %b_shared, %a_shared: tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } return // CHECK-NEXT: size = 32768 } // a_shared_init, b_shared_init, and c_shared_init's liveness ranges are span over the entire function before cst2. // So they cannot be reused by cst0 and cst1, but can be reused by cst2. // CHECK-LABEL: for_if_for func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr, %i1 : i1) { // CHECK: offset = 0, size = 8192 %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 8192, size = 8192 %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: offset = 16384, size = 8192 %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { %c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (tensor<128x32xf16, #A_SHARED>) { %c_shared_next_next = scf.if %i1 -> tensor<128x32xf16, #A_SHARED> { // CHECK-NEXT: offset = 24576, size = 8192 %cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> scf.yield %cst0 : tensor<128x32xf16, #A_SHARED> } else { // CHECK-NEXT: offset = 32768, size = 8192 %cst1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> scf.yield %cst1 : tensor<128x32xf16, #A_SHARED> } scf.yield %c_shared_next_next : tensor<128x32xf16, #A_SHARED> } scf.yield %a_shared, %b_shared, %c_shared_next : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } // CHECK-NEXT: offset = 0, size = 8192 %cst2 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> return // CHECK-NEXT: size = 40960 } } triton-2.0.0/test/Analysis/test-membar.mlir000066400000000000000000000452041440023377100207050ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file --mlir-disable-threading -test-print-membar 2>&1 | FileCheck %s #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #sliceAd0 = #triton_gpu.slice<{dim = 0, parent = #AL}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #A_SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #A_SHARED_T = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [0, 1]}> #B_SHARED = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #C = #triton_gpu.mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> #A_DOT = 
#triton_gpu.dot_op<{opIdx = 0, parent = #C}> #B_DOT = #triton_gpu.dot_op<{opIdx = 1, parent = #C}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: matmul_loop // There shouldn't be any membar with the dot op encoding. func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_ptr_init = tt.broadcast %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %b_ptr_init = tt.broadcast %B : (!tt.ptr) -> tensor<32x128x!tt.ptr, #BL> %a_mask = arith.constant dense : tensor<128x32xi1, #AL> %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> %b_mask = arith.constant dense : tensor<32x128xi1, #BL> %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { %a_ = tt.load %a_ptr, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_DOT> %b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> %b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B_DOT> %c = tt.dot %a, %b, %prev_c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A_DOT> * tensor<32x128xf16, #B_DOT> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> } return } // CHECK-LABEL: raw_single_block func @raw_single_block(%A : !tt.ptr) { %cst1 = arith.constant dense : tensor<128x32xi1, #AL> %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %a1_ = tt.load %a_ptr, %cst1, %cst2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a1 = triton_gpu.convert_layout %a1_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_SHARED> // CHECK: Membar 5 %a2 = triton_gpu.convert_layout %a1 : (tensor<128x32xf16, #A_SHARED>) -> tensor<128x32xf16, #A_SHARED> return } // CHECK-LABEL: war_single_block func @war_single_block(%A : !tt.ptr) { %cst1 = arith.constant dense : tensor<128x32xi1, #AL> %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL> %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %a1_ = tt.load %a_ptr, %cst1, %cst2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a1 = triton_gpu.convert_layout %a1_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_SHARED> // CHECK: Membar 5 %a2 = triton_gpu.convert_layout %a1 : (tensor<128x32xf16, #A_SHARED>) -> tensor<128x32xf16, #AL> // a2's liveness range ends here, and a3 and a2 have the same address range. // So it makes sense to have a WAR dependency between a2 and a3. 
// CHECK-NEXT: Membar 7 %a3 = triton_gpu.convert_layout %a1_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A_SHARED> return } // CHECK-LABEL: scratch func @scratch() { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK: Membar 1 %a = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK-NEXT: Membar 3 %aa = triton_gpu.convert_layout %a : (tensor<32x16xf16, #A_SHARED>) -> tensor<32x16xf16, #AL> %b = tt.reduce %aa {redOp = 1 : i32, axis = 0 : i32} : tensor<32x16xf16, #AL> -> tensor<16xf16, #sliceAd0> return } // CHECK-LABEL: async_wait func @async_wait() { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK: Membar 1 %a = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> triton_gpu.async_wait {num = 4 : i32} // CHECK-NEXT: Membar 4 %a_ = triton_gpu.convert_layout %a : (tensor<32x16xf16, #A_SHARED>) -> tensor<32x16xf16, #AL> return } // CHECK-LABEL: alloc func @alloc() { %cst0 = triton_gpu.alloc_tensor : tensor<16x16xf16, #A_SHARED> %a = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> // CHECK: Membar 2 %b = triton_gpu.convert_layout %a : (tensor<32x16xf16, #A_SHARED>) -> tensor<32x16xf16, #AL> return } // CHECK-LABEL: extract_slice func @extract_slice() { %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : index %cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED> // CHECK: Membar 3 %cst2 = triton_gpu.convert_layout %cst1 : (tensor<16x16xf16, #A_SHARED>) -> tensor<16x16xf16, #AL> // CHECK-NEXT: Membar 5 %cst3 = triton_gpu.convert_layout %cst2 : (tensor<16x16xf16, #AL>) -> tensor<16x16xf16, #A_SHARED> return } // CHECK-LABEL: trans func @trans() { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED> %b = tt.trans %cst0 : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED_T> return } // CHECK-LABEL: insert_slice_async func @insert_slice_async(%A : !tt.ptr, %i1 : i1) { %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<16x16x!tt.ptr, #AL> %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL> %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %tensor = triton_gpu.alloc_tensor : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : i32 %a = triton_gpu.insert_slice_async %a_ptr, %tensor, %index, %mask, %other {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr, #AL> -> tensor<1x16x16xf16, #A_SHARED> // CHECK: Membar 6 %b = tt.cat %a, %a {axis = 0} : (tensor<1x16x16xf16, #A_SHARED>, tensor<1x16x16xf16, #A_SHARED>) -> tensor<2x16x16xf16, #A_SHARED> // CHECK: Membar 8 %c = tt.cat %b, %b {axis = 0} : (tensor<2x16x16xf16, #A_SHARED>, tensor<2x16x16xf16, #A_SHARED>) -> tensor<4x16x16xf16, #A_SHARED> return } // CHECK-LABEL: insert_slice func @insert_slice(%A : !tt.ptr, %i1 : i1) { %a_ptr = tt.broadcast %A : (!tt.ptr) -> tensor<16x16x!tt.ptr, #AL> %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL> %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL> %tensor = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED> %index = arith.constant 0 : index %al = tt.load %a_ptr, %mask, %other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf16, 
#AL> // CHECK: Membar 6 %a = tensor.insert_slice %al into %tensor[%index, 0, 0][1, 16, 16][1, 1, 1]: tensor<16x16xf16, #AL> into tensor<1x16x16xf16, #A_SHARED> // CHECK: Membar 8 %b = tt.cat %a, %a {axis = 0} : (tensor<1x16x16xf16, #A_SHARED>, tensor<1x16x16xf16, #A_SHARED>) -> tensor<2x16x16xf16, #A_SHARED> // CHECK: Membar 10 %c = tt.cat %b, %b {axis = 0} : (tensor<2x16x16xf16, #A_SHARED>, tensor<2x16x16xf16, #A_SHARED>) -> tensor<4x16x16xf16, #A_SHARED> return } // If branch inserted a barrier for %cst0 and %cst1, but else didn't, then the barrier should be inserted in the parent region // CHECK-LABEL: multi_blocks func @multi_blocks(%i1 : i1) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> scf.if %i1 { // CHECK: Membar 2 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield } else { %cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst3 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> // CHECK-NEXT: Membar 7 %b = tt.cat %cst2, %cst3 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield } // CHECK-NEXT: Membar 10 %c = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> return } // Both branches inserted a barrier for %cst0 and %cst1, then the barrier doesn't need to be inserted in the parent region // CHECK-LABEL: multi_blocks_join_barrier func @multi_blocks_join_barrier(%i1 : i1) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> scf.if %i1 { // CHECK: Membar 2 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield } else { // CHECK-NEXT: Membar 5 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield } %a_ = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #A_SHARED>) -> tensor<16x16xf16, #AL> return } // Read yielded tensor requires a barrier // CHECK-LABEL: multi_blocks_yield func @multi_blocks_yield(%i1 : i1) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %a = scf.if %i1 -> (tensor<32x16xf16, #A_SHARED>) { // CHECK: Membar 2 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield %a : tensor<32x16xf16, #A_SHARED> } else { // CHECK-NEXT: Membar 5 %b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield %b : tensor<32x16xf16, #A_SHARED> } %a_ = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #A_SHARED>) -> tensor<16x16xf16, #AL> // CHECK-NEXT: Membar 9 %b = tt.cat %a, %a {axis = 0} : (tensor<32x16xf16, #A_SHARED>, tensor<32x16xf16, #A_SHARED>) -> tensor<64x16xf16, #A_SHARED> return } // Conservatively add a barrier as if the branch (%i1) is never taken // CHECK-LABEL: multi_blocks_noelse func @multi_blocks_noelse(%i1 : i1) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> scf.if %i1 
{ // CHECK: Membar 2 %a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield } %a_ = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #A_SHARED>) -> tensor<16x16xf16, #AL> return } // Conservatively add a barrier as if the branch (%i2) is never taken // CHECK-LABEL: multi_blocks_nested_scf func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) { %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED> scf.if %i1 { scf.if %i2 { // CHECK: Membar 2 %b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield } scf.yield } else { // CHECK-NEXT: Membar 6 %b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED> scf.yield } // CHECK-NEXT: Membar 9 %a_ = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #A_SHARED>) -> tensor<16x16xf16, #AL> return } // CHECK-LABEL: for func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { // CHECK-NEXT: Membar 3 %cst0 = tt.cat %a_shared, %b_shared {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } return } // Although a_shared and b_shared are synced before entering the loop, // they are reassociated with aliases (c_shared) and thus require a barrier. 
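// A minimal, unchecked illustration of that aliasing, with hypothetical names
// and this file's #A_SHARED / #AL aliases: after the scf.yield below, the
// loop-carried argument %buf may refer to either %init0 or %init1, so a read
// of %buf inside the loop conservatively needs a barrier even though both
// buffers were written before entering the loop. The FileCheck-verified
// version of this pattern is the for_alias test that follows.
func @alias_sketch(%lb : index, %ub : index, %step : index) {
  %init0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
  %init1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
  %res = scf.for %iv = %lb to %ub step %step iter_args(%buf = %init0) -> (tensor<128x32xf16, #A_SHARED>) {
    // Reading %buf out of shared memory: the membar pass must assume %buf can
    // alias every buffer that reaches it through iter_args or scf.yield.
    %val = triton_gpu.convert_layout %buf : (tensor<128x32xf16, #A_SHARED>) -> tensor<128x32xf16, #AL>
    scf.yield %init1 : tensor<128x32xf16, #A_SHARED>
  }
  return
}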
// CHECK-LABEL: for_alias func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: Membar 2 %cst0 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { %cst1 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #AL> // CHECK-NEXT: Membar 6 %cst2 = tt.cat %a_shared, %b_shared {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #AL> scf.yield %c_shared, %a_shared, %b_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } // CHECK-NEXT: Membar 9 %cst3 = tt.cat %cst0, %cst0 {axis = 0} : (tensor<256x32xf16, #A_SHARED>, tensor<256x32xf16, #A_SHARED>) -> tensor<512x32xf16, #A_SHARED> return } // Although cst2 is not an argument of scf.yield, its memory is reused by cst1. // So we need a barrier both before and after cst1 // CHECK-LABEL: for_reuse func @for_reuse(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: Membar 2 %cst0 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { // CHECK-NEXT: Membar 5 %cst1 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> // CHECK-NEXT: Membar 7 %cst2 = tt.cat %a_shared, %b_shared {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> scf.yield %c_shared, %a_shared, %b_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } // CHECK-NEXT: Membar 10 %cst3 = tt.cat %cst0, %cst0 {axis = 0} : (tensor<256x32xf16, #A_SHARED>, tensor<256x32xf16, #A_SHARED>) -> tensor<512x32xf16, #A_SHARED> return } // CHECK-LABEL: for_reuse_nested func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> // CHECK-NEXT: Membar 2 %cst0 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED> %a_shared, 
%b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { // CHECK-NEXT: Membar 5 %cst1 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> %a_shared_next, %b_shared_next, %c_shared_next = scf.for %ivv = %lb to %ub step %step iter_args(%a_shared_nested = %a_shared_init, %b_shared_nested = %b_shared_init, %c_shared_nested = %c_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) { // CHECK-NEXT: Membar 7 %cst2 = tt.cat %a_shared_nested, %b_shared_nested {axis = 0} : (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) -> tensor<256x32xf16, #A_SHARED> scf.yield %c_shared_nested, %a_shared_nested, %b_shared_nested : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } scf.yield %c_shared, %a_shared, %b_shared : tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED> } // CHECK-NEXT: Membar 11 %cst3 = tt.cat %cst0, %cst0 {axis = 0} : (tensor<256x32xf16, #A_SHARED>, tensor<256x32xf16, #A_SHARED>) -> tensor<512x32xf16, #A_SHARED> return } } triton-2.0.0/test/CMakeLists.txt000066400000000000000000000011731440023377100165520ustar00rootroot00000000000000add_subdirectory(lib) llvm_canonicalize_cmake_booleans( MLIR_ENABLE_BINDINGS_PYTHON ) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py MAIN_CONFIG ${CMAKE_CURRENT_SOURCe_DIR}/lit.cfg.py ) set(TRITON_TEST_DEPENDS triton-opt FileCheck ) add_lit_testsuite(check-triton-lit-tests "Running the triton regression tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS ${TRITON_TEST_DEPENDS} ) set_target_properties(check-triton-lit-tests PROPERTIES FOLDER "Tests") add_lit_testsuites(TRITON-LIT-TESTS ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TRITON_TEST_DEPENDS}) triton-2.0.0/test/Conversion/000077500000000000000000000000001440023377100161355ustar00rootroot00000000000000triton-2.0.0/test/Conversion/triton_ops.mlir000066400000000000000000000146231440023377100212300ustar00rootroot00000000000000// RUN: triton-opt %s | FileCheck %s func @cast_ops(%scalar_ptr: !tt.ptr, %scalar_f32: f32, %scalar_i64: i64) { // scalar -> scalar // CHECK: i64 -> !tt.ptr %0 = tt.int_to_ptr %scalar_i64 : i64 -> !tt.ptr // CHECK: !tt.ptr -> i64 %1 = tt.ptr_to_int %scalar_ptr : !tt.ptr -> i64 // CHECK: f32 to f16 %2 = arith.truncf %scalar_f32 : f32 to f16 // 0D tensor -> 0D tensor %tensor_ptr_0d = tt.splat %scalar_ptr : (!tt.ptr) -> tensor> %tensor_f32_0d = tt.splat %scalar_f32 : (f32) -> tensor %tensor_i64_0d = tt.splat %scalar_i64 : (i64) -> tensor // CHECK: tensor -> tensor> %3 = tt.int_to_ptr %tensor_i64_0d : tensor -> tensor> // CHECK: tensor> -> tensor %4 = tt.ptr_to_int %tensor_ptr_0d : tensor> -> tensor // CHECK: tensor to tensor %5 = arith.truncf %tensor_f32_0d : tensor to tensor // 1D tensor -> 1D tensor %tensor_ptr_1d = tt.splat %scalar_ptr : (!tt.ptr) -> tensor<16x!tt.ptr> %tensor_f32_1d = tt.splat %scalar_f32 : (f32) -> tensor<16xf32> %tensor_i64_1d = tt.splat %scalar_i64 : (i64) -> tensor<16xi64> // CHECK: tensor<16xi64> -> tensor<16x!tt.ptr> %6 = tt.int_to_ptr %tensor_i64_1d : tensor<16xi64> -> tensor<16x!tt.ptr> // CHECK: tensor<16x!tt.ptr> -> tensor<16xi64> %7 = tt.ptr_to_int %tensor_ptr_1d : 
tensor<16x!tt.ptr> -> tensor<16xi64> // CHECK: tensor<16xf32> to tensor<16xf16> %8 = arith.truncf %tensor_f32_1d : tensor<16xf32> to tensor<16xf16> return } func @addptr_ops(%scalar_ptr: !tt.ptr, %scalar_i32: i32) { // scalar -> scalar // CHECK: !tt.ptr %0 = tt.addptr %scalar_ptr, %scalar_i32 : !tt.ptr, i32 // 0D tensor -> 0D tensor %tensor_ptr_0d = tt.splat %scalar_ptr : (!tt.ptr) -> tensor> %tensor_i32_0d = tt.splat %scalar_i32 : (i32) -> tensor // CHECK: tensor> %1 = tt.addptr %tensor_ptr_0d, %tensor_i32_0d : tensor>, tensor // 1D tensor -> 1D tensor %tensor_ptr_1d = tt.splat %scalar_ptr : (!tt.ptr) -> tensor<16x!tt.ptr> %tensor_i32_1d = tt.splat %scalar_i32 : (i32) -> tensor<16xi32> // CHECK: tensor<16x!tt.ptr> %2 = tt.addptr %tensor_ptr_1d, %tensor_i32_1d : tensor<16x!tt.ptr>, tensor<16xi32> return } func @load_store_ops_scalar(%ptr: !tt.ptr {tt.divisibility = 16 : i32}, %mask : i1) { // Test if Load/Store ops can handle scalar values %other = arith.constant 0.0e+0 : f32 // load scalar // CHECK: %[[L0:.*]] = tt.load %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : f32 %a = tt.load %ptr {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : f32 // CHECK: %[[L1:.*]] = tt.load %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : f32 %b = tt.load %ptr, %mask {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : f32 // CHECK: %[[L2:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : f32 %c = tt.load %ptr, %mask, %other {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : f32 // store scalar // CHECK: tt.store %{{.*}}, %[[L0]] : f32 tt.store %ptr, %a : f32 // CHECK: tt.store %{{.*}}, %[[L1]], %{{.*}} : f32 tt.store %ptr, %b, %mask : f32 // CHECK: tt.store %{{.*}}, %[[L2]], %{{.*}} : f32 tt.store %ptr, %c, %mask : f32 return } func @reduce_ops_infer(%ptr: !tt.ptr, %v : tensor<1x2x4xf32>) { // Test if reduce ops infer types correctly // CHECK: %{{.*}} = tt.reduce %{{.*}} -> tensor<2x4xf32> %a = tt.reduce %v {redOp = 1 : i32, axis = 0 : i32} : tensor<1x2x4xf32> -> tensor<2x4xf32> // CHECK: %{{.*}} = tt.reduce %{{.*}} -> tensor<1x4xf32> %b = tt.reduce %v {redOp = 1 : i32, axis = 1 : i32} : tensor<1x2x4xf32> -> tensor<1x4xf32> // CHECK: %{{.*}} = tt.reduce %{{.*}} -> tensor<1x2xf32> %c = tt.reduce %v {redOp = 1 : i32, axis = 2 : i32} : tensor<1x2x4xf32> -> tensor<1x2xf32> // CHECK: %{{.*}} = tt.reduce %{{.*}} -> tensor<1xf32> %e = tt.reduce %b {redOp = 1 : i32, axis = 1 : i32} : tensor<1x4xf32> -> tensor<1xf32> // CHECK: %{{.*}} = tt.reduce %{{.*}} -> tensor<4xf32> %f = tt.reduce %a {redOp = 1 : i32, axis = 0 : i32} : tensor<2x4xf32> -> tensor<4xf32> // CHECK: %{{.*}} = tt.reduce %{{.*}} -> f32 %g = tt.reduce %f {redOp = 1 : i32, axis = 0 : i32} : tensor<4xf32> -> f32 // Avoid optimizations for c, e, and g %ptr1x2 = tt.splat %ptr : (!tt.ptr) -> tensor<1x2x!tt.ptr> %ptr1 = tt.splat %ptr : (!tt.ptr) -> tensor<1x!tt.ptr> tt.store %ptr1x2, %c : tensor<1x2xf32> tt.store %ptr1, %e : tensor<1xf32> tt.store %ptr, %g : f32 return } func @dot_ops_infer(%ptr: !tt.ptr, %v : f32) { // Test if reduce ops infer types correctly %v128x32 = tt.splat %v : (f32) -> tensor<128x32xf32> %v32x128 = tt.splat %v : (f32) -> tensor<32x128xf32> %v128x1 = tt.splat %v : (f32) -> tensor<128x1xf32> %v1x128 = tt.splat %v : (f32) -> tensor<1x128xf32> %zero128x128 = arith.constant dense<0.00e+00> : tensor<128x128xf32> %zero32x32 = arith.constant dense<0.00e+00> : tensor<32x32xf32> %zero1x1 = arith.constant dense<0.00e+00> : tensor<1x1xf32> // 
CHECK: %{{.*}} = tt.dot %{{.*}} -> tensor<128x128xf32> %r1 = tt.dot %v128x32, %v32x128, %zero128x128 {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf32> * tensor<32x128xf32> -> tensor<128x128xf32> // CHECK: %{{.*}} = tt.dot %{{.*}} -> tensor<32x32xf32> %r2 = tt.dot %v32x128, %v128x32, %zero32x32 {allowTF32 = true, transA = false, transB = false} : tensor<32x128xf32> * tensor<128x32xf32> -> tensor<32x32xf32> // CHECK: %{{.*}} = tt.dot %{{.*}} -> tensor<128x128xf32> %r3 = tt.dot %v128x1, %v1x128, %zero128x128 {allowTF32 = true, transA = false, transB = false} : tensor<128x1xf32> * tensor<1x128xf32> -> tensor<128x128xf32> // CHECK: %{{.*}} = tt.dot %{{.*}} -> tensor<1x1xf32> %r4 = tt.dot %v1x128, %v128x1, %zero1x1 {allowTF32 = true, transA = false, transB = false} : tensor<1x128xf32> * tensor<128x1xf32> -> tensor<1x1xf32> %ptr128x128 = tt.splat %ptr : (!tt.ptr) -> tensor<128x128x!tt.ptr> %ptr32x32 = tt.splat %ptr : (!tt.ptr) -> tensor<32x32x!tt.ptr> %ptr1x1 = tt.splat %ptr : (!tt.ptr) -> tensor<1x1x!tt.ptr> tt.store %ptr128x128, %r1 : tensor<128x128xf32> tt.store %ptr32x32, %r2 : tensor<32x32xf32> tt.store %ptr128x128, %r3 : tensor<128x128xf32> tt.store %ptr1x1, %r4 : tensor<1x1xf32> return } triton-2.0.0/test/Conversion/triton_to_tritongpu.mlir000066400000000000000000000062141440023377100231610ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu=num-warps=2 | FileCheck %s func @ops() { // CHECK: module attributes {"triton_gpu.num-warps" = 2 : i32} {{.*}} %a = arith.constant dense<1.00e+00> : tensor<128x32xf16> %b = arith.constant dense<2.00e+00> : tensor<32x128xf16> %c = arith.constant dense<3.00e+00> : tensor<128x128xf32> %0 = tt.dot %a, %b, %c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16> * tensor<32x128xf16> -> tensor<128x128xf32> return } // ----- func @load_ops(%ptr: !tt.ptr {tt.divisibility = 16 : i32}) { // Test if LoadOp is lowered properly (see #771) %ptrs = tt.splat %ptr : (!tt.ptr) -> tensor<128x!tt.ptr> %mask = arith.constant dense : tensor<128xi1> %other = arith.constant dense<0.0e+0> : tensor<128xf32> // CHECK: %{{.*}} = tt.load %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : {{.*}} %a = tt.load %ptrs {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : tensor<128xf32> // CHECK: %{{.*}} = tt.load %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : {{.*}} %b = tt.load %ptrs, %mask {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : tensor<128xf32> // CHECK: %{{.*}} = tt.load %{{.*}}, %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : {{.*}} %c = tt.load %ptrs, %mask, %other {cache = 1 : i32, evict = 1 : i32, isVolatile = true} : tensor<128xf32> tt.store %ptrs, %a : tensor<128xf32> tt.store %ptrs, %b : tensor<128xf32> tt.store %ptrs, %c : tensor<128xf32> return } // ----- func @reduce_ops(%ptr: !tt.ptr {tt.divisibility = 16 : i32}) { // Test if the total number of threadsPerWarp is 32 // Test if the total number of warps is 2 // CHECK: #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 2], order = [0, 1]}> // CHECK: #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> // CHECK: #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 2], order = [0, 1]}> // CHECK: module attributes {"triton_gpu.num-warps" = 2 : i32} {{.*}} %c0 = arith.constant dense<1.00e+00> : 
tensor<4x4xf32> %c1 = arith.constant dense<2.00e+00> : tensor<8x2xf32> %c2 = arith.constant dense<3.00e+00> : tensor<16x16xf32> // CHECK: tensor<4x4xf32, #blocked0> -> tensor<4xf32, #triton_gpu.slice<{dim = 0, parent = #blocked0}>> %c0_ = tt.reduce %c0 {redOp = 1 : i32, axis = 0 : i32} : tensor<4x4xf32> -> tensor<4xf32> // CHECK: tensor<8x2xf32, #blocked1> -> tensor<2xf32, #triton_gpu.slice<{dim = 0, parent = #blocked1}> %c1_ = tt.reduce %c1 {redOp = 1 : i32, axis = 0 : i32} : tensor<8x2xf32> -> tensor<2xf32> // CHECK: tensor<8x2xf32, #blocked1> -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> %c2_ = tt.reduce %c1 {redOp = 1 : i32, axis = 1 : i32} : tensor<8x2xf32> -> tensor<8xf32> // CHECK: tensor<16x16xf32, #blocked2> -> tensor<16xf32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> %c3_ = tt.reduce %c2 {redOp = 1 : i32, axis = 0 : i32} : tensor<16x16xf32> -> tensor<16xf32> return } triton-2.0.0/test/Conversion/tritongpu_to_llvm.mlir000066400000000000000000001551101440023377100226140ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm | FileCheck %s module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK: llvm.func @test_empty_kernel(%arg0: i32, %arg1: !llvm.ptr) // Here the 128 comes from the 4 in module attribute multiples 32 // CHECK: attributes {nvvm.kernel = 1 : ui1, nvvm.maxntid = 128 : i32} {{.*}} func @test_empty_kernel(%lb : index, %A : !tt.ptr) { // CHECK: llvm.return return } } // end module // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_load func @basic_load(%a_ptr_init : tensor<256x!tt.ptr, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf32, #blocked0>) { // CHECK: llvm.inline_asm // CHECK: llvm.inline_asm %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: vectorized_load func @vectorized_load(%a_ptr_init : tensor<256x!tt.ptr, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf32, #blocked0>) { // CHECK: llvm.inline_asm // CHECK-SAME: ld.global.b32 // CHECK: llvm.inline_asm // CHECK-SAME: ld.global.b32 %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: vectorized_load_f16 func @vectorized_load_f16(%a_ptr_init: tensor<256x!tt.ptr, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf16, #blocked0>) { // CHECK: llvm.inline_asm // CHECK-SAME: ld.global.b16 // CHECK: llvm.inline_asm // CHECK-SAME: ld.global.b16 %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf16, #blocked0> return } } // ----- // TODO: masked load with vectorization is pending on TODO #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: masked_load_const_other func @masked_load_const_other(%a_ptr_init : tensor<256x!tt.ptr, 
#blocked0>, %cst : tensor<256xi1, #blocked0>) { %cst_0 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked0> %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> return } } // ----- // TODO: masked load with vectorization is pending on TODO #blocked0 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: masked_load_const_other_vec func @masked_load_const_other_vec(%a_ptr_init : tensor<256x!tt.ptr, #blocked0>, %cst : tensor<256xi1, #blocked0>) { %cst_0 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked0> %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}> module attributes {"triton_gpu.num-warps" = 2 : i32} { // CHECK-LABEL: global_load_store_no_vec func @global_load_store_no_vec(%arg0: !tt.ptr {tt.divisibility = 4 : i32}, %arg1: !tt.ptr {tt.divisibility = 4 : i32}, %arg2: !tt.ptr {tt.divisibility = 4 : i32}, %arg3: i32) { %c256_i32 = arith.constant 256 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c256_i32 : i32 %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> %3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked0> %4 = arith.addi %3, %2 : tensor<256xi32, #blocked0> %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Load 4 elements from vector0 // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; // Load 4 elements from vector1 // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: "@${{.*}} ld.global.b32 { ${{.*}} }, [ ${{.*}} + 0 ]; %9 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %10 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %11 = arith.addf %9, %10 : tensor<256xf32, #blocked0> %12 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %13 = tt.addptr %12, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Store 4 elements to global // CHECK: @${{.*}} st.global.b32 [ ${{.*}} + 0 ], { ${{.*}} }; // CHECK: @${{.*}} st.global.b32 [ ${{.*}} + 0 ], { ${{.*}} }; // CHECK: @${{.*}} st.global.b32 [ ${{.*}} + 0 ], { ${{.*}} }; // CHECK: @${{.*}} st.global.b32 [ ${{.*}} + 0 ], { ${{.*}} }; tt.store %13, %11 : tensor<256xf32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}> module attributes {"triton_gpu.num-warps" = 2 : i32} { // CHECK-LABEL: global_load_store_vec4 func @global_load_store_vec4(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, 
%arg3: i32) { %c256_i32 = arith.constant 256 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c256_i32 : i32 %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> %3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked0> %4 = arith.addi %3, %2 : tensor<256xi32, #blocked0> %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Load 4 elements from A with single one vectorized load instruction // CHECK: @${{.*}} ld.global.v4.b32 { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // Load 4 elements from B with single one vectorized load instruction // CHECK: @${{.*}} ld.global.v4.b32 { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; %9 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %10 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %11 = arith.addf %9, %10 : tensor<256xf32, #blocked0> %12 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %13 = tt.addptr %12, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Store 4 elements to global with single one vectorized store instruction // CHECK: @$5 st.global.v4.b32 [ ${{.*}} + 0 ], { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }; tt.store %13, %11 : tensor<256xf32, #blocked0> return } } // ----- // This test verifies the vectorization of Load and Store Ops. #blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}> // Note, the %n_elements doesn't have a "tt.divisibility" hint, so Triton assumes it's divisibility is 1, this should effect the mask's alignment and further restrict the load/store ops' vector width to be 1. 
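// For contrast, a minimal hedged sketch (this is not a test case that runs here; the
// kernel name and the placement of the hints below are assumptions modeled on the
// global_load_store_vec4 case above): if the length argument also carried a
// divisibility hint, e.g.
//
//   func @vecadd_masked_vec4(%arg0: !tt.ptr {tt.divisibility = 16 : i32},
//                            %arg1: !tt.ptr {tt.divisibility = 16 : i32},
//                            %arg2: !tt.ptr {tt.divisibility = 16 : i32},
//                            %n_elements: i32 {tt.divisibility = 16 : i32})
//
// then the comparison %4 < %n_elements would be uniform within each aligned group of
// four contiguous elements, so with a layout that gives each thread four contiguous
// elements (sizePerThread = [4], as in global_load_store_vec4) the masked loads and
// stores could in principle be lowered to ld.global.v4.b32 / st.global.v4.b32 rather
// than the scalar ld.global.b32 checked in the test below.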
module attributes {"triton_gpu.num-warps" = 2 : i32} { func @vecadd_masked_vec1(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %n_elements: i32) { %c64_i32 = arith.constant 64 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c64_i32 : i32 %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #blocked> %3 = tt.splat %1 : (i32) -> tensor<64xi32, #blocked> %4 = arith.addi %3, %2 : tensor<64xi32, #blocked> %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x!tt.ptr, #blocked> %6 = tt.addptr %5, %4 : tensor<64x!tt.ptr, #blocked>, tensor<64xi32, #blocked> %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x!tt.ptr, #blocked> %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr, #blocked>, tensor<64xi32, #blocked> %9 = tt.splat %n_elements : (i32) -> tensor<64xi32, #blocked> %10 = "triton_gpu.cmpi"(%4, %9) {predicate = 2 : i64} : (tensor<64xi32, #blocked>, tensor<64xi32, #blocked>) -> tensor<64xi1, #blocked> // load op has a vector width = 1 due to the %mask's alignment // CHECK: ld.global.b32 %11 = tt.load %6, %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32, #blocked> %12 = tt.load %8, %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32, #blocked> %13 = arith.addf %11, %12 : tensor<64xf32, #blocked> %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x!tt.ptr, #blocked> %15 = tt.addptr %14, %4 : tensor<64x!tt.ptr, #blocked>, tensor<64xi32, #blocked> tt.store %15, %13, %10 : tensor<64xf32, #blocked> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: global_load_store_vec2 func @global_load_store_vec2(%arg0: !tt.ptr {tt.divisibility = 8 : i32}, %arg1: !tt.ptr {tt.divisibility = 8 : i32}, %arg2: !tt.ptr {tt.divisibility = 8 : i32}, %arg3: i32) { %c256_i32 = arith.constant 256 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c256_i32 : i32 %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> %3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked0> %4 = arith.addi %3, %2 : tensor<256xi32, #blocked0> %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Load 8 elements from A with four vectorized load instruction // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // Load 8 elements from B with four vectorized load instruction // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v2.b32 { ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; %9 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %10 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %11 = arith.addf %9, %10 : tensor<256xf32, #blocked0> %12 
= tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %13 = tt.addptr %12, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Store 8 elements to global with four vectorized store instruction // CHECK: @${{.*}} st.global.v2.b32 [ ${{.*}} + 0 ], { ${{.*}}, ${{.*}} }; // CHECK: @${{.*}} st.global.v2.b32 [ ${{.*}} + 0 ], { ${{.*}}, ${{.*}} }; // CHECK: @${{.*}} st.global.v2.b32 [ ${{.*}} + 0 ], { ${{.*}}, ${{.*}} }; // CHECK: @${{.*}} st.global.v2.b32 [ ${{.*}} + 0 ], { ${{.*}}, ${{.*}} }; tt.store %13, %11 : tensor<256xf32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: global_load_store_vec8 func @global_load_store_vec8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) { %c256_i32 = arith.constant 256 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c256_i32 : i32 %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> %3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked0> %4 = arith.addi %3, %2 : tensor<256xi32, #blocked0> %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> %7 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Load 8 elements from A with two vectorized load instruction // CHECK: @${{.*}} ld.global.v4.b32 { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v4.b32 { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // Load 8 elements from B with two vectorized load instruction // CHECK: @${{.*}} ld.global.v4.b32 { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; // CHECK: @${{.*}} ld.global.v4.b32 { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }, [ ${{.*}} + 0 ]; %9 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %10 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0> %11 = arith.addf %9, %10 : tensor<256xf32, #blocked0> %12 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked0> %13 = tt.addptr %12, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> // Store 8 elements to global with two vectorized store instruction // CHECK: @$5 st.global.v4.b32 [ ${{.*}} + 0 ], { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }; // CHECK: @$5 st.global.v4.b32 [ ${{.*}} + 0 ], { ${{.*}}, ${{.*}}, ${{.*}}, ${{.*}} }; tt.store %13, %11 : tensor<256xf32, #blocked0> return } } // TODO: Add a testcase to verify the optimization when ptr of the LoadOp // is from an addptr with const idx // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_view_broadcast func @basic_view_broadcast(%arg : tensor<256xf32,#blocked0>) { // CHECK: llvm.mlir.undef // CHECK: %[[T0:.*]] = llvm.extractvalue // CHECK: %[[T1:.*]] = llvm.extractvalue %0 = tt.view %arg : (tensor<256xf32, #blocked0>) -> tensor<256x1xf32,#blocked2> // CHECK: llvm.mlir.undef // CHECK: llvm.insertvalue %[[T0]] // CHECK: 
llvm.insertvalue %[[T1]] // CHECK: llvm.insertvalue %[[T0]] // CHECK: llvm.insertvalue %[[T1]] // CHECK: llvm.insertvalue %[[T0]] // CHECK: llvm.insertvalue %[[T1]] // CHECK: llvm.insertvalue %[[T0]] // CHECK: llvm.insertvalue %[[T1]] %1 = tt.broadcast %0 : (tensor<256x1xf32,#blocked2>) -> tensor<256x4xf32, #blocked2> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_make_range func @basic_make_range() { // CHECK: nvvm.read.ptx.sreg.tid.x // CHECK: llvm.mlir.undef // CHECK: llvm.insertvalue // CHECK: llvm.insertvalue %0 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_addf func @basic_addf(%arg0 : tensor<256xf32,#blocked0>, %arg1 : tensor<256xf32,#blocked0>) { // CHECK: llvm.fadd // CHECK: llvm.fadd %1 = arith.addf %arg0, %arg1 : tensor<256xf32,#blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_addi func @basic_addi(%arg0 : tensor<256xi32,#blocked0>, %arg1 : tensor<256xi32,#blocked0>) { // CHECK: llvm.add // CHECK: llvm.add %1 = arith.addi %arg0, %arg1 : tensor<256xi32,#blocked0> return } } // ----- module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_program_id func @basic_program_id() { // CHECK: nvvm.read.ptx.sreg.ctaid.x : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_addptr func @basic_addptr(%arg0 : tensor<256x!tt.ptr,#blocked0>, %arg1 : tensor<256xi32,#blocked0>) { // CHECK: llvm.getelementptr // CHECK: llvm.getelementptr %0 = tt.addptr %arg0, %arg1 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> return } } // ----- #shared0 = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK: llvm.mlir.global external @global_smem // CHECK-LABEL: basic_alloc_tensor func @basic_alloc_tensor() { // CHECK: llvm.mlir.addressof @global_smem // CHECK-NEXT: llvm.bitcast // CHECK-NEXT: llvm.mlir.constant // CHECK-NEXT: llvm.getelementptr // CHECK-NEXT: llvm.bitcast %0 = triton_gpu.alloc_tensor : tensor<16x16xf16, #shared0> return } } // ----- #shared0 = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK: llvm.mlir.global external @global_smem // CHECK-LABEL: basic_extract_slice func @basic_extract_slice() { // CHECK: llvm.mlir.addressof @global_smem // CHECK: llvm.extractvalue // CHECK-NEXT: llvm.extractvalue // CHECK-NEXT: llvm.extractvalue // CHECK-NEXT: llvm.extractvalue // CHECK-NEXT: llvm.extractvalue // CHECK-NEXT: llvm.extractvalue // CHECK-NEXT: llvm.extractvalue // CHECK-NEXT: llvm.add // CHECK-NEXT: llvm.mlir.constant(0 : i32) : i32 // CHECK-NEXT: llvm.add // CHECK-NEXT: llvm.mlir.constant(0 : i32) : i32 // CHECK-NEXT: llvm.add // CHECK-NEXT: llvm.mlir.constant(0 : i32) : i32 // CHECK-NEXT: llvm.mul // CHECK-NEXT: llvm.add // 
CHECK-NEXT: llvm.mul // CHECK-NEXT: llvm.add // CHECK-NEXT: llvm.mul // CHECK-NEXT: llvm.add // CHECK-NEXT: llvm.getelementptr %index = arith.constant 1 : index %0 = triton_gpu.alloc_tensor : tensor<128x16x32xf32, #shared0> %1 = tensor.extract_slice %0[%index, 0, 0][1, 16, 32][1, 1, 1] : tensor<128x16x32xf32, #shared0> to tensor<16x32xf32, #shared0> return } } // ----- module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_async_wait func @basic_async_wait() { // CHECK: cp.async.wait_group 0x4 triton_gpu.async_wait {num = 4: i32} return } } // ----- #block0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [4], warpsPerCTA = [4], order = [0]}> #block1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [8], warpsPerCTA = [4], order = [0]}> #block2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 1], warpsPerCTA = [4, 1], order = [1, 0]}> #block3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 8], warpsPerCTA = [1, 4], order = [1, 0]}> #slice2d1 = #triton_gpu.slice<{dim = 1, parent=#block2}> #slice3d0 = #triton_gpu.slice<{dim = 0, parent=#block3}> #AL = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #A = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_insert_slice_async_fallback func @basic_insert_slice_async_fallback(%arg0: !tt.ptr {tt.divisibility = 1 : i32}) { %off0_ = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #slice2d1> %off1_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<64xi32, #slice3d0> %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<16xi32, #slice2d1>) -> tensor<16x1xi32, #block2> %off1 = tt.expand_dims %off1_ {axis = 0 : i32} : (tensor<64xi32, #slice3d0>) -> tensor<1x64xi32, #block3> %broadcast_off0_scalar = tt.broadcast %off0 : (tensor<16x1xi32, #block2>) -> tensor<16x64xi32, #block2> %cst_scalar = arith.constant 64 : i32 %cst = tt.splat %cst_scalar : (i32) -> tensor<16x64xi32, #block2> %broadcast_off0_ = arith.muli %broadcast_off0_scalar, %cst : tensor<16x64xi32, #block2> %broadcast_off1_ = tt.broadcast %off1 : (tensor<1x64xi32, #block3>) -> tensor<16x64xi32, #block3> %broadcast_off0 = triton_gpu.convert_layout %broadcast_off0_ : (tensor<16x64xi32, #block2>) -> tensor<16x64xi32, #AL> %broadcast_off1 = triton_gpu.convert_layout %broadcast_off1_ : (tensor<16x64xi32, #block3>) -> tensor<16x64xi32, #AL> %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<16x64xi32, #AL> %a_init = tt.splat %arg0 : (!tt.ptr) -> tensor<16x64x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<16x64x!tt.ptr, #AL>, tensor<16x64xi32, #AL> %tensor = triton_gpu.alloc_tensor : tensor<2x16x64xf16, #A> %index = arith.constant 1 : i32 // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> %a = triton_gpu.insert_slice_async %a_ptr, %tensor, %index {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x64x!tt.ptr, #AL> -> tensor<2x16x64xf16, #A> return } } // ----- #block0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [4], warpsPerCTA = [4], order = [0]}> #block1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [8], warpsPerCTA = [4], order = [0]}> #block2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 1], warpsPerCTA = [4, 1], order = [1, 0]}> #block3 = #triton_gpu.blocked<{sizePerThread = [1, 1], 
threadsPerWarp = [1, 8], warpsPerCTA = [1, 4], order = [1, 0]}> #slice2d1 = #triton_gpu.slice<{dim = 1, parent=#block2}> #slice3d0 = #triton_gpu.slice<{dim = 0, parent=#block3}> #AL = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #A = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_insert_slice_async_v4 func @basic_insert_slice_async_v4(%arg0: !tt.ptr {tt.divisibility = 32 : i32}) { %off0_ = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #slice2d1> %off1_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<64xi32, #slice3d0> %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<16xi32, #slice2d1>) -> tensor<16x1xi32, #block2> %off1 = tt.expand_dims %off1_ {axis = 0 : i32} : (tensor<64xi32, #slice3d0>) -> tensor<1x64xi32, #block3> %broadcast_off0_scalar = tt.broadcast %off0 : (tensor<16x1xi32, #block2>) -> tensor<16x64xi32, #block2> %cst_scalar = arith.constant 64 : i32 %cst = tt.splat %cst_scalar : (i32) -> tensor<16x64xi32, #block2> %broadcast_off0_ = arith.muli %broadcast_off0_scalar, %cst : tensor<16x64xi32, #block2> %broadcast_off1_ = tt.broadcast %off1 : (tensor<1x64xi32, #block3>) -> tensor<16x64xi32, #block3> %broadcast_off0 = triton_gpu.convert_layout %broadcast_off0_ : (tensor<16x64xi32, #block2>) -> tensor<16x64xi32, #AL> %broadcast_off1 = triton_gpu.convert_layout %broadcast_off1_ : (tensor<16x64xi32, #block3>) -> tensor<16x64xi32, #AL> %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<16x64xi32, #AL> %a_init = tt.splat %arg0 : (!tt.ptr) -> tensor<16x64x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<16x64x!tt.ptr, #AL>, tensor<16x64xi32, #AL> %tensor = triton_gpu.alloc_tensor : tensor<2x16x64xf32, #A> %index = arith.constant 1 : i32 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att // CHECK-SAME: cp.async.cg.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x10, 0x10 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att // CHECK-SAME: cp.async.cg.shared.global [ ${{.*}} + 16 ], [ ${{.*}} + 0 ], 0x10, 0x10 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att // CHECK-SAME: cp.async.commit_group %a = triton_gpu.insert_slice_async %a_ptr, %tensor, %index {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x64x!tt.ptr, #AL> -> tensor<2x16x64xf32, #A> triton_gpu.async_commit_group return } } // ----- #block0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [4], warpsPerCTA = [4], order = [0]}> #block1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [8], warpsPerCTA = [4], order = [0]}> #block2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 1], warpsPerCTA = [4, 1], order = [1, 0]}> #block3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 8], warpsPerCTA = [1, 4], order = [1, 0]}> #slice2d1 = #triton_gpu.slice<{dim = 1, parent=#block2}> #slice3d0 = #triton_gpu.slice<{dim = 0, parent=#block3}> #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #A = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 4, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_insert_slice_async_v1 func @basic_insert_slice_async_v1(%arg0: !tt.ptr {tt.divisibility = 4 : i32}) { %off0_ = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #slice2d1> %off1_ = 
tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #slice3d0> %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<16xi32, #slice2d1>) -> tensor<16x1xi32, #block2> %off1 = tt.expand_dims %off1_ {axis = 0 : i32} : (tensor<32xi32, #slice3d0>) -> tensor<1x32xi32, #block3> %broadcast_off0_scalar = tt.broadcast %off0 : (tensor<16x1xi32, #block2>) -> tensor<16x32xi32, #block2> %cst_scalar = arith.constant 32 : i32 %cst = tt.splat %cst_scalar : (i32) -> tensor<16x32xi32, #block2> %broadcast_off0_ = arith.muli %broadcast_off0_scalar, %cst : tensor<16x32xi32, #block2> %broadcast_off1_ = tt.broadcast %off1 : (tensor<1x32xi32, #block3>) -> tensor<16x32xi32, #block3> %broadcast_off0 = triton_gpu.convert_layout %broadcast_off0_ : (tensor<16x32xi32, #block2>) -> tensor<16x32xi32, #AL> %broadcast_off1 = triton_gpu.convert_layout %broadcast_off1_ : (tensor<16x32xi32, #block3>) -> tensor<16x32xi32, #AL> %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<16x32xi32, #AL> %a_init = tt.splat %arg0 : (!tt.ptr) -> tensor<16x32x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<16x32x!tt.ptr, #AL>, tensor<16x32xi32, #AL> %tensor = triton_gpu.alloc_tensor : tensor<2x16x32xf32, #A> %index = arith.constant 1 : i32 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.commit_group %a = triton_gpu.insert_slice_async %a_ptr, %tensor, %index {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x32x!tt.ptr, #AL> -> tensor<2x16x32xf32, #A> triton_gpu.async_commit_group return } } // ----- #block0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [8], warpsPerCTA = [4], order = [0]}> #block2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 1], warpsPerCTA = [4, 1], order = [1, 0]}> #block3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 8], warpsPerCTA = [1, 4], order = [1, 0]}> #slice2d1 = #triton_gpu.slice<{dim = 1, parent=#block2}> #slice3d0 = #triton_gpu.slice<{dim = 0, parent=#block3}> #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #A = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 4, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_insert_slice_async_v1_multictas func @basic_insert_slice_async_v1_multictas(%arg0: !tt.ptr {tt.divisibility = 4 : i32}) { %off0_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #slice2d1> %off1_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #slice3d0> %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<32xi32, #slice2d1>) -> tensor<32x1xi32, #block2> %off1 = tt.expand_dims %off1_ {axis = 0 : i32} : (tensor<32xi32, #slice3d0>) -> tensor<1x32xi32, #block3> %broadcast_off0_scalar = tt.broadcast %off0 : (tensor<32x1xi32, #block2>) -> tensor<32x32xi32, #block2> %cst_scalar = arith.constant 32 : i32 %cst = tt.splat %cst_scalar : (i32) -> tensor<32x32xi32, #block2> %broadcast_off0_ = arith.muli %broadcast_off0_scalar, %cst : tensor<32x32xi32, #block2> %broadcast_off1_ = tt.broadcast %off1 : 
(tensor<1x32xi32, #block3>) -> tensor<32x32xi32, #block3> %broadcast_off0 = triton_gpu.convert_layout %broadcast_off0_ : (tensor<32x32xi32, #block2>) -> tensor<32x32xi32, #AL> %broadcast_off1 = triton_gpu.convert_layout %broadcast_off1_ : (tensor<32x32xi32, #block3>) -> tensor<32x32xi32, #AL> %off = arith.addi %broadcast_off0, %broadcast_off1 : tensor<32x32xi32, #AL> %a_init = tt.splat %arg0 : (!tt.ptr) -> tensor<32x32x!tt.ptr, #AL> %a_ptr = tt.addptr %a_init, %off : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> %tensor = triton_gpu.alloc_tensor : tensor<2x32x32xf32, #A> %index = arith.constant 1 : i32 // CHECK: llvm.mlir.constant(0 : i32) : i32 // CHECK: llvm.mlir.constant(16 : i32) : i32 // CHECK: llvm.mul // CHECK: llvm.add // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.ca.shared.global [ ${{.*}} + 0 ], [ ${{.*}} + 0 ], 0x4, 0x4 // CHECK: llvm.inline_asm // CHECK-SAME: cp.async.commit_group %a = triton_gpu.insert_slice_async %a_ptr, %tensor, %index {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x32x!tt.ptr, #AL> -> tensor<2x32x32xf32, #A> triton_gpu.async_commit_group return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK: basic_splat func @basic_splat(%ptr: !tt.ptr) { // CHECK: llvm.mlir.undef // CHECK: llvm.insertvalue // CHECK: llvm.insertvalue %0 = tt.splat %ptr : (!tt.ptr) -> tensor<256x!tt.ptr,#blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: basic_store func @basic_store(%ptrs: tensor<256x!tt.ptr, #blocked0>, %vals: tensor<256xf32, #blocked0>, %mask: tensor<256xi1, #blocked0>) { // CHECK: llvm.inline_asm // CHECK-SAME: st.global.b32 [ ${{.*}} + 0 ], { ${{.*}} }; // CHECK: llvm.inline_asm // CHECK-SAME: st.global.b32 [ ${{.*}} + 0 ], { ${{.*}} }; tt.store %ptrs, %vals, %mask : tensor<256xf32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [0, 1]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8> // CHECK-LABEL: convert_layout_blocked_blocked func @convert_layout_blocked_blocked(%arg0: tensor<16x16xf32, #blocked0>) { // CHECK: llvm.mlir.addressof @global_smem // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: 
llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: nvvm.barrier0 // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> %0 = triton_gpu.convert_layout %arg0 : (tensor<16x16xf32, #blocked0>) -> tensor<16x16xf32, #blocked1> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [1, 1], order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8> // CHECK-LABEL: convert_layout_blocked_blocked_vec func @convert_layout_blocked_blocked_vec(%arg0: tensor<16x16xf32, #blocked0>) { // CHECK: llvm.mlir.addressof @global_smem // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: nvvm.barrier0 // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> %0 = triton_gpu.convert_layout %arg0 : (tensor<16x16xf32, #blocked0>) -> tensor<16x16xf32, #blocked1> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8> // CHECK-LABEL: convert_layout_blocked_blocked_multi_rep func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<16x16xf32, #blocked0>) { // CHECK: llvm.mlir.addressof @global_smem // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: nvvm.barrier0 // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: nvvm.barrier0 // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: nvvm.barrier0 // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> %0 = triton_gpu.convert_layout %arg0 : (tensor<16x16xf32, #blocked0>) -> tensor<16x16xf32, #blocked1> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}> #shared0 = #triton_gpu.shared<{vec = 1, perPhase=2, maxPhase=8 ,order = [1, 0]}> #mma0 = #triton_gpu.mma<{versionMajor=2, warpsPerCTA=[1,1]}> #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma0}> #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma0}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_dot func @convert_dot(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) { %AA = triton_gpu.convert_layout %A : (tensor<16x16xf16, #blocked0>) -> tensor<16x16xf16, #shared0> %BB = triton_gpu.convert_layout %B : (tensor<16x16xf16, #blocked0>) -> tensor<16x16xf16, 
#shared0> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4 %AA_DOT = triton_gpu.convert_layout %AA : (tensor<16x16xf16, #shared0>) -> tensor<16x16xf16, #dot_operand_a> %BB_DOT = triton_gpu.convert_layout %BB : (tensor<16x16xf16, #shared0>) -> tensor<16x16xf16, #dot_operand_b> %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma0> // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 %D = tt.dot %AA_DOT, %BB_DOT, %cst0 {allowTF32 = true, transA = false, transB = false} : tensor<16x16xf16, #dot_operand_a> * tensor<16x16xf16, #dot_operand_b> -> tensor<16x16xf32, #mma0> return } } // TODO: problems in MLIR's parser on slice layout // #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}> // module attributes {"triton_gpu.num-warps" = 1 : i32} { // func @make_range_sliced_layout() { // %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked0}>> // return // } // } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}> #mma = #triton_gpu.mma<{versionMajor = 2, warpsPerCTA = [2, 2]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8> // CHECK-LABEL: convert_layout_mmav2_block func @convert_layout_mmav2_blocked(%arg0: tensor<32x16xf32, #mma>) { // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: nvvm.barrier0 // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> %0 = triton_gpu.convert_layout %arg0 : (tensor<32x16xf32, #mma>) -> tensor<32x16xf32, #blocked0> return } } // ----- #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0]}> #mma = #triton_gpu.mma<{versionMajor = 1, versionMinor = 3, warpsPerCTA = [2, 2]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8> // CHECK-LABEL: convert_layout_mmav1_block func @convert_layout_mmav1_blocked(%arg0: tensor<32x64xf32, #mma>) { // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: nvvm.barrier0 // CHECK: llvm.load // CHECK-SAME: !llvm.ptr, 3> %0 = triton_gpu.convert_layout %arg0 : (tensor<32x64xf32, #mma>) -> tensor<32x64xf32, #blocked> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> #shared0 = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8> // CHECK-LABEL: convert_layout_blocked_shared func @convert_layout_blocked_shared(%arg0: tensor<128x32xf32, #blocked0>) { // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> // CHECK: llvm.store // CHECK-SAME: !llvm.ptr, 3> %0 = triton_gpu.convert_layout %arg0 : (tensor<128x32xf32, #blocked0>) -> tensor<128x32xf32, #shared0> return } } // ----- #blocked0 = 
#triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_blocked1d_to_slice0 func @convert_blocked1d_to_slice0(%src:tensor<32xi32, #blocked0>) { // CHECK-COUNT-4: llvm.load {{.*}} : !llvm.ptr, 3> %cvt = triton_gpu.convert_layout %src : (tensor<32xi32, #blocked0>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_blocked1d_to_slice1 func @convert_blocked1d_to_slice1(%src:tensor<32xi32, #blocked0>) { // CHECK-COUNT-32: llvm.load {{.*}} : !llvm.ptr, 3> %cvt = triton_gpu.convert_layout %src : (tensor<32xi32, #blocked0>) -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> module attributes {"triton_gpu.num-warps" = 1 : i32} { // CHECK-LABEL: convert_blocked_to_blocked_ptr func @convert_blocked_to_blocked_ptr(%src:tensor<32x!tt.ptr, #blocked0>) { // CHECK: llvm.ptrtoint // CHECK: llvm.store // CHECK: nvvm.barrier0 // CHECK: llvm.inttoptr // CHECK-COUNT-4: llvm.insertvalue %cvt = triton_gpu.convert_layout %src : (tensor<32x!tt.ptr, #blocked0>) -> tensor<32x!tt.ptr, #blocked1> return } } // ----- #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0]}> #shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> #mma = #triton_gpu.mma<{versionMajor = 2, warpsPerCTA = [2, 2]}> #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma}> #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma}> module attributes {"triton_gpu.num-warps" = 4 : i32} { func @matmul_kernel_dot_operand_layout(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, %a:tensor<128x32xf16, #shared>, %b:tensor<32x256xf16, #shared>) { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma> // CHECK: ldmatrix.sync.aligned.m8n8.x4.shared.b16 %a_mat = triton_gpu.convert_layout %a : (tensor<128x32xf16, #shared>) -> tensor<128x32xf16, #dot_operand_a> %b_mat = triton_gpu.convert_layout %b : (tensor<32x256xf16, #shared>) -> tensor<32x256xf16, #dot_operand_b> %28 = tt.dot %a_mat, %b_mat, %cst {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #dot_operand_a> * tensor<32x256xf16, #dot_operand_b> -> tensor<128x256xf32, #mma> %38 = triton_gpu.convert_layout %28 : (tensor<128x256xf32, #mma>) -> tensor<128x256xf32, #blocked> %30 = tt.splat %ptr : (!tt.ptr) -> tensor<128x1x!tt.ptr, #blocked> %36 = tt.broadcast %30 : (tensor<128x1x!tt.ptr, #blocked>) -> tensor<128x256x!tt.ptr, #blocked> tt.store %36, %38 : tensor<128x256xf32, #blocked> return } } // ----- #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0]}> #shared0 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0]}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase 
= 1, maxPhase = 4, order = [1, 0]}> #mma = #triton_gpu.mma<{versionMajor = 1, versionMinor = 3, warpsPerCTA = [2, 2]}> #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma, isMMAv1Row=true}> #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma, isMMAv1Row=true}> module attributes {"triton_gpu.num-warps" = 4 : i32} { func @matmul884_kernel_dot_operand_layout(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, %a:tensor<32x64xf16, #shared0>, %b:tensor<64x64xf16, #shared1>) { %cst = arith.constant dense<0.000000e+00> : tensor<32x64xf32, #mma> // CHECK: ldmatrix.sync.aligned.m8n8.x4.shared.b16 %a_mat = triton_gpu.convert_layout %a : (tensor<32x64xf16, #shared0>) -> tensor<32x64xf16, #dot_operand_a> %b_mat = triton_gpu.convert_layout %b : (tensor<64x64xf16, #shared1>) -> tensor<64x64xf16, #dot_operand_b> %28 = tt.dot %a_mat, %b_mat, %cst {allowTF32 = true, transA = false, transB = false} : tensor<32x64xf16, #dot_operand_a> * tensor<64x64xf16, #dot_operand_b> -> tensor<32x64xf32, #mma> %38 = triton_gpu.convert_layout %28 : (tensor<32x64xf32, #mma>) -> tensor<32x64xf32, #blocked> %30 = tt.splat %ptr : (!tt.ptr) -> tensor<32x1x!tt.ptr, #blocked> %36 = tt.broadcast %30 : (tensor<32x1x!tt.ptr, #blocked>) -> tensor<32x64x!tt.ptr, #blocked> tt.store %36, %38 : tensor<32x64xf32, #blocked> return } } // ----- #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0]}> #shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#blocked}> #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#blocked}> module attributes {"triton_gpu.num-warps" = 4 : i32} { func @matmul_fmadot(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, %a:tensor<32x16xf32, #shared>, %b:tensor<16x32xf32, #shared>) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> // CHECK: llvm.intr.fmuladd %a_mat = triton_gpu.convert_layout %a : (tensor<32x16xf32, #shared>) -> tensor<32x16xf32, #dot_operand_a> %b_mat = triton_gpu.convert_layout %b : (tensor<16x32xf32, #shared>) -> tensor<16x32xf32, #dot_operand_b> %28 = tt.dot %a_mat, %b_mat, %cst {allowTF32 = false, transA = false, transB = false} : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #blocked> %30 = tt.splat %ptr : (!tt.ptr) -> tensor<32x1x!tt.ptr, #blocked> %36 = tt.broadcast %30 : (tensor<32x1x!tt.ptr, #blocked>) -> tensor<32x32x!tt.ptr, #blocked> tt.store %36, %28 : tensor<32x32xf32, #blocked> return } } // ----- #mma = #triton_gpu.mma<{versionMajor=2, warpsPerCTA=[2, 2]}> #shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0]}> #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma}> #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: matmul_tf32dot func @matmul_tf32dot(%ptr:!tt.ptr {tt.divisibility = 16 : i32}, %a:tensor<32x16xf32, #shared>, %b:tensor<16x32xf32, #shared>) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16 // CHECK-SAME: (vector<1xf32>, vector<1xf32>, vector<1xf32>, vector<1xf32>) // CHECK: llvm.inline_asm // CHECK-SAME: ldmatrix.sync.aligned.m8n8.x4.shared.b16 // CHECK-SAME: (vector<1xf32>, vector<1xf32>, vector<1xf32>, vector<1xf32>) %a_mat = 
triton_gpu.convert_layout %a : (tensor<32x16xf32, #shared>) -> tensor<32x16xf32, #dot_operand_a> %b_mat = triton_gpu.convert_layout %b : (tensor<16x32xf32, #shared>) -> tensor<16x32xf32, #dot_operand_b> // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 // CHECK: llvm.inline_asm // CHECK-SAME: mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 %28 = tt.dot %a_mat, %b_mat, %cst {allowTF32 = true, transA = false, transB = false} : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #mma> %38 = triton_gpu.convert_layout %28 : (tensor<32x32xf32, #mma>) -> tensor<32x32xf32, #blocked> %30 = tt.splat %ptr : (!tt.ptr) -> tensor<32x1x!tt.ptr, #blocked> %36 = tt.broadcast %30 : (tensor<32x1x!tt.ptr, #blocked>) -> tensor<32x32x!tt.ptr, #blocked> tt.store %36, %38 : tensor<32x32xf32, #blocked> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: atomic_add_f32 func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) { // CHECK: llvm.inline_asm // CHECK-SAME: atom.global.gpu.add.f32 %0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32} : (tensor<256x!tt.ptr, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { func @test_get_program_id(%a: tensor<32x!tt.ptr, #blocked0>) { %blockidx = tt.get_program_id {axis=0:i32} : i32 %blockidy = tt.get_program_id {axis=1:i32} : i32 %blockidz = tt.get_program_id {axis=2:i32} : i32 // CHECK: nvvm.read.ptx.sreg.ctaid.x // CHECK: nvvm.read.ptx.sreg.ctaid.y // CHECK: nvvm.read.ptx.sreg.ctaid.z %v0 = arith.addi %blockidx, %blockidy : i32 %v1 = arith.addi %v0, %blockidz : i32 %0 = tt.splat %v1 : (i32) -> tensor<32xi32, #blocked0> tt.store %a, %0 : tensor<32xi32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { func @test_get_num_program(%a: tensor<32x!tt.ptr, #blocked0>) { // CHECK: nvvm.read.ptx.sreg.nctaid.x // CHECK: nvvm.read.ptx.sreg.nctaid.y // CHECK: nvvm.read.ptx.sreg.nctaid.z %blockdimx = tt.get_num_programs {axis=0:i32} : i32 %blockdimy = tt.get_num_programs {axis=1:i32} : i32 %blockdimz = tt.get_num_programs {axis=2:i32} : i32 %v0 = arith.addi %blockdimx, %blockdimy : i32 %v1 = arith.addi %v0, %blockdimz : i32 %0 = tt.splat %v1 : (i32) -> tensor<32xi32, #blocked0> tt.store %a, %0 : tensor<32xi32, #blocked0> return } } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: test_index_cache func @test_index_cache() { // CHECK: nvvm.read.ptx.sreg.tid.x %0 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> // CHECK-NOT: nvvm.read.ptx.sreg.tid.x %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> return } } // 
-----

#blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
#shared0 = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0]}>
module attributes {"triton_gpu.num-warps" = 1 : i32} {
  // CHECK-LABEL: test_base_index_cache
  func @test_base_index_cache(%arg0: tensor<128x32xf32, #blocked0>) {
    // CHECK: nvvm.read.ptx.sreg.tid.x
    %0 = triton_gpu.convert_layout %arg0 : (tensor<128x32xf32, #blocked0>) -> tensor<128x32xf32, #shared0>
    // CHECK-NOT: nvvm.read.ptx.sreg.tid.x
    %1 = triton_gpu.convert_layout %arg0 : (tensor<128x32xf32, #blocked0>) -> tensor<128x32xf32, #shared0>
    return
  }
}

// -----

#blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
#shared0 = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0]}>
module attributes {"triton_gpu.num-warps" = 1 : i32} {
  // CHECK-LABEL: test_index_cache_different_block
  func @test_index_cache_different_block(%arg0: tensor<128x32xf32, #blocked0>, %arg1: i1) {
    // CHECK: nvvm.read.ptx.sreg.tid.x
    %0 = triton_gpu.convert_layout %arg0 : (tensor<128x32xf32, #blocked0>) -> tensor<128x32xf32, #shared0>
    scf.if %arg1 {
      // CHECK-NOT: nvvm.read.ptx.sreg.tid.x
      %1 = triton_gpu.convert_layout %arg0 : (tensor<128x32xf32, #blocked0>) -> tensor<128x32xf32, #shared0>
    }
    return
  }
}

triton-2.0.0/test/Target/tritongpu_to_llvmir.mlir

// RUN: %PYTHON -m triton.tools.aot %s --target=llvm-ir --sm=80 | FileCheck %s

// == LLVM IR check begin ==
// CHECK-LABEL: ; ModuleID = 'LLVMDialectModule'
// CHECK: define void @test_empty_kernel
// CHECK: !nvvm.annotations
// CHECK: !{void (i32, half addrspace(1)*)* @test_empty_kernel, !"maxntidx", i32 128}

module attributes {"triton_gpu.num-warps" = 4 : i32} {
  func @test_empty_kernel(%lb : index, %A : !tt.ptr) {
    return
  }
}

triton-2.0.0/test/Target/tritongpu_to_ptx.mlir

// RUN: %PYTHON -m triton.tools.aot %s --target=ptx --sm=80 --ptx-version=63 | FileCheck %s

// CHECK-LABEL: // Generated by LLVM NVPTX Back-End
// CHECK: .version 6.3
// CHECK: .target sm_80
// CHECK: .address_size 64

module attributes {"triton_gpu.num-warps" = 4 : i32} {
  func @test_empty_kernel(%lb : index, %A : !tt.ptr) {
    return
  }
}

triton-2.0.0/test/Triton/combine.mlir

// RUN: triton-opt %s -split-input-file -canonicalize -triton-combine
// RUN: triton-opt %s -split-input-file -canonicalize -triton-combine | FileCheck %s

// CHECK-LABEL: @test_combine_dot_add_pattern
func @test_combine_dot_add_pattern() -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  // CHECK: %[[d:.*]] = arith.constant dense<3.000000e+00> : tensor<128x128xf32>
  // CHECK: %[[b:.*]] = arith.constant dense<2.000000e+00> : tensor<128x128xf32>
  // CHECK: %[[a:.*]] = arith.constant dense<1.000000e+00> : tensor<128x128xf32>
  %a = arith.constant dense<1.0> : tensor<128x128xf32>
  %b = arith.constant dense<2.0> : tensor<128x128xf32>
  %zero = arith.constant dense<0.0> : tensor<128x128xf32>
  %d = arith.constant dense<3.0> : tensor<128x128xf32>
  %dot_out = tt.dot %a, %b, %zero {allowTF32 = true, transA = false, transB = false} :
tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32> // CHECK-NEXT: %[[res0:.*]] = tt.dot %[[a]], %[[b]], %[[d]] {allowTF32 = true} : tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32> %res0 = arith.addf %dot_out, %d : tensor<128x128xf32> // CHECK-NEXT: %[[res1:.*]] = tt.dot %[[a]], %[[b]], %[[d]] {allowTF32 = true} : tensor<128x128xf32> * tensor<128x128xf32> -> tensor<128x128xf32> %res1 = arith.addf %d, %dot_out : tensor<128x128xf32> return %res0, %res1 : tensor<128x128xf32>, tensor<128x128xf32> } // COM: CHECK-LABEL: @test_combine_addptr_pattern func @test_combine_addptr_pattern(%base: !tt.ptr) -> tensor<8x!tt.ptr> { %off0 = arith.constant 10 : i32 %off1 = arith.constant 15 : i32 // 10 + 15 = 25 // COM: CHECK-NEXT: %[[cst:.*]] = arith.constant dense<25> : tensor<8xi32> %base_ = tt.broadcast %base : (!tt.ptr) -> tensor<8x!tt.ptr> // COM: CHECK-NEXT: %[[tmp0:.*]] = tt.broadcast %{{.*}} : (!tt.ptr) -> tensor<8x!tt.ptr> %idx0 = tt.broadcast %off0 : (i32) -> tensor<8xi32> %idx1 = tt.broadcast %off1 : (i32) -> tensor<8xi32> // COM: CHECK-NEXT: %1 = tt.addptr %[[tmp0]], %[[cst]] : tensor<8x!tt.ptr>, tensor<8xi32> %ptr0 = tt.addptr %base_, %idx0 : tensor<8x!tt.ptr>, tensor<8xi32> %ptr1 = tt.addptr %ptr0, %idx1 : tensor<8x!tt.ptr>, tensor<8xi32> return %ptr1 : tensor<8x!tt.ptr> } // CHECK-LABEL: @test_combine_select_masked_load_pattern func @test_combine_select_masked_load_pattern(%ptr: tensor<8x!tt.ptr>, %cond: i1) -> (tensor<8xf32>, tensor<8xf32>) { %mask = tt.broadcast %cond : (i1) -> tensor<8xi1> %false_val = arith.constant dense<0.0> : tensor<8xf32> // CHECK: %[[res1:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %x = tt.load %ptr, %mask, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %0 = select %cond, %x, %false_val : tensor<8xf32> // CHECK: %[[res2:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %y = tt.load %ptr, %mask, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %1 = select %cond, %y, %false_val : tensor<8xf32> // CHECK: return %[[res1]], %[[res2]] : tensor<8xf32>, tensor<8xf32> return %0, %1 : tensor<8xf32>, tensor<8xf32> } // CHECK-LABEL: @test_combine_select_masked_load_fail_pattern func @test_combine_select_masked_load_fail_pattern(%ptr: tensor<8x!tt.ptr>, %dummy_load: tensor<8xf32>, %dummy_broadcast: tensor<8xi1>, %cond0: i1, %cond1: i1) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) { %false_val = arith.constant dense<0.0> : tensor<8xf32> // Case 1: value at the "load" position is not an "op". Select should not be canonicalized. // CHECK: %{{.*}} = select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32> %0 = select %cond0, %dummy_load, %false_val : tensor<8xf32> // Case 2: value at the "broadcast" position is not an "op". Select should not be canonicalized. %real_load0 = tt.load %ptr, %dummy_broadcast, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> // CHECK: %{{.*}} = select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32> %1 = select %cond0, %real_load0, %false_val : tensor<8xf32> // Case 3: condition of "broadcast" is not the same as the condition of "select". Select should not be canonicalized. 
%cond0_ = tt.broadcast %cond0 : (i1) -> tensor<8xi1> %real_load1 = tt.load %ptr, %cond0_, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> // CHECK: %{{.*}} = select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32> %2 = select %cond1, %real_load1, %false_val : tensor<8xf32> return %0, %1, %2 : tensor<8xf32>, tensor<8xf32>, tensor<8xf32> } // CHECK-LABEL: @test_combine_broadcast_constant_pattern func @test_combine_broadcast_constant_pattern(%cst : f32) -> tensor<8x2xf32> { // CHECK: %[[cst:.*]] = arith.constant dense<1.000000e+00> : tensor<8x2xf32> %const = arith.constant dense<1.0> : tensor<8xf32> %bst_out = tt.broadcast %const : (tensor<8xf32>) -> tensor<8x2xf32> // CHECK-NEXT: return %[[cst]] : tensor<8x2xf32> return %bst_out : tensor<8x2xf32> } // CHECK-LABEL: @test_canonicalize_masked_load_pattern func @test_canonicalize_masked_load_pattern(%ptr: tensor<8x!tt.ptr>) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) { %true_mask = arith.constant dense : tensor<8xi1> %false_mask = arith.constant dense : tensor<8xi1> %other_val = arith.constant dense<0.0> : tensor<8xf32> // true_mask with other // CHECK: %[[res1:.*]] = tt.load %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %x = tt.load %ptr, %true_mask {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> // true_mask without other // CHECK: %[[res2:.*]] = tt.load %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %y = tt.load %ptr, %true_mask, %other_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> // false_mask with other. It should become "other" (i.e., %y) %z = tt.load %ptr, %false_mask, %y {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> // CHECK: return %[[res1]], %[[res2]], %[[res2]] : tensor<8xf32>, tensor<8xf32>, tensor<8xf32> return %x, %y, %z: tensor<8xf32>, tensor<8xf32>, tensor<8xf32> } // CHECK-LABEL: @test_canonicalize_masked_load_fail_pattern func @test_canonicalize_masked_load_fail_pattern(%ptr: tensor<8x!tt.ptr>, %mask: tensor<8xi1>) -> (tensor<8xf32>, tensor<8xf32>) { %other_val = arith.constant dense<0.0> : tensor<8xf32> // Case: value at the "mask" position is not an "op". Load should not be canonicalized. // CHECK: %[[res1:.*]] = tt.load %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %x = tt.load %ptr, %mask {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> // CHECK: %[[res1:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> %y = tt.load %ptr, %mask, %other_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32> return %x, %y: tensor<8xf32>, tensor<8xf32> } // CHECK-LABEL: @test_canonicalize_masked_store_pattern func @test_canonicalize_masked_store_pattern(%ptr: tensor<8x!tt.ptr>, %val: tensor<8xf32>) { %true_mask = arith.constant dense : tensor<8xi1> %false_mask = arith.constant dense : tensor<8xi1> // CHECK: tt.store %{{.*}}, %{{.*}} : tensor<8xf32> tt.store %ptr, %val, %true_mask : tensor<8xf32> // The following store should disappear. // CHECK-NEXT: return tt.store %ptr, %val, %false_mask : tensor<8xf32> return } // CHECK-LABEL: @test_canonicalize_masked_store_fail_pattern func @test_canonicalize_masked_store_fail_pattern(%ptr: tensor<8x!tt.ptr>, %val: tensor<8xf32>, %mask: tensor<8xi1>) { // Case: value at the "mask" position is not an "op". Store should not be canonicalized. 
  // CHECK: tt.store %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
  tt.store %ptr, %val, %mask : tensor<8xf32>
  return
}

triton-2.0.0/test/Triton/vecadd.mlir

// RUN: triton-opt %s -verify-diagnostics
module {
  func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32) {
    %0 = tt.get_program_id {axis = 0 : i32} : i32
    %c256_i32 = arith.constant 256 : i32
    %1 = arith.muli %0, %c256_i32 : i32
    %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
    %3 = tt.broadcast %1 : (i32) -> tensor<256xi32>
    %4 = arith.addi %3, %2 : tensor<256xi32>
    %5 = tt.broadcast %arg3 : (i32) -> tensor<256xi32>
    %6 = arith.cmpi slt, %4, %5 : tensor<256xi32>
    %7 = tt.broadcast %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr>
    %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr>, tensor<256xi32>
    %9 = tt.broadcast %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr>
    %10 = tt.addptr %9, %4 : tensor<256x!tt.ptr>, tensor<256xi32>
    %cst = arith.constant 0.000000e+00 : f32
    %11 = tt.broadcast %cst : (f32) -> tensor<256xf32>
    %c0_i32 = arith.constant 0 : i32
    %c32_i32 = arith.constant 32 : i32
    %12 = arith.index_cast %c0_i32 : i32 to index
    %13 = arith.index_cast %arg4 : i32 to index
    %14 = arith.index_cast %c32_i32 : i32 to index
    %15:3 = scf.for %arg6 = %12 to %13 step %14 iter_args(%arg7 = %11, %arg8 = %8, %arg9 = %10) -> (tensor<256xf32>, tensor<256x!tt.ptr>, tensor<256x!tt.ptr>) {
      %cst_0 = arith.constant 0.000000e+00 : f32
      %18 = tt.broadcast %cst_0 : (f32) -> tensor<256xf32>
      %19 = tt.load %arg8, %6, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
      %cst_1 = arith.constant 0.000000e+00 : f32
      %20 = tt.broadcast %cst_1 : (f32) -> tensor<256xf32>
      %21 = tt.load %arg9, %6, %20 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
      %22 = arith.addf %19, %21 : tensor<256xf32>
      %23 = arith.addf %arg7, %22 : tensor<256xf32>
      %24 = tt.broadcast %arg5 : (i32) -> tensor<256xi32>
      %25 = tt.addptr %arg8, %24 : tensor<256x!tt.ptr>, tensor<256xi32>
      %26 = tt.broadcast %arg5 : (i32) -> tensor<256xi32>
      %27 = tt.addptr %arg9, %26 : tensor<256x!tt.ptr>, tensor<256xi32>
      scf.yield %23, %25, %27 : tensor<256xf32>, tensor<256x!tt.ptr>, tensor<256x!tt.ptr>
    }
    %16 = tt.broadcast %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr>
    %17 = tt.addptr %16, %4 : tensor<256x!tt.ptr>, tensor<256xi32>
    tt.store %17, %15#0, %6 : tensor<256xf32>
    return
  }
}

// module {
//   func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32) {
//     %c64 = arith.constant 64 : index
//     %c32 = arith.constant 32 : index
//     %c0 = arith.constant 0 : index
//     %cst = arith.constant 0.000000e+00 : f32
//     %c256_i32 = arith.constant 256 : i32
//     %0 = tt.get_program_id {axis = 0 : i32} : i32
//     %1 = arith.muli %0, %c256_i32 : i32
//     %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu<"coalesced encoding">>
//     %3 = tt.broadcast %1 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">>
//     %4 = arith.addi %3, %2 : tensor<256xi32, #triton_gpu<"coalesced encoding">>
//     %5 = tt.broadcast %arg3 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">>
//     %6 = "triton_gpu.cmpi"(%4, %5) {predicate = 2 : i64} : (tensor<256xi32, #triton_gpu<"coalesced encoding">>, tensor<256xi32, #triton_gpu<"coalesced encoding">>) -> tensor<256xi1, #triton_gpu<"coalesced encoding">>
//     %7 = tt.broadcast %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr,
#triton_gpu<"coalesced encoding">> // %8 = tt.addptr %7, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %9 = tt.broadcast %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> // %10 = tt.addptr %9, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %11 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %12 = arith.index_cast %arg4 : i32 to index // %13 = arith.cmpi slt, %c0, %12 : index // %14 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %15 = tt.broadcast %13 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> // %16 = arith.andi %6, %15 : tensor<256xi1, #triton_gpu<"coalesced encoding">> // %17 = triton_gpu.copy_async %8, %16, %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %18 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %19 = tt.broadcast %13 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> // %20 = arith.andi %6, %19 : tensor<256xi1, #triton_gpu<"coalesced encoding">> // %21 = triton_gpu.copy_async %10, %20, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %22 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %23 = tt.addptr %8, %22, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %24 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %25 = tt.addptr %10, %24, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %26 = arith.cmpi slt, %c32, %12 : index // %27 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %28 = tt.broadcast %26 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> // %29 = arith.andi %6, %28 : tensor<256xi1, #triton_gpu<"coalesced encoding">> // %30 = triton_gpu.copy_async %23, %29, %27 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %31 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %32 = tt.broadcast %26 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> // %33 = arith.andi %6, %32 : tensor<256xi1, #triton_gpu<"coalesced encoding">> // %34 = triton_gpu.copy_async %25, %33, %31 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %35 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %36 = tt.addptr %23, %35, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %37 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %38 = tt.addptr %25, %37, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %39 = arith.cmpi slt, %c64, %12 : index // %40 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %41 = tt.broadcast %39 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> // %42 = arith.andi %6, %41 : tensor<256xi1, #triton_gpu<"coalesced encoding">> // %43 = triton_gpu.copy_async %36, %42, %40 {cache = 1 : i32, evict 
= 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %44 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %45 = tt.broadcast %39 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> // %46 = arith.andi %6, %45 : tensor<256xi1, #triton_gpu<"coalesced encoding">> // %47 = triton_gpu.copy_async %38, %46, %44 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %48 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %49 = tt.addptr %36, %48, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %50 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %51 = tt.addptr %38, %50, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %52:12 = scf.for %arg6 = %c0 to %12 step %c32 iter_args(%arg7 = %11, %arg8 = %8, %arg9 = %10, %arg10 = %17, %arg11 = %30, %arg12 = %43, %arg13 = %21, %arg14 = %34, %arg15 = %47, %arg16 = %51, %arg17 = %49, %arg18 = %c64) -> (tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, index) { // %55 = arith.addf %arg10, %arg13 : tensor<256xf32, #triton_gpu<"coalesced encoding">> // %56 = arith.addf %arg7, %55 : tensor<256xf32, #triton_gpu<"coalesced encoding">> // %57 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %58 = tt.addptr %arg8, %57, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %59 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %60 = tt.addptr %arg9, %59, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %61 = arith.addi %arg18, %c32 : index // %62 = arith.cmpi slt, %61, %12 : index // %63 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %64 = tt.broadcast %62 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> // %65 = arith.andi %64, %6 : tensor<256xi1, #triton_gpu<"coalesced encoding">> // %66 = triton_gpu.copy_async %arg17, %65, %63 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %67 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %68 = triton_gpu.copy_async %arg16, %65, %67 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> // %69 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %70 = tt.addptr %arg17, %69, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // %71 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> // %72 = tt.addptr %arg16, %71, 
: tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // scf.yield %56, %58, %60, %arg11, %arg12, %66, %arg14, %arg15, %68, %72, %70, %61 : tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, index // } // %53 = tt.broadcast %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> // %54 = tt.addptr %53, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xi32> // tt.store %54, %52#0, %6 : tensor<256xf32, #triton_gpu<"coalesced encoding">> // return // } // } triton-2.0.0/test/TritonGPU/000077500000000000000000000000001440023377100156435ustar00rootroot00000000000000triton-2.0.0/test/TritonGPU/coalesce.mlir000066400000000000000000000077141440023377100203170ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file -tritongpu-coalesce -canonicalize | FileCheck %s #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1]}> #slice1dim1 = #triton_gpu.slice<{dim = 1, parent = #blocked1}> #slice2dim0 = #triton_gpu.slice<{dim = 0, parent = #blocked2}> module attributes {"triton_gpu.num-warps" = 4 : i32} { // CHECK: [[row_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> // CHECK: [[col_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> // CHECK: [[load_ptr:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64x!tt.ptr, [[row_layout]]> // CHECK: [[load_mask:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xi1, [[row_layout]]> // CHECK: [[load_other:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xf32, [[row_layout]]> // CHECK: [[load_val:%.*]] = tt.load [[load_ptr]], [[load_mask]], [[load_other]] {{.*}} : tensor<64x64xf32, [[row_layout]]> // CHECK: [[store_ptr:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64x!tt.ptr, [[col_layout]]> // CHECK: [[store_val:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xf32, [[col_layout]]> // CHECK: [[store_mask:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xi1, [[col_layout]]> // CHECK: tt.store [[store_ptr]], [[store_val]], [[store_mask]] func @transpose(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) { %cst = arith.constant dense : tensor<64x64xi1, #blocked1> %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> %00 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice1dim1> %01 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice2dim0> %1 = tt.expand_dims %00 {axis = 1 : i32} : (tensor<64xi32, #slice1dim1>) 
-> tensor<64x1xi32, #blocked1> %2 = tt.splat %arg1 : (i32) -> tensor<64x1xi32, #blocked1> %3 = arith.muli %1, %2 : tensor<64x1xi32, #blocked1> %4 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> %5 = tt.addptr %4, %3 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> %6 = tt.expand_dims %01 {axis = 0 : i32} : (tensor<64xi32, #slice2dim0>) -> tensor<1x64xi32, #blocked2> %7 = tt.broadcast %5 : (tensor<64x1x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked1> %8 = tt.broadcast %6 : (tensor<1x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %9 = triton_gpu.convert_layout %8 : (tensor<64x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked1> %10 = tt.addptr %7, %9 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> %11 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> %12 = tt.addptr %11, %1 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> %13 = tt.splat %arg3 : (i32) -> tensor<1x64xi32, #blocked2> %14 = arith.muli %6, %13 : tensor<1x64xi32, #blocked2> %15 = tt.broadcast %12 : (tensor<64x1x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked1> %16 = tt.broadcast %14 : (tensor<1x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %17 = triton_gpu.convert_layout %16 : (tensor<64x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked1> %18 = tt.addptr %15, %17 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> %19 = tt.load %10, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x64xf32, #blocked1> tt.store %18, %19, %cst : tensor<64x64xf32, #blocked1> return } }triton-2.0.0/test/TritonGPU/combine.mlir000066400000000000000000002554151440023377100201600ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file -tritongpu-combine 2>&1 | FileCheck %s #layout0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> #layout1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> #layout2 = #triton_gpu.mma<{version = 2, warpsPerCTA = [4, 1]}> // CHECK: [[target_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> // CHECK: [[row_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0]}> // CHECK: [[col_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [0, 1]}> // CHECK: [[col_layout_novec:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> // CHECK-LABEL: cst func @cst() -> tensor<1024xi32, #layout1> { %cst = arith.constant dense<0> : tensor<1024xi32, #layout0> %1 = triton_gpu.convert_layout %cst : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> // CHECK-NOT: triton_gpu.convert_layout // CHECK: return %cst : tensor<1024xi32, [[target_layout]]> return %1: tensor<1024xi32, #layout1> } // CHECK-LABEL: range func @range() -> tensor<1024xi32, #layout1> { %0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #layout0> %1 = triton_gpu.convert_layout %0 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> // CHECK-NOT: triton_gpu.convert_layout // CHECK: return %0 : tensor<1024xi32, [[target_layout]]> return %1: tensor<1024xi32, #layout1> } // CHECK-LABEL: splat func @splat(%arg0: i32) -> tensor<1024xi32, #layout1> { %0 = tt.splat %arg0 : (i32) -> tensor<1024xi32, #layout0> %1 = triton_gpu.convert_layout %0 : 
(tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> // CHECK-NOT: triton_gpu.convert_layout // CHECK: return %0 : tensor<1024xi32, [[target_layout]]> return %1: tensor<1024xi32, #layout1> } // CHECK-LABEL: remat func @remat(%arg0: i32) -> tensor<1024xi32, #layout1> { %0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #layout0> %1 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #layout0> %2 = arith.muli %0, %1 : tensor<1024xi32, #layout0> %3 = triton_gpu.convert_layout %2 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> %4 = tt.splat %arg0 : (i32) -> tensor<1024xi32, #layout0> %5 = triton_gpu.convert_layout %2 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> %6 = arith.addi %3, %5 : tensor<1024xi32, #layout1> return %6: tensor<1024xi32, #layout1> // CHECK: %0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, [[target_layout]]> // CHECK: %1 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, [[target_layout]]> // CHECK: %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, [[target_layout]]> // CHECK: %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, [[target_layout]]> // CHECK: %4 = arith.muli %0, %2 : tensor<1024xi32, [[target_layout]]> // CHECK: %5 = arith.muli %1, %3 : tensor<1024xi32, [[target_layout]]> // CHECK: %6 = arith.addi %4, %5 : tensor<1024xi32, [[target_layout]]> // CHECK: return %6 : tensor<1024xi32, [[target_layout]]> } // CHECK-LABEL: remat_load_store func @remat_load_store(%arg: !tt.ptr {tt.divisibility = 16 : i32}) { %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #layout0> %1 = tt.splat %arg : (!tt.ptr) -> tensor<64x!tt.ptr, #layout0> %2 = tt.addptr %1, %0 : tensor<64x!tt.ptr, #layout0>, tensor<64xi32, #layout0> %3 = tt.load %2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xi32, #layout0> // CHECK-NOT: triton_gpu.convert_layout %4 = triton_gpu.convert_layout %3 : (tensor<64xi32, #layout0>) -> tensor<64xi32, #layout1> %5 = triton_gpu.convert_layout %2 : (tensor<64x!tt.ptr, #layout0>) -> tensor<64x!tt.ptr, #layout1> tt.store %5, %4 : tensor<64xi32, #layout1> return } // Don't rematerialize vectorized loads // CHECK-LABEL: remat_expensive func @remat_expensive(%arg: !tt.ptr {tt.divisibility = 16 : i32}) { %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #layout1> %1 = tt.splat %arg : (!tt.ptr) -> tensor<64x!tt.ptr, #layout1> %2 = tt.addptr %1, %0 : tensor<64x!tt.ptr, #layout1>, tensor<64xi32, #layout1> %3 = tt.load %2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xi32, #layout1> // CHECK: triton_gpu.convert_layout // CHECK-NOT: triton_gpu.convert_layout %4 = triton_gpu.convert_layout %3 : (tensor<64xi32, #layout1>) -> tensor<64xi32, #layout0> %5 = triton_gpu.convert_layout %2 : (tensor<64x!tt.ptr, #layout1>) -> tensor<64x!tt.ptr, #layout0> tt.store %5, %4 : tensor<64xi32, #layout0> return } // Don't rematerialize loads when original and target layouts are different // CHECK-LABEL: remat_multi_layout func @remat_multi_layout(%arg: !tt.ptr {tt.divisibility = 16 : i32}) { %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #layout0> %1 = tt.splat %arg : (!tt.ptr) -> tensor<64x!tt.ptr, #layout0> %2 = tt.addptr %1, %0 : tensor<64x!tt.ptr, #layout0>, tensor<64xi32, #layout0> %3 = tt.load %2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xi32, #layout0> // CHECK: triton_gpu.convert_layout // CHECK-NOT: 
triton_gpu.convert_layout %4 = triton_gpu.convert_layout %3 : (tensor<64xi32, #layout0>) -> tensor<64xi32, #layout2> %5 = triton_gpu.convert_layout %2 : (tensor<64x!tt.ptr, #layout0>) -> tensor<64x!tt.ptr, #layout2> tt.store %5, %4 : tensor<64xi32, #layout2> return } // Always rematerialize single value loads // CHECK-LABEL: remat_single_value func @remat_single_value(%arg: !tt.ptr {tt.divisibility = 16 : i32}) { %0 = tt.splat %arg : (!tt.ptr) -> tensor<1x!tt.ptr, #layout1> %1 = tt.load %0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1xi32, #layout1> // CHECK-NOT: triton_gpu.convert_layout %2 = triton_gpu.convert_layout %1 : (tensor<1xi32, #layout1>) -> tensor<1xi32, #layout0> %3 = triton_gpu.convert_layout %0 : (tensor<1x!tt.ptr, #layout1>) -> tensor<1x!tt.ptr, #layout0> tt.store %3, %2 : tensor<1xi32, #layout0> return } // CHECK-LABEL: if func @if(%arg0: i32, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { // CHECK-NOT: triton_gpu.convert_layout %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout1> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout1> %2 = arith.muli %1, %c32_i32 : tensor<1024xi32, #layout1> %3 = arith.addi %2, %c32_i32 : tensor<1024xi32, #layout1> %4 = arith.cmpi sgt, %0, %arg0 : i32 %5 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #layout0> scf.if %4 { %6 = triton_gpu.convert_layout %2 : (tensor<1024xi32, #layout1>) -> tensor<1024xi32, #layout0> tt.store %5, %6 : tensor<1024xi32, #layout0> } return } // CHECK-LABEL: if_convert_else_not func @if_convert_else_not(%arg0: i32, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout0> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout0> %9 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout1> %2 = arith.muli %1, %c32_i32 : tensor<1024xi32, #layout0> %3 = arith.addi %2, %c32_i32 : tensor<1024xi32, #layout0> %4 = arith.cmpi sgt, %0, %arg0 : i32 %5 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #layout1> %8 = scf.if %4 -> tensor<1024xi32, #layout1> { %6 = triton_gpu.convert_layout %2 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> scf.yield %6 : tensor<1024xi32, #layout1> } else { scf.yield %9 : tensor<1024xi32, #layout1> } // CHECK-NOT: triton_gpu.convert_layout tt.store %5, %8 : tensor<1024xi32, #layout1> return } // CHECK-LABEL: if_not_else_convert func @if_not_else_convert(%arg0: i32, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout0> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout0> %9 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout1> %2 = arith.muli %1, %c32_i32 : tensor<1024xi32, #layout0> %3 = arith.addi %2, %c32_i32 : tensor<1024xi32, #layout0> %4 = arith.cmpi sgt, %0, %arg0 : i32 %5 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #layout1> %8 = scf.if %4 -> tensor<1024xi32, #layout1> { scf.yield %9 : tensor<1024xi32, #layout1> } else { %7 = triton_gpu.convert_layout %3 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> scf.yield %7 : tensor<1024xi32, #layout1> } // CHECK-NOT: triton_gpu.convert_layout tt.store %5, %8 : tensor<1024xi32, #layout1> return } // CHECK-LABEL: if_else_both_convert func @if_else_both_convert(%arg0: i32, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) { %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout0> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = 
tt.splat %0 : (i32) -> tensor<1024xi32, #layout0> %2 = arith.muli %1, %c32_i32 : tensor<1024xi32, #layout0> %3 = arith.addi %2, %c32_i32 : tensor<1024xi32, #layout0> %4 = arith.cmpi sgt, %0, %arg0 : i32 %5 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #layout1> %8 = scf.if %4 -> tensor<1024xi32, #layout1> { %6 = triton_gpu.convert_layout %2 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> scf.yield %6 : tensor<1024xi32, #layout1> } else { %7 = triton_gpu.convert_layout %3 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1> scf.yield %7 : tensor<1024xi32, #layout1> } // CHECK: triton_gpu.convert_layout // CHECK-NOT: triton_gpu.convert_layout tt.store %5, %8 : tensor<1024xi32, #layout1> return } #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> #slice1dim1 = #triton_gpu.slice<{dim = 1, parent = #blocked1}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1]}> #slice2dim0 = #triton_gpu.slice<{dim = 0, parent = #blocked2}> #blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0]}> #blocked4 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [0, 1]}> // CHECK-LABEL: transpose func @transpose(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) { // CHECK-NOT: triton_gpu.convert_layout // CHECK: [[loaded_val:%.*]] = tt.load {{.*}}, {{%cst.*}}, {{%cst.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x64xf32, [[row_layout]]> // CHECK: [[cvt_val:%.*]] = triton_gpu.convert_layout [[loaded_val]] : (tensor<64x64xf32, [[row_layout]]>) -> tensor<64x64xf32, [[col_layout]]> // CHECK: tt.store {{.*}}, [[cvt_val]], {{%cst.*}} : tensor<64x64xf32, [[col_layout]]> // CHECK: return %cst = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> %cst_0 = arith.constant dense : tensor<64x64xi1, #blocked1> %00 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice1dim1> %01 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice2dim0> %1 = tt.expand_dims %00 {axis = 1 : i32} : (tensor<64xi32, #slice1dim1>) -> tensor<64x1xi32, #blocked1> %2 = tt.splat %arg1 : (i32) -> tensor<64x1xi32, #blocked1> %3 = arith.muli %1, %2 : tensor<64x1xi32, #blocked1> %4 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> %5 = tt.addptr %4, %3 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> %6 = tt.expand_dims %01 {axis = 0 : i32} : (tensor<64xi32, #slice2dim0>) -> tensor<1x64xi32, #blocked2> %7 = tt.broadcast %5 : (tensor<64x1x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked1> %8 = tt.broadcast %6 : (tensor<1x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %9 = triton_gpu.convert_layout %8 : (tensor<64x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked1> %10 = tt.addptr %7, %9 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> %11 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> %12 = tt.addptr %11, %1 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> %13 = tt.splat %arg3 : (i32) -> tensor<1x64xi32, #blocked2> %14 = arith.muli %6, %13 : tensor<1x64xi32, #blocked2> %15 = tt.broadcast %12 : 
(tensor<64x1x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked1> %16 = tt.broadcast %14 : (tensor<1x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %17 = triton_gpu.convert_layout %16 : (tensor<64x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked1> %18 = tt.addptr %15, %17 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> %19 = triton_gpu.convert_layout %10 : (tensor<64x64x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked3> %20 = triton_gpu.convert_layout %cst_0 : (tensor<64x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked3> %21 = triton_gpu.convert_layout %cst : (tensor<64x64xf32, #blocked1>) -> tensor<64x64xf32, #blocked3> %22 = tt.load %19, %20, %21 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x64xf32, #blocked3> %23 = triton_gpu.convert_layout %22 : (tensor<64x64xf32, #blocked3>) -> tensor<64x64xf32, #blocked1> %24 = triton_gpu.convert_layout %18 : (tensor<64x64x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked4> %25 = triton_gpu.convert_layout %23 : (tensor<64x64xf32, #blocked1>) -> tensor<64x64xf32, #blocked4> %26 = triton_gpu.convert_layout %cst_0 : (tensor<64x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked4> tt.store %24, %25, %26 : tensor<64x64xf32, #blocked4> return } // CHECK-LABEL: loop func @loop(%arg0: !tt.ptr, %arg1: i32, %arg2: !tt.ptr, %arg3: i32, %arg4: i32) { // CHECK-NOT: triton_gpu.convert_layout // CHECK: [[loop_ret:%.*]]:2 = scf.for {{.*}} -> (tensor<64x64xf32, [[row_layout]]>, tensor<64x64x!tt.ptr, [[row_layout]]>) // CHECK-NEXT: {{.*}} = tt.load {{.*}} : tensor<64x64xf32, [[row_layout]]> // CHECK-NEXT: {{.*}} = arith.addf {{.*}} : tensor<64x64xf32, [[row_layout]]> // CHECK-NEXT: {{.*}} = tt.addptr {{.*}} : tensor<64x64x!tt.ptr, [[row_layout]]>, tensor<64x64xi32, [[row_layout]]> // CHECK-NEXT: scf.yield {{.*}} : tensor<64x64xf32, [[row_layout]]>, tensor<64x64x!tt.ptr, [[row_layout]]> // CHECK-NEXT: } // CHECK-NEXT: {{.*}} = triton_gpu.convert_layout [[loop_ret]]#0 : (tensor<64x64xf32, [[row_layout]]>) -> tensor<64x64xf32, [[col_layout_novec]]> // CHECK-NOT: triton_gpu.convert_layout %cst = arith.constant dense : tensor<64x64xi1, #blocked1> %cst_0 = arith.constant dense<64> : tensor<64x64xi32, #blocked1> %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index %cst_1 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> %00 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice1dim1> %01 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice2dim0> %1 = tt.expand_dims %00 {axis = 1 : i32} : (tensor<64xi32, #slice1dim1>) -> tensor<64x1xi32, #blocked1> %2 = tt.splat %arg1 : (i32) -> tensor<64x1xi32, #blocked1> %3 = arith.muli %1, %2 : tensor<64x1xi32, #blocked1> %4 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> %5 = tt.addptr %4, %3 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> %6 = tt.expand_dims %01 {axis = 0 : i32} : (tensor<64xi32, #slice2dim0>) -> tensor<1x64xi32, #blocked2> %7 = tt.broadcast %5 : (tensor<64x1x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked1> %8 = tt.broadcast %6 : (tensor<1x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %9 = triton_gpu.convert_layout %8 : (tensor<64x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked1> %10 = tt.addptr %7, %9 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> %11:2 = scf.for %arg5 = %c0 to %c32 step %c1 iter_args(%arg6 = %cst_1, %arg7 = %10) -> (tensor<64x64xf32, #blocked1>, tensor<64x64x!tt.ptr, #blocked1>) { %23 = 
triton_gpu.convert_layout %arg7 : (tensor<64x64x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked3> %24 = triton_gpu.convert_layout %cst : (tensor<64x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked3> %25 = triton_gpu.convert_layout %cst_1 : (tensor<64x64xf32, #blocked1>) -> tensor<64x64xf32, #blocked3> %26 = tt.load %23, %24, %25 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x64xf32, #blocked3> %27 = triton_gpu.convert_layout %26 : (tensor<64x64xf32, #blocked3>) -> tensor<64x64xf32, #blocked1> %28 = arith.addf %arg6, %27 : tensor<64x64xf32, #blocked1> %29 = tt.addptr %arg7, %cst_0 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> scf.yield %28, %29 : tensor<64x64xf32, #blocked1>, tensor<64x64x!tt.ptr, #blocked1> } %12 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> %13 = tt.addptr %12, %1 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> %14 = tt.splat %arg3 : (i32) -> tensor<1x64xi32, #blocked2> %15 = arith.muli %6, %14 : tensor<1x64xi32, #blocked2> %16 = tt.broadcast %13 : (tensor<64x1x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked1> %17 = tt.broadcast %15 : (tensor<1x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %18 = triton_gpu.convert_layout %17 : (tensor<64x64xi32, #blocked2>) -> tensor<64x64xi32, #blocked1> %19 = tt.addptr %16, %18 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> %20 = triton_gpu.convert_layout %19 : (tensor<64x64x!tt.ptr, #blocked1>) -> tensor<64x64x!tt.ptr, #blocked1> %21 = triton_gpu.convert_layout %11#0 : (tensor<64x64xf32, #blocked1>) -> tensor<64x64xf32, #blocked1> %22 = triton_gpu.convert_layout %cst : (tensor<64x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked1> tt.store %20, %21, %22 : tensor<64x64xf32, #blocked1> return } // CHECK-LABEL: vecadd func @vecadd(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) { // CHECK-NOT: triton_gpu.convert_layout %c256_i32 = arith.constant 256 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c256_i32 : i32 %2 = tt.splat %1 : (i32) -> tensor<256xi32, #layout1> %3 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #layout1> %4 = tt.splat %1 : (i32) -> tensor<256xi32, #layout1> %5 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #layout1> %6 = tt.splat %1 : (i32) -> tensor<256xi32, #layout1> %7 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #layout1> %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #layout1> %9 = arith.addi %6, %7 : tensor<256xi32, #layout1> %10 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #layout1> %11 = arith.addi %4, %5 : tensor<256xi32, #layout1> %12 = tt.addptr %8, %9 : tensor<256x!tt.ptr, #layout1>, tensor<256xi32, #layout1> %13 = tt.load %12 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #layout1> %14 = triton_gpu.convert_layout %13 : (tensor<256xf32, #layout1>) -> tensor<256xf32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>> %15 = tt.addptr %10, %11 : tensor<256x!tt.ptr, #layout1>, tensor<256xi32, #layout1> %16 = tt.load %15 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #layout1> %17 = triton_gpu.convert_layout %16 : (tensor<256xf32, #layout1>) -> tensor<256xf32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>> %18 = arith.addf %14, %17 : tensor<256xf32, 
#triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>> %19 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #layout1> %20 = arith.addi %2, %3 : tensor<256xi32, #layout1> %21 = tt.addptr %19, %20 : tensor<256x!tt.ptr, #layout1>, tensor<256xi32, #layout1> %22 = triton_gpu.convert_layout %18 : (tensor<256xf32, #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>>) -> tensor<256xf32, #layout1> tt.store %21, %22 : tensor<256xf32, #layout1> return } // Select has args with different element types // CHECK-LABEL: select func @select(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}) { // CHECK-NOT: triton_gpu.convert_layout %cst = arith.constant dense<30000> : tensor<1x1xi32, #blocked2> %cst_0 = arith.constant dense<30000> : tensor<1x512xi32, #blocked2> %c512 = arith.constant 512 : index %c30000 = arith.constant 30000 : index %c0 = arith.constant 0 : index %cst_1 = arith.constant dense<2048> : tensor<1x1xi32, #blocked2> %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x512xf64, #blocked2> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32, #blocked0> %2 = triton_gpu.convert_layout %1 : (tensor<1xi32, #blocked0>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xi32, #blocked1> %4 = triton_gpu.convert_layout %3 : (tensor<1x1xi32, #blocked1>) -> tensor<1x1xi32, #blocked2> %5 = tt.splat %0 : (i32) -> tensor<1x1xi32, #blocked2> %6 = arith.addi %5, %4 : tensor<1x1xi32, #blocked2> %7 = "triton_gpu.cmpi"(%6, %cst_1) {predicate = 2 : i64} : (tensor<1x1xi32, #blocked2>, tensor<1x1xi32, #blocked2>) -> tensor<1x1xi1, #blocked2> %8 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked0> %9 = triton_gpu.convert_layout %8 : (tensor<512xi32, #blocked0>) -> tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> %10 = tt.expand_dims %9 {axis = 0 : i32} : (tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x512xi32, #blocked2> %11 = arith.muli %6, %cst : tensor<1x1xi32, #blocked2> %12 = tt.broadcast %11 : (tensor<1x1xi32, #blocked2>) -> tensor<1x512xi32, #blocked2> %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<1x512x!tt.ptr, #blocked2> %14 = tt.broadcast %7 : (tensor<1x1xi1, #blocked2>) -> tensor<1x512xi1, #blocked2> %15 = scf.for %arg3 = %c0 to %c30000 step %c512 iter_args(%arg4 = %cst_2) -> (tensor<1x512xf64, #blocked2>) { %16 = arith.index_cast %arg3 : index to i32 %17 = tt.splat %16 : (i32) -> tensor<1x512xi32, #blocked2> %18 = arith.addi %17, %10 : tensor<1x512xi32, #blocked2> %19 = "triton_gpu.cmpi"(%18, %cst_0) {predicate = 2 : i64} : (tensor<1x512xi32, #blocked2>, tensor<1x512xi32, #blocked2>) -> tensor<1x512xi1, #blocked2> %20 = arith.addi %18, %12 : tensor<1x512xi32, #blocked2> %21 = tt.addptr %13, %20 : tensor<1x512x!tt.ptr, #blocked2>, tensor<1x512xi32, #blocked2> %22 = arith.andi %19, %14 : tensor<1x512xi1, #blocked2> %23 = triton_gpu.convert_layout %21 : (tensor<1x512x!tt.ptr, #blocked2>) -> tensor<1x512x!tt.ptr, #blocked3> %24 = triton_gpu.convert_layout %22 : (tensor<1x512xi1, #blocked2>) -> tensor<1x512xi1, #blocked3> %25 = tt.load %23, %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x512xf64, #blocked3> %26 = triton_gpu.convert_layout %25 : 
(tensor<1x512xf64, #blocked3>) -> tensor<1x512xf64, #blocked2> %27 = arith.andi %14, %19 : tensor<1x512xi1, #blocked2> %28 = "triton_gpu.cmpf"(%arg4, %26) {predicate = 4 : i64} : (tensor<1x512xf64, #blocked2>, tensor<1x512xf64, #blocked2>) -> tensor<1x512xi1, #blocked2> %29 = arith.andi %27, %28 : tensor<1x512xi1, #blocked2> %30 = "triton_gpu.select"(%29, %26, %arg4) : (tensor<1x512xi1, #blocked2>, tensor<1x512xf64, #blocked2>, tensor<1x512xf64, #blocked2>) -> tensor<1x512xf64, #blocked2> %31 = triton_gpu.convert_layout %21 : (tensor<1x512x!tt.ptr, #blocked2>) -> tensor<1x512x!tt.ptr, #blocked3> %32 = triton_gpu.convert_layout %30 : (tensor<1x512xf64, #blocked2>) -> tensor<1x512xf64, #blocked3> %33 = triton_gpu.convert_layout %27 : (tensor<1x512xi1, #blocked2>) -> tensor<1x512xi1, #blocked3> tt.store %31, %32, %33 : tensor<1x512xf64, #blocked3> scf.yield %30 : tensor<1x512xf64, #blocked2> } return } // Make sure the following IR doesn't hang the compiler. // CHECK-LABEL: long_func func public @long_func(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: !tt.ptr {tt.divisibility = 16 : i32}, %arg13: !tt.ptr {tt.divisibility = 16 : i32}, %arg14: !tt.ptr {tt.divisibility = 16 : i32}, %arg15: !tt.ptr {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}) { %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked0> %cst_0 = arith.constant dense<5.000000e-04> : tensor<1024xf32, #blocked0> %cst_1 = arith.constant dense<0.999499976> : tensor<1024xf32, #blocked0> %cst_2 = arith.constant dense<1.000000e+04> : tensor<1024xf32, #blocked0> %cst_3 = arith.constant dense<5000> : tensor<1024xi32, #blocked0> %cst_4 = arith.constant dense<150> : tensor<1024xi32, #blocked0> %cst_5 = arith.constant dense : tensor<1024xi1, #blocked0> %cst_6 = arith.constant dense<2> : tensor<1024xi32, #blocked0> %cst_7 = arith.constant dense<4999> : tensor<1024xi32, #blocked0> %cst_8 = arith.constant dense<2499> : tensor<1024xi32, #blocked0> %cst_9 = arith.constant dense<2500> : tensor<1024xi32, #blocked0> %cst_10 = arith.constant dense<0.91629076> : tensor<1024xf32, #blocked0> %c2499_i32 = arith.constant 2499 : i32 %cst_11 = arith.constant dense<1024> : tensor<1024xi32, #blocked0> %c1024_i32 = arith.constant 1024 : i32 %cst_12 = arith.constant dense<1> : tensor<1024xi32, #blocked0> %cst_13 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked0> %cst_14 = arith.constant dense<0> : tensor<1024xi32, #blocked0> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c1024_i32 : i32 %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked0> %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked0> %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked0> %5 = "triton_gpu.cmpi"(%4, %cst_11) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %6 = tt.splat %arg5 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %8 = 
triton_gpu.convert_layout %7 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked1> %9 = triton_gpu.convert_layout %5 : (tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked1> %10 = tt.load %8, %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked1> %11 = triton_gpu.convert_layout %10 : (tensor<1024xf32, #blocked1>) -> tensor<1024xf32, #blocked0> %12 = tt.splat %arg7 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %13 = tt.addptr %12, %4 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %14 = triton_gpu.convert_layout %13 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked2> %15 = triton_gpu.convert_layout %5 : (tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked2> %16 = tt.load %14, %15 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xi64, #blocked2> %17 = triton_gpu.convert_layout %16 : (tensor<1024xi64, #blocked2>) -> tensor<1024xi64, #blocked0> %18 = tt.splat %arg8 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %19 = tt.addptr %18, %4 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %20 = triton_gpu.convert_layout %19 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked1> %21 = triton_gpu.convert_layout %5 : (tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked1> %22 = tt.load %20, %21 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked1> %23 = triton_gpu.convert_layout %22 : (tensor<1024xf32, #blocked1>) -> tensor<1024xf32, #blocked0> %24 = arith.subf %cst_13, %11 : tensor<1024xf32, #blocked0> %25 = math.exp %24 : tensor<1024xf32, #blocked0> %26 = arith.sitofp %cst_12 : tensor<1024xi32, #blocked0> to tensor<1024xf32, #blocked0> %27 = arith.addf %25, %26 : tensor<1024xf32, #blocked0> %28 = arith.divf %26, %27 : tensor<1024xf32, #blocked0> %29 = tt.addptr %arg6, %c2499_i32 : !tt.ptr, i32 %30 = tt.load %29 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32 %31 = arith.subf %11, %cst_10 : tensor<1024xf32, #blocked0> %32 = arith.subf %cst_13, %31 : tensor<1024xf32, #blocked0> %33 = math.exp %32 : tensor<1024xf32, #blocked0> %34 = arith.addf %33, %26 : tensor<1024xf32, #blocked0> %35 = arith.divf %26, %34 : tensor<1024xf32, #blocked0> %36 = tt.splat %30 : (f32) -> tensor<1024xf32, #blocked0> %37 = "triton_gpu.cmpf"(%36, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %38 = "triton_gpu.select"(%37, %cst_14, %cst_9) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %39 = "triton_gpu.select"(%37, %cst_8, %cst_7) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %40 = arith.subi %39, %38 : tensor<1024xi32, #blocked0> %41 = "triton_gpu.cmpi"(%40, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %42 = "triton_gpu.cmpi"(%41, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %43 = arith.remsi %40, %cst_6 : tensor<1024xi32, #blocked0> %44 = "triton_gpu.cmpi"(%43, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %45 = arith.divsi %40, %cst_6 : tensor<1024xi32, #blocked0> %46 = arith.subi %45, %cst_12 : tensor<1024xi32, #blocked0> %47 = "triton_gpu.select"(%44, %46, %45) : (tensor<1024xi1, #blocked0>, 
tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %48 = "triton_gpu.select"(%42, %47, %45) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %49 = arith.addi %38, %48 : tensor<1024xi32, #blocked0> %50 = "triton_gpu.cmpi"(%38, %39) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %51 = "triton_gpu.select"(%50, %49, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %52 = tt.splat %arg6 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %53 = tt.addptr %52, %51 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %54 = triton_gpu.convert_layout %53 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %55 = tt.load %54 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %56 = "triton_gpu.cmpf"(%55, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %57 = "triton_gpu.cmpi"(%56, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %58 = arith.andi %57, %50 : tensor<1024xi1, #blocked0> %59 = arith.addi %51, %cst_12 : tensor<1024xi32, #blocked0> %60 = "triton_gpu.select"(%58, %59, %38) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %61 = arith.andi %56, %50 : tensor<1024xi1, #blocked0> %62 = "triton_gpu.select"(%61, %51, %39) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %63 = "triton_gpu.cmpi"(%60, %62) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %64 = arith.subi %62, %60 : tensor<1024xi32, #blocked0> %65 = "triton_gpu.cmpi"(%64, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %66 = "triton_gpu.cmpi"(%65, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %67 = arith.remsi %64, %cst_6 : tensor<1024xi32, #blocked0> %68 = "triton_gpu.cmpi"(%67, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %69 = arith.divsi %64, %cst_6 : tensor<1024xi32, #blocked0> %70 = arith.subi %69, %cst_12 : tensor<1024xi32, #blocked0> %71 = "triton_gpu.select"(%68, %70, %69) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %72 = "triton_gpu.select"(%66, %71, %69) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %73 = arith.addi %60, %72 : tensor<1024xi32, #blocked0> %74 = "triton_gpu.select"(%63, %73, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %75 = tt.addptr %52, %74 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %76 = triton_gpu.convert_layout %75 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %77 = tt.load %76 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %78 = "triton_gpu.cmpf"(%77, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, 
#blocked0>) -> tensor<1024xi1, #blocked0> %79 = "triton_gpu.cmpi"(%78, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %80 = arith.andi %79, %63 : tensor<1024xi1, #blocked0> %81 = arith.addi %74, %cst_12 : tensor<1024xi32, #blocked0> %82 = "triton_gpu.select"(%80, %81, %60) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %83 = arith.andi %78, %63 : tensor<1024xi1, #blocked0> %84 = "triton_gpu.select"(%83, %74, %62) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %85 = "triton_gpu.cmpi"(%82, %84) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %86 = arith.subi %84, %82 : tensor<1024xi32, #blocked0> %87 = "triton_gpu.cmpi"(%86, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %88 = "triton_gpu.cmpi"(%87, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %89 = arith.remsi %86, %cst_6 : tensor<1024xi32, #blocked0> %90 = "triton_gpu.cmpi"(%89, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %91 = arith.divsi %86, %cst_6 : tensor<1024xi32, #blocked0> %92 = arith.subi %91, %cst_12 : tensor<1024xi32, #blocked0> %93 = "triton_gpu.select"(%90, %92, %91) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %94 = "triton_gpu.select"(%88, %93, %91) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %95 = arith.addi %82, %94 : tensor<1024xi32, #blocked0> %96 = "triton_gpu.select"(%85, %95, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %97 = tt.addptr %52, %96 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %98 = triton_gpu.convert_layout %97 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %99 = tt.load %98 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %100 = "triton_gpu.cmpf"(%99, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %101 = "triton_gpu.cmpi"(%100, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %102 = arith.andi %101, %85 : tensor<1024xi1, #blocked0> %103 = arith.addi %96, %cst_12 : tensor<1024xi32, #blocked0> %104 = "triton_gpu.select"(%102, %103, %82) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %105 = arith.andi %100, %85 : tensor<1024xi1, #blocked0> %106 = "triton_gpu.select"(%105, %96, %84) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %107 = "triton_gpu.cmpi"(%104, %106) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %108 = arith.subi %106, %104 : tensor<1024xi32, #blocked0> %109 = "triton_gpu.cmpi"(%108, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %110 = "triton_gpu.cmpi"(%109, 
%cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %111 = arith.remsi %108, %cst_6 : tensor<1024xi32, #blocked0> %112 = "triton_gpu.cmpi"(%111, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %113 = arith.divsi %108, %cst_6 : tensor<1024xi32, #blocked0> %114 = arith.subi %113, %cst_12 : tensor<1024xi32, #blocked0> %115 = "triton_gpu.select"(%112, %114, %113) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %116 = "triton_gpu.select"(%110, %115, %113) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %117 = arith.addi %104, %116 : tensor<1024xi32, #blocked0> %118 = "triton_gpu.select"(%107, %117, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %119 = tt.addptr %52, %118 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %120 = triton_gpu.convert_layout %119 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %121 = tt.load %120 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %122 = "triton_gpu.cmpf"(%121, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %123 = "triton_gpu.cmpi"(%122, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %124 = arith.andi %123, %107 : tensor<1024xi1, #blocked0> %125 = arith.addi %118, %cst_12 : tensor<1024xi32, #blocked0> %126 = "triton_gpu.select"(%124, %125, %104) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %127 = arith.andi %122, %107 : tensor<1024xi1, #blocked0> %128 = "triton_gpu.select"(%127, %118, %106) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %129 = "triton_gpu.cmpi"(%126, %128) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %130 = arith.subi %128, %126 : tensor<1024xi32, #blocked0> %131 = "triton_gpu.cmpi"(%130, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %132 = "triton_gpu.cmpi"(%131, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %133 = arith.remsi %130, %cst_6 : tensor<1024xi32, #blocked0> %134 = "triton_gpu.cmpi"(%133, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %135 = arith.divsi %130, %cst_6 : tensor<1024xi32, #blocked0> %136 = arith.subi %135, %cst_12 : tensor<1024xi32, #blocked0> %137 = "triton_gpu.select"(%134, %136, %135) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %138 = "triton_gpu.select"(%132, %137, %135) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %139 = arith.addi %126, %138 : tensor<1024xi32, #blocked0> %140 = "triton_gpu.select"(%129, %139, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %141 = tt.addptr 
%52, %140 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %142 = triton_gpu.convert_layout %141 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %143 = tt.load %142 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %144 = "triton_gpu.cmpf"(%143, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %145 = "triton_gpu.cmpi"(%144, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %146 = arith.andi %145, %129 : tensor<1024xi1, #blocked0> %147 = arith.addi %140, %cst_12 : tensor<1024xi32, #blocked0> %148 = "triton_gpu.select"(%146, %147, %126) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %149 = arith.andi %144, %129 : tensor<1024xi1, #blocked0> %150 = "triton_gpu.select"(%149, %140, %128) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %151 = "triton_gpu.cmpi"(%148, %150) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %152 = arith.subi %150, %148 : tensor<1024xi32, #blocked0> %153 = "triton_gpu.cmpi"(%152, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %154 = "triton_gpu.cmpi"(%153, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %155 = arith.remsi %152, %cst_6 : tensor<1024xi32, #blocked0> %156 = "triton_gpu.cmpi"(%155, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %157 = arith.divsi %152, %cst_6 : tensor<1024xi32, #blocked0> %158 = arith.subi %157, %cst_12 : tensor<1024xi32, #blocked0> %159 = "triton_gpu.select"(%156, %158, %157) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %160 = "triton_gpu.select"(%154, %159, %157) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %161 = arith.addi %148, %160 : tensor<1024xi32, #blocked0> %162 = "triton_gpu.select"(%151, %161, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %163 = tt.addptr %52, %162 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %164 = triton_gpu.convert_layout %163 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %165 = tt.load %164 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %166 = "triton_gpu.cmpf"(%165, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %167 = "triton_gpu.cmpi"(%166, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %168 = arith.andi %167, %151 : tensor<1024xi1, #blocked0> %169 = arith.addi %162, %cst_12 : tensor<1024xi32, #blocked0> %170 = "triton_gpu.select"(%168, %169, %148) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %171 = arith.andi %166, %151 : tensor<1024xi1, #blocked0> %172 = "triton_gpu.select"(%171, %162, %150) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, 
tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %173 = "triton_gpu.cmpi"(%170, %172) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %174 = arith.subi %172, %170 : tensor<1024xi32, #blocked0> %175 = "triton_gpu.cmpi"(%174, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %176 = "triton_gpu.cmpi"(%175, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %177 = arith.remsi %174, %cst_6 : tensor<1024xi32, #blocked0> %178 = "triton_gpu.cmpi"(%177, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %179 = arith.divsi %174, %cst_6 : tensor<1024xi32, #blocked0> %180 = arith.subi %179, %cst_12 : tensor<1024xi32, #blocked0> %181 = "triton_gpu.select"(%178, %180, %179) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %182 = "triton_gpu.select"(%176, %181, %179) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %183 = arith.addi %170, %182 : tensor<1024xi32, #blocked0> %184 = "triton_gpu.select"(%173, %183, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %185 = tt.addptr %52, %184 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %186 = triton_gpu.convert_layout %185 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %187 = tt.load %186 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %188 = "triton_gpu.cmpf"(%187, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %189 = "triton_gpu.cmpi"(%188, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %190 = arith.andi %189, %173 : tensor<1024xi1, #blocked0> %191 = arith.addi %184, %cst_12 : tensor<1024xi32, #blocked0> %192 = "triton_gpu.select"(%190, %191, %170) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %193 = arith.andi %188, %173 : tensor<1024xi1, #blocked0> %194 = "triton_gpu.select"(%193, %184, %172) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %195 = "triton_gpu.cmpi"(%192, %194) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %196 = arith.subi %194, %192 : tensor<1024xi32, #blocked0> %197 = "triton_gpu.cmpi"(%196, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %198 = "triton_gpu.cmpi"(%197, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %199 = arith.remsi %196, %cst_6 : tensor<1024xi32, #blocked0> %200 = "triton_gpu.cmpi"(%199, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %201 = arith.divsi %196, %cst_6 : tensor<1024xi32, #blocked0> %202 = arith.subi %201, %cst_12 : tensor<1024xi32, #blocked0> %203 = "triton_gpu.select"(%200, %202, %201) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, 
#blocked0>) -> tensor<1024xi32, #blocked0> %204 = "triton_gpu.select"(%198, %203, %201) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %205 = arith.addi %192, %204 : tensor<1024xi32, #blocked0> %206 = "triton_gpu.select"(%195, %205, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %207 = tt.addptr %52, %206 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %208 = triton_gpu.convert_layout %207 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %209 = tt.load %208 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %210 = "triton_gpu.cmpf"(%209, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %211 = "triton_gpu.cmpi"(%210, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %212 = arith.andi %211, %195 : tensor<1024xi1, #blocked0> %213 = arith.addi %206, %cst_12 : tensor<1024xi32, #blocked0> %214 = "triton_gpu.select"(%212, %213, %192) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %215 = arith.andi %210, %195 : tensor<1024xi1, #blocked0> %216 = "triton_gpu.select"(%215, %206, %194) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %217 = "triton_gpu.cmpi"(%214, %216) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %218 = arith.subi %216, %214 : tensor<1024xi32, #blocked0> %219 = "triton_gpu.cmpi"(%218, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %220 = "triton_gpu.cmpi"(%219, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %221 = arith.remsi %218, %cst_6 : tensor<1024xi32, #blocked0> %222 = "triton_gpu.cmpi"(%221, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %223 = arith.divsi %218, %cst_6 : tensor<1024xi32, #blocked0> %224 = arith.subi %223, %cst_12 : tensor<1024xi32, #blocked0> %225 = "triton_gpu.select"(%222, %224, %223) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %226 = "triton_gpu.select"(%220, %225, %223) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %227 = arith.addi %214, %226 : tensor<1024xi32, #blocked0> %228 = "triton_gpu.select"(%217, %227, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %229 = tt.addptr %52, %228 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %230 = triton_gpu.convert_layout %229 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %231 = tt.load %230 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %232 = "triton_gpu.cmpf"(%231, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %233 = "triton_gpu.cmpi"(%232, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, 
#blocked0> %234 = arith.andi %233, %217 : tensor<1024xi1, #blocked0> %235 = arith.addi %228, %cst_12 : tensor<1024xi32, #blocked0> %236 = "triton_gpu.select"(%234, %235, %214) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %237 = arith.andi %232, %217 : tensor<1024xi1, #blocked0> %238 = "triton_gpu.select"(%237, %228, %216) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %239 = "triton_gpu.cmpi"(%236, %238) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %240 = arith.subi %238, %236 : tensor<1024xi32, #blocked0> %241 = "triton_gpu.cmpi"(%240, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %242 = "triton_gpu.cmpi"(%241, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %243 = arith.remsi %240, %cst_6 : tensor<1024xi32, #blocked0> %244 = "triton_gpu.cmpi"(%243, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %245 = arith.divsi %240, %cst_6 : tensor<1024xi32, #blocked0> %246 = arith.subi %245, %cst_12 : tensor<1024xi32, #blocked0> %247 = "triton_gpu.select"(%244, %246, %245) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %248 = "triton_gpu.select"(%242, %247, %245) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %249 = arith.addi %236, %248 : tensor<1024xi32, #blocked0> %250 = "triton_gpu.select"(%239, %249, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %251 = tt.addptr %52, %250 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %252 = triton_gpu.convert_layout %251 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %253 = tt.load %252 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %254 = "triton_gpu.cmpf"(%253, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %255 = "triton_gpu.cmpi"(%254, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %256 = arith.andi %255, %239 : tensor<1024xi1, #blocked0> %257 = arith.addi %250, %cst_12 : tensor<1024xi32, #blocked0> %258 = "triton_gpu.select"(%256, %257, %236) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %259 = arith.andi %254, %239 : tensor<1024xi1, #blocked0> %260 = "triton_gpu.select"(%259, %250, %238) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %261 = "triton_gpu.cmpi"(%258, %260) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %262 = arith.subi %260, %258 : tensor<1024xi32, #blocked0> %263 = "triton_gpu.cmpi"(%262, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %264 = "triton_gpu.cmpi"(%263, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %265 
= arith.remsi %262, %cst_6 : tensor<1024xi32, #blocked0> %266 = "triton_gpu.cmpi"(%265, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %267 = arith.divsi %262, %cst_6 : tensor<1024xi32, #blocked0> %268 = arith.subi %267, %cst_12 : tensor<1024xi32, #blocked0> %269 = "triton_gpu.select"(%266, %268, %267) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %270 = "triton_gpu.select"(%264, %269, %267) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %271 = arith.addi %258, %270 : tensor<1024xi32, #blocked0> %272 = "triton_gpu.select"(%261, %271, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %273 = tt.addptr %52, %272 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %274 = triton_gpu.convert_layout %273 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %275 = tt.load %274 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %276 = "triton_gpu.cmpf"(%275, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %277 = "triton_gpu.cmpi"(%276, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %278 = arith.andi %277, %261 : tensor<1024xi1, #blocked0> %279 = arith.addi %272, %cst_12 : tensor<1024xi32, #blocked0> %280 = "triton_gpu.select"(%278, %279, %258) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %281 = arith.andi %276, %261 : tensor<1024xi1, #blocked0> %282 = "triton_gpu.select"(%281, %272, %260) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %283 = "triton_gpu.cmpi"(%280, %282) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %284 = arith.subi %282, %280 : tensor<1024xi32, #blocked0> %285 = "triton_gpu.cmpi"(%284, %cst_14) {predicate = 2 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %286 = "triton_gpu.cmpi"(%285, %cst_5) {predicate = 1 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %287 = arith.remsi %284, %cst_6 : tensor<1024xi32, #blocked0> %288 = "triton_gpu.cmpi"(%287, %cst_14) {predicate = 1 : i64} : (tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi1, #blocked0> %289 = arith.divsi %284, %cst_6 : tensor<1024xi32, #blocked0> %290 = arith.subi %289, %cst_12 : tensor<1024xi32, #blocked0> %291 = "triton_gpu.select"(%288, %290, %289) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %292 = "triton_gpu.select"(%286, %291, %289) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %293 = arith.addi %280, %292 : tensor<1024xi32, #blocked0> %294 = "triton_gpu.select"(%283, %293, %cst_14) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %295 = tt.addptr %52, %294 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %296 = triton_gpu.convert_layout %295 : 
(tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %297 = tt.load %296 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked0> %298 = "triton_gpu.cmpf"(%297, %35) {predicate = 3 : i64} : (tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xi1, #blocked0> %299 = "triton_gpu.cmpi"(%298, %cst_5) {predicate = 0 : i64} : (tensor<1024xi1, #blocked0>, tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked0> %300 = arith.andi %299, %283 : tensor<1024xi1, #blocked0> %301 = arith.addi %294, %cst_12 : tensor<1024xi32, #blocked0> %302 = "triton_gpu.select"(%300, %301, %280) : (tensor<1024xi1, #blocked0>, tensor<1024xi32, #blocked0>, tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked0> %303 = arith.extsi %cst_12 : tensor<1024xi32, #blocked0> to tensor<1024xi64, #blocked0> %304 = "triton_gpu.cmpi"(%17, %303) {predicate = 0 : i64} : (tensor<1024xi64, #blocked0>, tensor<1024xi64, #blocked0>) -> tensor<1024xi1, #blocked0> %305 = arith.fptosi %23 : tensor<1024xf32, #blocked0> to tensor<1024xi64, #blocked0> %306 = arith.extsi %cst_14 : tensor<1024xi32, #blocked0> to tensor<1024xi64, #blocked0> %307 = "triton_gpu.cmpi"(%306, %305) {predicate = 4 : i64} : (tensor<1024xi64, #blocked0>, tensor<1024xi64, #blocked0>) -> tensor<1024xi1, #blocked0> %308 = arith.extsi %cst_4 : tensor<1024xi32, #blocked0> to tensor<1024xi64, #blocked0> %309 = "triton_gpu.cmpi"(%305, %308) {predicate = 4 : i64} : (tensor<1024xi64, #blocked0>, tensor<1024xi64, #blocked0>) -> tensor<1024xi1, #blocked0> %310 = "triton_gpu.select"(%309, %306, %305) : (tensor<1024xi1, #blocked0>, tensor<1024xi64, #blocked0>, tensor<1024xi64, #blocked0>) -> tensor<1024xi64, #blocked0> %311 = "triton_gpu.select"(%307, %306, %310) : (tensor<1024xi1, #blocked0>, tensor<1024xi64, #blocked0>, tensor<1024xi64, #blocked0>) -> tensor<1024xi64, #blocked0> %312 = "triton_gpu.select"(%304, %311, %306) : (tensor<1024xi1, #blocked0>, tensor<1024xi64, #blocked0>, tensor<1024xi64, #blocked0>) -> tensor<1024xi64, #blocked0> %313 = arith.extsi %cst_3 : tensor<1024xi32, #blocked0> to tensor<1024xi64, #blocked0> %314 = arith.muli %312, %313 : tensor<1024xi64, #blocked0> %315 = arith.extsi %302 : tensor<1024xi32, #blocked0> to tensor<1024xi64, #blocked0> %316 = arith.addi %315, %314 : tensor<1024xi64, #blocked0> %317 = arith.trunci %316 : tensor<1024xi64, #blocked0> to tensor<1024xi32, #blocked0> %318 = arith.extsi %317 : tensor<1024xi32, #blocked0> to tensor<1024xi64, #blocked0> %319 = tt.splat %arg9 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %320 = tt.addptr %319, %318 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi64, #blocked0> %321 = triton_gpu.convert_layout %320 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %322 = tt.load %321 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf64, #blocked0> %323 = arith.extf %cst_2 : tensor<1024xf32, #blocked0> to tensor<1024xf64, #blocked0> %324 = "triton_gpu.cmpf"(%322, %323) {predicate = 2 : i64} : (tensor<1024xf64, #blocked0>, tensor<1024xf64, #blocked0>) -> tensor<1024xi1, #blocked0> %325 = tt.splat %arg10 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %326 = tt.addptr %325, %318 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi64, #blocked0> %327 = triton_gpu.convert_layout %326 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %328 = tt.load %327 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf64, #blocked0> %329 = arith.divf %328, %322 : tensor<1024xf64, #blocked0> 
%330 = arith.truncf %329 : tensor<1024xf64, #blocked0> to tensor<1024xf32, #blocked0> %331 = arith.mulf %330, %cst_1 : tensor<1024xf32, #blocked0> %332 = arith.mulf %35, %cst_0 : tensor<1024xf32, #blocked0> %333 = arith.addf %331, %332 : tensor<1024xf32, #blocked0> %334 = "triton_gpu.select"(%324, %333, %35) : (tensor<1024xi1, #blocked0>, tensor<1024xf32, #blocked0>, tensor<1024xf32, #blocked0>) -> tensor<1024xf32, #blocked0> %335 = tt.addptr %319, %317 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %336 = triton_gpu.convert_layout %335 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %337 = tt.load %336 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf64, #blocked0> %338 = arith.extf %cst : tensor<1024xf32, #blocked0> to tensor<1024xf64, #blocked0> %339 = arith.mulf %337, %338 : tensor<1024xf64, #blocked0> %340 = tt.addptr %325, %317 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %341 = triton_gpu.convert_layout %340 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %342 = tt.load %341 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf64, #blocked0> %343 = arith.mulf %342, %338 : tensor<1024xf64, #blocked0> %344 = tt.splat %arg11 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %345 = tt.addptr %344, %4 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %346 = triton_gpu.convert_layout %345 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked1> %347 = triton_gpu.convert_layout %28 : (tensor<1024xf32, #blocked0>) -> tensor<1024xf32, #blocked1> %348 = triton_gpu.convert_layout %5 : (tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked1> tt.store %346, %347, %348 : tensor<1024xf32, #blocked1> %349 = tt.splat %arg12 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %350 = tt.addptr %349, %4 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %351 = triton_gpu.convert_layout %350 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked1> %352 = triton_gpu.convert_layout %317 : (tensor<1024xi32, #blocked0>) -> tensor<1024xi32, #blocked1> %353 = triton_gpu.convert_layout %5 : (tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked1> tt.store %351, %352, %353 : tensor<1024xi32, #blocked1> %354 = tt.splat %arg13 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %355 = tt.addptr %354, %4 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi32, #blocked0> %356 = triton_gpu.convert_layout %355 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked1> %357 = triton_gpu.convert_layout %334 : (tensor<1024xf32, #blocked0>) -> tensor<1024xf32, #blocked1> %358 = triton_gpu.convert_layout %5 : (tensor<1024xi1, #blocked0>) -> tensor<1024xi1, #blocked1> tt.store %356, %357, %358 : tensor<1024xf32, #blocked1> %359 = tt.splat %arg14 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %360 = tt.addptr %359, %318 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi64, #blocked0> %361 = triton_gpu.convert_layout %360 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %362 = triton_gpu.convert_layout %339 : (tensor<1024xf64, #blocked0>) -> tensor<1024xf64, #blocked0> tt.store %361, %362 : tensor<1024xf64, #blocked0> %363 = tt.splat %arg15 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked0> %364 = tt.addptr %363, %318 : tensor<1024x!tt.ptr, #blocked0>, tensor<1024xi64, #blocked0> %365 = triton_gpu.convert_layout %364 : (tensor<1024x!tt.ptr, #blocked0>) -> tensor<1024x!tt.ptr, #blocked0> %366 = triton_gpu.convert_layout %343 : 
(tensor<1024xf64, #blocked0>) -> tensor<1024xf64, #blocked0> tt.store %365, %366 : tensor<1024xf64, #blocked0> return } // A mnist model from torch inductor. // Check if topological sort is working correct and there's no unnecessary convert // CHECK-LABEL: mnist func public @mnist(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: i32) { // CHECK-NOT: triton_gpu.convert_layout %cst = arith.constant dense<10> : tensor<16x1xi32, #blocked2> %cst_0 = arith.constant dense<10> : tensor<1x16xi32, #blocked3> %c16_i32 = arith.constant 16 : i32 %cst_1 = arith.constant dense<64> : tensor<16x1xi32, #blocked2> %cst_2 = arith.constant dense<0xFF800000> : tensor<16x16xf32, #blocked2> %cst_3 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #blocked2> %cst_4 = arith.constant dense<0> : tensor<16x16xi32, #blocked2> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c16_i32 : i32 %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #blocked0> %3 = triton_gpu.convert_layout %2 : (tensor<16xi32, #blocked0>) -> tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1> %5 = triton_gpu.convert_layout %4 : (tensor<16x1xi32, #blocked1>) -> tensor<16x1xi32, #blocked2> %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked2> %7 = arith.addi %6, %5 : tensor<16x1xi32, #blocked2> %8 = "triton_gpu.cmpi"(%7, %cst_1) {predicate = 2 : i64} : (tensor<16x1xi32, #blocked2>, tensor<16x1xi32, #blocked2>) -> tensor<16x1xi1, #blocked2> %9 = triton_gpu.convert_layout %2 : (tensor<16xi32, #blocked0>) -> tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>> %10 = tt.expand_dims %9 {axis = 0 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>>) -> tensor<1x16xi32, #blocked3> %11 = "triton_gpu.cmpi"(%10, %cst_0) {predicate = 2 : i64} : (tensor<1x16xi32, #blocked3>, tensor<1x16xi32, #blocked3>) -> tensor<1x16xi1, #blocked3> %12 = arith.muli %7, %cst : tensor<16x1xi32, #blocked2> %13 = tt.broadcast %10 : (tensor<1x16xi32, #blocked3>) -> tensor<16x16xi32, #blocked3> %14 = triton_gpu.convert_layout %13 : (tensor<16x16xi32, #blocked3>) -> tensor<16x16xi32, #blocked2> %15 = tt.broadcast %12 : (tensor<16x1xi32, #blocked2>) -> tensor<16x16xi32, #blocked2> %16 = arith.addi %14, %15 : tensor<16x16xi32, #blocked2> %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x16x!tt.ptr, #blocked2> %18 = tt.addptr %17, %16 : tensor<16x16x!tt.ptr, #blocked2>, tensor<16x16xi32, #blocked2> %19 = tt.broadcast %11 : (tensor<1x16xi1, #blocked3>) -> tensor<16x16xi1, #blocked3> %20 = triton_gpu.convert_layout %19 : (tensor<16x16xi1, #blocked3>) -> tensor<16x16xi1, #blocked2> %21 = tt.broadcast %8 : (tensor<16x1xi1, #blocked2>) -> tensor<16x16xi1, #blocked2> %22 = arith.andi %20, %21 : tensor<16x16xi1, #blocked2> %23 = triton_gpu.convert_layout %18 : (tensor<16x16x!tt.ptr, #blocked2>) -> tensor<16x16x!tt.ptr, #blocked4> %24 = triton_gpu.convert_layout %22 : (tensor<16x16xi1, #blocked2>) -> tensor<16x16xi1, #blocked4> %25 = tt.load %23, %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x16xf32, #blocked4> %26 = triton_gpu.convert_layout %25 : (tensor<16x16xf32, #blocked4>) -> tensor<16x16xf32, #blocked2> %27 = "triton_gpu.cmpf"(%cst_2, %26) {predicate = 4 : i64} : (tensor<16x16xf32, #blocked2>, tensor<16x16xf32, #blocked2>) -> tensor<16x16xi1, 
#blocked2> %28 = arith.andi %22, %27 : tensor<16x16xi1, #blocked2> %29 = "triton_gpu.select"(%28, %26, %cst_2) : (tensor<16x16xi1, #blocked2>, tensor<16x16xf32, #blocked2>, tensor<16x16xf32, #blocked2>) -> tensor<16x16xf32, #blocked2> %30 = tt.reduce %29 {axis = 1 : i32, redOp = 12 : i32} : tensor<16x16xf32, #blocked2> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> %31 = triton_gpu.convert_layout %30 : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<16xf32, #blocked0> %32 = triton_gpu.convert_layout %31 : (tensor<16xf32, #blocked0>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> %33 = tt.expand_dims %32 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xf32, #blocked1> %34 = triton_gpu.convert_layout %33 : (tensor<16x1xf32, #blocked1>) -> tensor<16x1xf32, #blocked2> %35 = arith.sitofp %cst_4 : tensor<16x16xi32, #blocked2> to tensor<16x16xf32, #blocked2> %36 = arith.addf %35, %cst_3 : tensor<16x16xf32, #blocked2> %37 = triton_gpu.convert_layout %18 : (tensor<16x16x!tt.ptr, #blocked2>) -> tensor<16x16x!tt.ptr, #blocked4> %38 = triton_gpu.convert_layout %22 : (tensor<16x16xi1, #blocked2>) -> tensor<16x16xi1, #blocked4> %39 = tt.load %37, %38 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x16xf32, #blocked4> %40 = triton_gpu.convert_layout %39 : (tensor<16x16xf32, #blocked4>) -> tensor<16x16xf32, #blocked2> %41 = tt.broadcast %34 : (tensor<16x1xf32, #blocked2>) -> tensor<16x16xf32, #blocked2> %42 = arith.subf %40, %41 : tensor<16x16xf32, #blocked2> %43 = math.exp %42 : tensor<16x16xf32, #blocked2> %44 = arith.addf %36, %43 : tensor<16x16xf32, #blocked2> %45 = "triton_gpu.select"(%22, %44, %36) : (tensor<16x16xi1, #blocked2>, tensor<16x16xf32, #blocked2>, tensor<16x16xf32, #blocked2>) -> tensor<16x16xf32, #blocked2> %46 = tt.reduce %45 {axis = 1 : i32, redOp = 2 : i32} : tensor<16x16xf32, #blocked2> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> %47 = triton_gpu.convert_layout %46 : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<16xf32, #blocked0> %48 = triton_gpu.convert_layout %47 : (tensor<16xf32, #blocked0>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> %49 = tt.expand_dims %48 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xf32, #blocked1> %50 = triton_gpu.convert_layout %49 : (tensor<16x1xf32, #blocked1>) -> tensor<16x1xf32, #blocked2> %51 = triton_gpu.convert_layout %18 : (tensor<16x16x!tt.ptr, #blocked2>) -> tensor<16x16x!tt.ptr, #blocked4> %52 = triton_gpu.convert_layout %22 : (tensor<16x16xi1, #blocked2>) -> tensor<16x16xi1, #blocked4> %53 = tt.load %51, %52 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x16xf32, #blocked4> %54 = triton_gpu.convert_layout %53 : (tensor<16x16xf32, #blocked4>) -> tensor<16x16xf32, #blocked2> %55 = arith.subf %54, %41 : tensor<16x16xf32, #blocked2> %56 = math.log %50 : tensor<16x1xf32, #blocked2> %57 = tt.broadcast %56 : (tensor<16x1xf32, #blocked2>) -> tensor<16x16xf32, #blocked2> %58 = arith.subf %55, %57 : tensor<16x16xf32, #blocked2> %59 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x16x!tt.ptr, #blocked2> %60 = tt.addptr %59, %16 : tensor<16x16x!tt.ptr, #blocked2>, tensor<16x16xi32, #blocked2> %61 = triton_gpu.convert_layout %60 : (tensor<16x16x!tt.ptr, #blocked2>) -> tensor<16x16x!tt.ptr, #blocked4> %62 = triton_gpu.convert_layout %58 : (tensor<16x16xf32, #blocked2>) -> 
tensor<16x16xf32, #blocked4> %63 = triton_gpu.convert_layout %22 : (tensor<16x16xi1, #blocked2>) -> tensor<16x16xi1, #blocked4> tt.store %61, %62, %63 : tensor<16x16xf32, #blocked4> return } // ----- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [0, 1]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> #blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1]}> #blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> // cmpf and cmpi have different operands and result types // CHECK-LABEL: cmp func public @cmp(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) { %c64 = arith.constant 64 : index %c2048 = arith.constant 2048 : index %c0 = arith.constant 0 : index %c64_i32 = arith.constant 64 : i32 %cst = arith.constant dense<-3.40282347E+38> : tensor<64x64xf32, #blocked2> %cst_0 = arith.constant dense<4194304> : tensor<64x1xi32, #blocked2> %cst_1 = arith.constant dense<12> : tensor<64x1xi32, #blocked2> %cst_2 = arith.constant dense<2048> : tensor<1x64xi32, #blocked3> %cst_3 = arith.constant dense<0> : tensor<64x64xi32, #blocked2> %cst_4 = arith.constant dense<2048> : tensor<64x1xi32, #blocked2> %cst_5 = arith.constant dense<49152> : tensor<64x1xi32, #blocked2> %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2> %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.muli %0, %c64_i32 : i32 %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #blocked0> %3 = triton_gpu.convert_layout %2 : (tensor<64xi32, #blocked0>) -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> %5 = triton_gpu.convert_layout %4 : (tensor<64x1xi32, #blocked1>) -> tensor<64x1xi32, #blocked2> %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked2> %7 = arith.addi %6, %5 : tensor<64x1xi32, #blocked2> %8 = "triton_gpu.cmpi"(%7, %cst_5) {predicate = 2 : i64} : (tensor<64x1xi32, #blocked2>, tensor<64x1xi32, #blocked2>) -> tensor<64x1xi1, #blocked2> %9 = triton_gpu.convert_layout %2 : (tensor<64xi32, #blocked0>) -> tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>> %10 = tt.expand_dims %9 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>>) -> tensor<1x64xi32, #blocked3> %11 = arith.remsi %7, %cst_4 : tensor<64x1xi32, #blocked2> %12 = arith.divsi %7, %cst_4 : tensor<64x1xi32, #blocked2> %13 = arith.sitofp %cst_3 : tensor<64x64xi32, #blocked2> to tensor<64x64xf32, #blocked2> %14 = arith.addf %13, %cst_6 : tensor<64x64xf32, #blocked2> %15 = arith.muli %7, %cst_4 : tensor<64x1xi32, #blocked2> %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x64x!tt.ptr, #blocked2> %18 = tt.broadcast %8 : (tensor<64x1xi1, 
#blocked2>) -> tensor<64x64xi1, #blocked2> %19 = arith.muli %11, %cst_4 : tensor<64x1xi32, #blocked2> %20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %21 = arith.divsi %12, %cst_1 : tensor<64x1xi32, #blocked2> %22 = arith.muli %21, %cst_0 : tensor<64x1xi32, #blocked2> %23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %24 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr, #blocked2> %25 = scf.for %arg6 = %c0 to %c2048 step %c64 iter_args(%arg7 = %14) -> (tensor<64x64xf32, #blocked2>) { %44 = arith.index_cast %arg6 : index to i32 %45 = tt.splat %44 : (i32) -> tensor<1x64xi32, #blocked3> %46 = arith.addi %45, %10 : tensor<1x64xi32, #blocked3> %47 = "triton_gpu.cmpi"(%46, %cst_2) {predicate = 2 : i64} : (tensor<1x64xi32, #blocked3>, tensor<1x64xi32, #blocked3>) -> tensor<1x64xi1, #blocked3> %48 = tt.broadcast %46 : (tensor<1x64xi32, #blocked3>) -> tensor<64x64xi32, #blocked3> %49 = triton_gpu.convert_layout %48 : (tensor<64x64xi32, #blocked3>) -> tensor<64x64xi32, #blocked2> %50 = arith.addi %49, %16 : tensor<64x64xi32, #blocked2> %51 = tt.addptr %17, %50 : tensor<64x64x!tt.ptr, #blocked2>, tensor<64x64xi32, #blocked2> %52 = tt.broadcast %47 : (tensor<1x64xi1, #blocked3>) -> tensor<64x64xi1, #blocked3> %53 = triton_gpu.convert_layout %52 : (tensor<64x64xi1, #blocked3>) -> tensor<64x64xi1, #blocked2> %54 = arith.andi %53, %18 : tensor<64x64xi1, #blocked2> %55 = triton_gpu.convert_layout %51 : (tensor<64x64x!tt.ptr, #blocked2>) -> tensor<64x64x!tt.ptr, #blocked4> %56 = triton_gpu.convert_layout %54 : (tensor<64x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked4> %57 = tt.load %55, %56 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf16, #blocked4> %58 = triton_gpu.convert_layout %57 : (tensor<64x64xf16, #blocked4>) -> tensor<64x64xf16, #blocked2> %59 = arith.extf %58 : tensor<64x64xf16, #blocked2> to tensor<64x64xf32, #blocked2> %60 = arith.addi %49, %20 : tensor<64x64xi32, #blocked2> %61 = arith.addi %60, %23 : tensor<64x64xi32, #blocked2> %62 = tt.addptr %24, %61 : tensor<64x64x!tt.ptr, #blocked2>, tensor<64x64xi32, #blocked2> %63 = triton_gpu.convert_layout %62 : (tensor<64x64x!tt.ptr, #blocked2>) -> tensor<64x64x!tt.ptr, #blocked5> %64 = triton_gpu.convert_layout %54 : (tensor<64x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked5> %65 = tt.load %63, %64 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked5> %66 = triton_gpu.convert_layout %65 : (tensor<64x64xf32, #blocked5>) -> tensor<64x64xf32, #blocked2> %67 = arith.addf %59, %66 : tensor<64x64xf32, #blocked2> %68 = "triton_gpu.cmpf"(%67, %67) {predicate = 13 : i64} : (tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xi1, #blocked2> %69 = "triton_gpu.cmpf"(%67, %cst) {predicate = 2 : i64} : (tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xi1, #blocked2> %70 = "triton_gpu.select"(%69, %67, %cst) : (tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked2> %71 = "triton_gpu.select"(%68, %67, %70) : (tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked2> %72 = math.exp %71 : tensor<64x64xf32, #blocked2> %73 = arith.addf %arg7, %72 : tensor<64x64xf32, #blocked2> %74 = "triton_gpu.select"(%54, %73, %arg7) : (tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked2> scf.yield 
%74 : tensor<64x64xf32, #blocked2> } %26 = tt.reduce %25 {axis = 1 : i32, redOp = 2 : i32} : tensor<64x64xf32, #blocked2> -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> %27 = triton_gpu.convert_layout %26 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64xf32, #blocked0> %28 = triton_gpu.convert_layout %27 : (tensor<64xf32, #blocked0>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> %29 = tt.expand_dims %28 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1> %30 = triton_gpu.convert_layout %29 : (tensor<64x1xf32, #blocked1>) -> tensor<64x1xf32, #blocked2> %31 = arith.muli %7, %cst_4 : tensor<64x1xi32, #blocked2> %32 = tt.broadcast %31 : (tensor<64x1xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %33 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x64x!tt.ptr, #blocked2> %34 = tt.broadcast %8 : (tensor<64x1xi1, #blocked2>) -> tensor<64x64xi1, #blocked2> %35 = arith.muli %11, %cst_4 : tensor<64x1xi32, #blocked2> %36 = tt.broadcast %35 : (tensor<64x1xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %37 = arith.divsi %12, %cst_1 : tensor<64x1xi32, #blocked2> %38 = arith.muli %37, %cst_0 : tensor<64x1xi32, #blocked2> %39 = tt.broadcast %38 : (tensor<64x1xi32, #blocked2>) -> tensor<64x64xi32, #blocked2> %40 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr, #blocked2> %41 = tt.broadcast %30 : (tensor<64x1xf32, #blocked2>) -> tensor<64x64xf32, #blocked2> %42 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr, #blocked2> %43 = tt.splat %arg3 : (!tt.ptr) -> tensor<64x64x!tt.ptr, #blocked2> scf.for %arg6 = %c0 to %c2048 step %c64 { %44 = arith.index_cast %arg6 : index to i32 %45 = tt.splat %44 : (i32) -> tensor<1x64xi32, #blocked3> %46 = arith.addi %45, %10 : tensor<1x64xi32, #blocked3> %47 = "triton_gpu.cmpi"(%46, %cst_2) {predicate = 2 : i64} : (tensor<1x64xi32, #blocked3>, tensor<1x64xi32, #blocked3>) -> tensor<1x64xi1, #blocked3> %48 = tt.broadcast %46 : (tensor<1x64xi32, #blocked3>) -> tensor<64x64xi32, #blocked3> %49 = triton_gpu.convert_layout %48 : (tensor<64x64xi32, #blocked3>) -> tensor<64x64xi32, #blocked2> %50 = arith.addi %49, %32 : tensor<64x64xi32, #blocked2> %51 = tt.addptr %33, %50 : tensor<64x64x!tt.ptr, #blocked2>, tensor<64x64xi32, #blocked2> %52 = tt.broadcast %47 : (tensor<1x64xi1, #blocked3>) -> tensor<64x64xi1, #blocked3> %53 = triton_gpu.convert_layout %52 : (tensor<64x64xi1, #blocked3>) -> tensor<64x64xi1, #blocked2> %54 = arith.andi %53, %34 : tensor<64x64xi1, #blocked2> %55 = triton_gpu.convert_layout %51 : (tensor<64x64x!tt.ptr, #blocked2>) -> tensor<64x64x!tt.ptr, #blocked4> %56 = triton_gpu.convert_layout %54 : (tensor<64x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked4> %57 = tt.load %55, %56 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf16, #blocked4> %58 = triton_gpu.convert_layout %57 : (tensor<64x64xf16, #blocked4>) -> tensor<64x64xf16, #blocked2> %59 = arith.extf %58 : tensor<64x64xf16, #blocked2> to tensor<64x64xf32, #blocked2> %60 = arith.addi %49, %36 : tensor<64x64xi32, #blocked2> %61 = arith.addi %60, %39 : tensor<64x64xi32, #blocked2> %62 = tt.addptr %40, %61 : tensor<64x64x!tt.ptr, #blocked2>, tensor<64x64xi32, #blocked2> %63 = triton_gpu.convert_layout %62 : (tensor<64x64x!tt.ptr, #blocked2>) -> tensor<64x64x!tt.ptr, #blocked5> %64 = triton_gpu.convert_layout %54 : (tensor<64x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked5> %65 = tt.load %63, %64 {cache = 1 : i32, evict = 3 : i32, isVolatile = 
false} : tensor<64x64xf32, #blocked5> %66 = triton_gpu.convert_layout %65 : (tensor<64x64xf32, #blocked5>) -> tensor<64x64xf32, #blocked2> %67 = arith.addf %59, %66 : tensor<64x64xf32, #blocked2> %68 = "triton_gpu.cmpf"(%67, %67) {predicate = 13 : i64} : (tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xi1, #blocked2> %69 = "triton_gpu.cmpf"(%67, %cst) {predicate = 2 : i64} : (tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xi1, #blocked2> %70 = "triton_gpu.select"(%69, %67, %cst) : (tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked2> %71 = "triton_gpu.select"(%68, %67, %70) : (tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked2> %72 = math.exp %71 : tensor<64x64xf32, #blocked2> %73 = arith.divf %72, %41 : tensor<64x64xf32, #blocked2> %74 = tt.addptr %42, %50 : tensor<64x64x!tt.ptr, #blocked2>, tensor<64x64xi32, #blocked2> %75 = triton_gpu.convert_layout %74 : (tensor<64x64x!tt.ptr, #blocked2>) -> tensor<64x64x!tt.ptr, #blocked5> %76 = triton_gpu.convert_layout %73 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked5> %77 = triton_gpu.convert_layout %54 : (tensor<64x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked5> tt.store %75, %76, %77 : tensor<64x64xf32, #blocked5> %78 = tt.addptr %43, %50 : tensor<64x64x!tt.ptr, #blocked2>, tensor<64x64xi32, #blocked2> %79 = arith.truncf %73 : tensor<64x64xf32, #blocked2> to tensor<64x64xf16, #blocked2> %80 = triton_gpu.convert_layout %78 : (tensor<64x64x!tt.ptr, #blocked2>) -> tensor<64x64x!tt.ptr, #blocked4> %81 = triton_gpu.convert_layout %79 : (tensor<64x64xf16, #blocked2>) -> tensor<64x64xf16, #blocked4> %82 = triton_gpu.convert_layout %54 : (tensor<64x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked4> tt.store %80, %81, %82 : tensor<64x64xf16, #blocked4> } return } triton-2.0.0/test/TritonGPU/loop-pipeline.mlir000066400000000000000000000371501440023377100213120ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s // 4 warps // matmul: 128x32 @ 32x128 -> 128x128 #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> #BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> #C = #triton_gpu.mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C}> // CHECK: func @matmul_loop // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 1 : i32 // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 2 : i32 // CHECK-DAG: %[[CONSTANT_3:.*]] = arith.constant 3 : i32 // CHECK-DAG: %[[LOOP_COND_0:.*]] = arith.cmpi slt, %[[LB:.*]], %[[UB:.*]] // CHECK: %[[ABUFFER:.*]] = triton_gpu.alloc_tensor // CHECK-DAG: %[[LOOP_COND_0_SPLAT_A:.*]] = tt.splat %[[LOOP_COND_0]] // CHECK: %[[A0BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_0]], %[[LOOP_COND_0_SPLAT_A]] // CHECK: %[[BBUFFER:.*]] = triton_gpu.alloc_tensor // CHECK-DAG: %[[LOOP_COND_0_SPLAT_B:.*]] = tt.splat %[[LOOP_COND_0]] // CHECK: %[[B0BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_0]], %[[LOOP_COND_0_SPLAT_B]] // CHECK-DAG: %[[IV_1:.*]] = 
arith.addi %[[LB]], %[[STEP:.*]] // CHECK-DAG: %[[LOOP_COND_1:.*]] = arith.cmpi slt, %[[IV_1]], %[[UB]] // CHECK-DAG: %[[LOOP_COND_1_SPLAT_A:.*]] = tt.splat %[[LOOP_COND_1]] // CHECK: %[[A1BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_1]], %[[LOOP_COND_1_SPLAT_A]] // CHECK-DAG: %[[LOOP_COND_1_SPLAT_B:.*]] = tt.splat %[[LOOP_COND_1]] // CHECK: %[[B1BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_1]], %[[LOOP_COND_1_SPLAT_B]] // CHECK: triton_gpu.async_wait {num = 2 : i32} // CHECK: %[[A0:.*]] = tensor.extract_slice %[[A1BUFFER]][0, 0, 0] // CHECK: %[[B0:.*]] = tensor.extract_slice %[[B1BUFFER]][0, 0, 0] // CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_b0:.*]] = %[[B0]], {{.*}}, {{.*}}, {{.*}}, %[[PIPELINE_IDX:.*]] = %[[CONSTANT_2]], %[[LOOP_IDX:.*]] = %[[CONSTANT_1]] // CHECK: %[[arg_a0_dot_op:.*]] = triton_gpu.convert_layout %[[arg_a0]] // CHECK: %[[arg_b0_dot_op:.*]] = triton_gpu.convert_layout %[[arg_b0]] // CHECK: tt.dot %[[arg_a0_dot_op]], %[[arg_b0_dot_op]], {{.*}} // CHECK-DAG: %[[INSERT_IDX:.*]] = arith.remsi %[[PIPELINE_IDX]], %[[CONSTANT_3]] // CHECK-DAG: %[[EXTRACT_INT:.*]] = arith.remsi %[[LOOP_IDX]], %[[CONSTANT_3]] // CHECK-DAG: %[[EXTRACT_IDX:.*]] = arith.index_cast %[[EXTRACT_INT]] : i32 to index // CHECK: %[[NEXT_A_BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[INSERT_IDX]] // CHECK: %[[NEXT_B_BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[INSERT_IDX]] // CHECK: triton_gpu.async_wait {num = 2 : i32} // CHECK: %[[NEXT_A:.*]] = tensor.extract_slice %[[NEXT_A_BUFFER]][%[[EXTRACT_IDX]], 0, 0] // CHECK: %[[NEXT_B:.*]] = tensor.extract_slice %[[NEXT_B_BUFFER]][%[[EXTRACT_IDX]], 0, 0] // CHECK-DAG: %[[NEXT_PIPELINE_IDX:.*]] = arith.addi %[[PIPELINE_IDX]], %[[CONSTANT_1]] // CHECK-DAG: %[[NEXT_LOOP_IDX:.*]] = arith.addi %[[LOOP_IDX]], %[[CONSTANT_1]] // CHECK: scf.yield {{.*}}, {{.*}}, {{.*}}, %[[NEXT_A_BUFFER]], %[[NEXT_B_BUFFER]], %[[NEXT_A]], %[[NEXT_B]], {{.*}}, {{.*}}, {{.*}}, %[[NEXT_PIPELINE_IDX]], %[[NEXT_LOOP_IDX]] func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) { // A ptrs %a_ptr_splat = tt.splat %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : (tensor<32xi32, #ALs0>) -> tensor<1x32xi32, #AL> %a_offs = tt.broadcast %a_tmp1 : (tensor<1x32xi32, #AL>) -> tensor<128x32xi32, #AL> %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> // B ptrs %b_ptr_splat = tt.splat %B : (!tt.ptr) -> tensor<32x128x!tt.ptr, #BL> %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : (tensor<128xi32, #BLs0>) -> tensor<1x128xi32, #BL> %b_offs = tt.broadcast %b_tmp1 : (tensor<1x128xi32, #BL>) -> tensor<32x128xi32, #BL> %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %a_mask = arith.constant dense : tensor<128x32xi1, #AL> %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> %b_mask = arith.constant dense : tensor<32x128xi1, #BL> %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> %b_off = arith.constant 
dense<4> : tensor<32x128xi32, #BL> scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { %a_ = tt.load %a_ptr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A> %b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> %b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B> %c = tt.dot %a, %b, %prev_c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> } return } // CHECK: func @matmul_loop_nested // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 1 : i32 // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 2 : i32 // CHECK-DAG: %[[CONSTANT_3:.*]] = arith.constant 3 : i32 // CHECK: scf.for // CHECK: %[[ABUFFER:.*]] = triton_gpu.alloc_tensor // CHECK: %[[A0BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_0]] // CHECK: %[[BBUFFER:.*]] = triton_gpu.alloc_tensor // CHECK: %[[B0BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_0]] // CHECK: %[[A1BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_1]] // CHECK: %[[B1BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_1]] // CHECK: triton_gpu.async_wait {num = 2 : i32} // CHECK: %[[A0:.*]] = tensor.extract_slice %[[A1BUFFER]][0, 0, 0] // CHECK: %[[B0:.*]] = tensor.extract_slice %[[B1BUFFER]][0, 0, 0] // CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_b0:.*]] = %[[B0]], {{.*}}, {{.*}}, {{.*}}, %[[PIPELINE_IDX:.*]] = %[[CONSTANT_2]], %[[LOOP_IDX:.*]] = %[[CONSTANT_1]] // CHECK: %[[arg_a0_dot_op:.*]] = triton_gpu.convert_layout %[[arg_a0]] // CHECK: %[[arg_b0_dot_op:.*]] = triton_gpu.convert_layout %[[arg_b0]] // CHECK: tt.dot %[[arg_a0_dot_op]], %[[arg_b0_dot_op]], {{.*}} // CHECK-DAG: %[[INSERT_IDX:.*]] = arith.remsi %[[PIPELINE_IDX]], %[[CONSTANT_3]] // CHECK-DAG: %[[EXTRACT_INT:.*]] = arith.remsi %[[LOOP_IDX]], %[[CONSTANT_3]] // CHECK-DAG: %[[EXTRACT_IDX:.*]] = arith.index_cast %[[EXTRACT_INT]] : i32 to index // CHECK: %[[NEXT_A_BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[INSERT_IDX]] // CHECK: %[[NEXT_B_BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[INSERT_IDX]] // CHECK: triton_gpu.async_wait {num = 2 : i32} // CHECK: %[[NEXT_A:.*]] = tensor.extract_slice %[[NEXT_A_BUFFER]][%[[EXTRACT_IDX]], 0, 0] // CHECK: %[[NEXT_B:.*]] = tensor.extract_slice %[[NEXT_B_BUFFER]][%[[EXTRACT_IDX]], 0, 0] // CHECK-DAG: %[[NEXT_PIPELINE_IDX:.*]] = arith.addi %[[PIPELINE_IDX]], %[[CONSTANT_1]] // CHECK-DAG: %[[NEXT_LOOP_IDX:.*]] = arith.addi %[[LOOP_IDX]], %[[CONSTANT_1]] // CHECK: scf.yield {{.*}}, {{.*}}, {{.*}}, %[[NEXT_A_BUFFER]], %[[NEXT_B_BUFFER]], %[[NEXT_A]], %[[NEXT_B]], {{.*}}, {{.*}}, {{.*}}, %[[NEXT_PIPELINE_IDX]], %[[NEXT_LOOP_IDX]] func @matmul_loop_nested(%lb : index, %ub : index, %step : 
index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) { scf.for %iv0 = %lb to %ub step %step { // A ptrs %a_ptr_splat = tt.splat %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : (tensor<32xi32, #ALs0>) -> tensor<1x32xi32, #AL> %a_offs = tt.broadcast %a_tmp1 : (tensor<1x32xi32, #AL>) -> tensor<128x32xi32, #AL> %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> // B ptrs %b_ptr_splat = tt.splat %B : (!tt.ptr) -> tensor<32x128x!tt.ptr, #BL> %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : (tensor<128xi32, #BLs0>) -> tensor<1x128xi32, #BL> %b_offs = tt.broadcast %b_tmp1 : (tensor<1x128xi32, #BL>) -> tensor<32x128xi32, #BL> %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %a_mask = arith.constant dense : tensor<128x32xi1, #AL> %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> %b_mask = arith.constant dense : tensor<32x128xi1, #BL> %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { %a_ = tt.load %a_ptr, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A> %b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> %b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B> %c = tt.dot %a, %b, %prev_c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> } } return } // CHECK: func @matmul_loop_single_pipeline // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 1 : i32 // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 2 : i32 // CHECK-DAG: %[[CONSTANT_3:.*]] = arith.constant 3 : i32 // CHECK: %[[BBUFFER:.*]] = triton_gpu.alloc_tensor // CHECK: %[[B0BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_0]] // CHECK: %[[B1BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[CONSTANT_1]] // CHECK: triton_gpu.async_wait {num = 1 : i32} // CHECK: %[[B0:.*]] = tensor.extract_slice %[[B1BUFFER]][0, 0, 0] // CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_b0:.*]] = %[[B0]], {{.*}}, {{.*}}, %[[PIPELINE_IDX:.*]] = %[[CONSTANT_2]], %[[LOOP_IDX:.*]] = %[[CONSTANT_1]] // CHECK: %[[arg_b0_dot_op:.*]] = triton_gpu.convert_layout %[[arg_b0]] // CHECK: tt.dot {{.*}}, %[[arg_b0_dot_op]], {{.*}} // CHECK-DAG: %[[INSERT_IDX:.*]] = arith.remsi %[[PIPELINE_IDX]], 
%[[CONSTANT_3]] // CHECK-DAG: %[[EXTRACT_INT:.*]] = arith.remsi %[[LOOP_IDX]], %[[CONSTANT_3]] // CHECK-DAG: %[[EXTRACT_IDX:.*]] = arith.index_cast %[[EXTRACT_INT]] : i32 to index // CHECK: %[[NEXT_B_BUFFER:.*]] = triton_gpu.insert_slice_async {{.*}}, {{.*}}, %[[INSERT_IDX]] // CHECK: triton_gpu.async_wait {num = 1 : i32} // CHECK: %[[NEXT_B:.*]] = tensor.extract_slice %[[NEXT_B_BUFFER]][%[[EXTRACT_IDX]], 0, 0] // CHECK-DAG: %[[NEXT_PIPELINE_IDX:.*]] = arith.addi %[[PIPELINE_IDX]], %[[CONSTANT_1]] // CHECK-DAG: %[[NEXT_LOOP_IDX:.*]] = arith.addi %[[LOOP_IDX]], %[[CONSTANT_1]] // CHECK: scf.yield {{.*}}, {{.*}}, %[[NEXT_B_BUFFER]], %[[NEXT_B]], {{.*}}, {{.*}}, %[[NEXT_PIPELINE_IDX]], %[[NEXT_LOOP_IDX]] func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) { // A ptrs %a_ptr_splat = tt.splat %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : (tensor<32xi32, #ALs0>) -> tensor<1x32xi32, #AL> %a_offs = tt.broadcast %a_tmp1 : (tensor<1x32xi32, #AL>) -> tensor<128x32xi32, #AL> %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> // B ptrs %b_ptr_splat = tt.splat %B : (!tt.ptr) -> tensor<32x128x!tt.ptr, #BL> %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : (tensor<128xi32, #BLs0>) -> tensor<1x128xi32, #BL> %b_offs = tt.broadcast %b_tmp1 : (tensor<1x128xi32, #BL>) -> tensor<32x128xi32, #BL> %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %a_mask = arith.constant dense : tensor<128x32xi1, #AL> %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A> %b_mask = arith.constant dense : tensor<32x128xi1, #BL> %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> scf.for %iv = %lb to %ub step %step iter_args(%b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { %b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> %b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B> %c = tt.dot %a, %b, %prev_c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> scf.yield %next_b_ptr, %c : tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> } return } triton-2.0.0/test/TritonGPU/matmul.mlir000066400000000000000000000145721440023377100200400ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu -tritongpu-combine -tritongpu-pipeline=num-stages=3 -tritongpu-combine -test-print-allocation 2>&1 | FileCheck %s // CHECK: offset = 0, size = 49152 // CHECK: offset = 49152, size = 49152 // CHECK: size = 98304 module { func 
@matmul_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32_i32_i32_i32_i32_i32_i32__12c64_13c64_14c64_15c8(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32, %arg10: i32 {tt.divisibility = 16 : i32}, %arg11: i32) { %cst = arith.constant dense : tensor<64x64xi1> %c64 = arith.constant 64 : index %c0 = arith.constant 0 : index %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> %c64_i32 = arith.constant 64 : i32 %c63_i32 = arith.constant 63 : i32 %c8_i32 = arith.constant 8 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.addi %arg3, %c63_i32 : i32 %2 = arith.divsi %1, %c64_i32 : i32 %3 = arith.addi %arg4, %c63_i32 : i32 %4 = arith.divsi %3, %c64_i32 : i32 %5 = arith.muli %4, %c8_i32 : i32 %6 = arith.divsi %0, %5 : i32 %7 = arith.muli %6, %c8_i32 : i32 %8 = arith.subi %2, %7 : i32 %9 = arith.cmpi slt, %8, %c8_i32 : i32 %10 = select %9, %8, %c8_i32 : i32 %11 = arith.remsi %0, %10 : i32 %12 = arith.addi %7, %11 : i32 %13 = arith.remsi %0, %5 : i32 %14 = arith.divsi %13, %10 : i32 %15 = arith.muli %12, %c64_i32 : i32 %16 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> %17 = tt.splat %15 : (i32) -> tensor<64xi32> %18 = arith.addi %17, %16 : tensor<64xi32> %19 = arith.muli %14, %c64_i32 : i32 %20 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> %21 = tt.splat %19 : (i32) -> tensor<64xi32> %22 = arith.addi %21, %20 : tensor<64xi32> %23 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> %24 = tt.expand_dims %18 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> %25 = tt.splat %arg6 : (i32) -> tensor<64x1xi32> %26 = arith.muli %24, %25 : tensor<64x1xi32> %27 = tt.expand_dims %23 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> %28 = tt.splat %arg7 : (i32) -> tensor<1x64xi32> %29 = arith.muli %27, %28 : tensor<1x64xi32> %30 = tt.broadcast %26 : (tensor<64x1xi32>) -> tensor<64x64xi32> %31 = tt.broadcast %29 : (tensor<1x64xi32>) -> tensor<64x64xi32> %32 = arith.addi %30, %31 : tensor<64x64xi32> %33 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x64x!tt.ptr> %34 = tt.addptr %33, %32 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> %35 = tt.expand_dims %23 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> %36 = tt.splat %arg8 : (i32) -> tensor<64x1xi32> %37 = arith.muli %35, %36 : tensor<64x1xi32> %38 = tt.expand_dims %22 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> %39 = tt.splat %arg9 : (i32) -> tensor<1x64xi32> %40 = arith.muli %38, %39 : tensor<1x64xi32> %41 = tt.broadcast %37 : (tensor<64x1xi32>) -> tensor<64x64xi32> %42 = tt.broadcast %40 : (tensor<1x64xi32>) -> tensor<64x64xi32> %43 = arith.addi %41, %42 : tensor<64x64xi32> %44 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x64x!tt.ptr> %45 = tt.addptr %44, %43 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> %46 = arith.index_cast %arg5 : i32 to index %47:3 = scf.for %arg12 = %c0 to %46 step %c64 iter_args(%arg13 = %cst_0, %arg14 = %34, %arg15 = %45) -> (tensor<64x64xf32>, tensor<64x64x!tt.ptr>, tensor<64x64x!tt.ptr>) { %76 = tt.load %arg14, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false, transA=false, transB=false} : tensor<64x64xf32> %77 = tt.load %arg15, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false, transA=false, transB=false} : tensor<64x64xf32> %78 = tt.dot %76, %77, %cst_0 {allowTF32 = true, transA = false, 
transB = false} : tensor<64x64xf32> * tensor<64x64xf32> -> tensor<64x64xf32> %79 = arith.addf %arg13, %78 : tensor<64x64xf32> %80 = arith.muli %arg7, %c64_i32 : i32 %81 = tt.splat %80 : (i32) -> tensor<64x64xi32> %82 = tt.addptr %arg14, %81 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> %83 = arith.muli %arg8, %c64_i32 : i32 %84 = tt.splat %83 : (i32) -> tensor<64x64xi32> %85 = tt.addptr %arg15, %84 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> scf.yield %79, %82, %85 : tensor<64x64xf32>, tensor<64x64x!tt.ptr>, tensor<64x64x!tt.ptr> } %48 = arith.muli %12, %c64_i32 : i32 %49 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> %50 = tt.splat %48 : (i32) -> tensor<64xi32> %51 = arith.addi %50, %49 : tensor<64xi32> %52 = arith.muli %14, %c64_i32 : i32 %53 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> %54 = tt.splat %52 : (i32) -> tensor<64xi32> %55 = arith.addi %54, %53 : tensor<64xi32> %56 = tt.expand_dims %51 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> %57 = tt.splat %arg10 : (i32) -> tensor<64x1xi32> %58 = arith.muli %57, %56 : tensor<64x1xi32> %59 = tt.expand_dims %55 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> %60 = tt.splat %arg11 : (i32) -> tensor<1x64xi32> %61 = arith.muli %59, %60 : tensor<1x64xi32> %62 = tt.broadcast %58 : (tensor<64x1xi32>) -> tensor<64x64xi32> %63 = tt.broadcast %61 : (tensor<1x64xi32>) -> tensor<64x64xi32> %64 = arith.addi %62, %63 : tensor<64x64xi32> %65 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x64x!tt.ptr> %66 = tt.addptr %65, %64 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> %67 = tt.expand_dims %51 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> %68 = tt.splat %arg3 : (i32) -> tensor<64x1xi32> %69 = arith.cmpi slt, %67, %68 : tensor<64x1xi32> %70 = tt.expand_dims %55 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> %71 = tt.splat %arg4 : (i32) -> tensor<1x64xi32> %72 = arith.cmpi slt, %70, %71 : tensor<1x64xi32> %73 = tt.broadcast %69 : (tensor<64x1xi1>) -> tensor<64x64xi1> %74 = tt.broadcast %72 : (tensor<1x64xi1>) -> tensor<64x64xi1> %75 = arith.andi %73, %74 : tensor<64x64xi1> tt.store %66, %47#0, %75 : tensor<64x64xf32> return } } triton-2.0.0/test/TritonGPU/prefetch.mlir000066400000000000000000000114771440023377100203420ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file -tritongpu-prefetch | FileCheck %s // 4 warps // matmul: 128x32 @ 32x128 -> 128x128 #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #A = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #B = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> #C = #triton_gpu.mma<{version = 2, warpsPerCTA = [4, 1]}> #A_OP = #triton_gpu.dot_op<{opIdx = 0, parent = #C}> #B_OP = #triton_gpu.dot_op<{opIdx = 1, parent = #C}> // CHECK: func @matmul_loop // CHECK-DAG: %[[A0_PREFETCH_SMEM:.*]] = tensor.extract_slice %[[A0:.*]][0, 0] [128, 16] // CHECK-DAG: %[[A0_PREFETCH:.*]] = triton_gpu.convert_layout %[[A0_PREFETCH_SMEM]] // CHECK-DAG: %[[B0_PREFETCH_SMEM:.*]] = tensor.extract_slice %[[B0:.*]][0, 0] [16, 128] // CHECK-DAG: %[[B0_PREFETCH:.*]] = triton_gpu.convert_layout %[[B0_PREFETCH_SMEM]] // CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_b0:.*]] = %[[B0]], {{.*}}, %[[a0_prefetch:.*]] = %[[A0_PREFETCH]], %[[b0_prefetch:.*]] = %[[B0_PREFETCH]] // CHECK-DAG: 
%[[A_REM_SMEM:.*]] = tensor.extract_slice %[[arg_a0]][0, 16] [128, 16] // CHECK-DAG: %[[A_REM:.*]] = triton_gpu.convert_layout %[[A_REM_SMEM]] // CHECK-DAG: %[[B_REM_SMEM:.*]] = tensor.extract_slice %[[arg_b0]][16, 0] [16, 128] // CHECK-DAG: %[[B_REM:.*]] = triton_gpu.convert_layout %[[B_REM_SMEM]] // CHECK: %[[D_FIRST:.*]] = tt.dot %[[a0_prefetch]], %[[b0_prefetch:.*]], {{.*}} // CHECK: tt.dot %[[A_REM]], %[[B_REM]], %[[D_FIRST:.*]] // CHECK-DAG: %[[NEXT_A_PREFETCH_SMEM:.*]] = tensor.extract_slice {{.*}}[0, 0] [128, 16] // CHECK-DAG: %[[NEXT_A_PREFETCH:.*]] = triton_gpu.convert_layout %[[NEXT_A_PREFETCH_SMEM]] // CHECK-DAG: %[[NEXT_B_PREFETCH_SMEM:.*]] = tensor.extract_slice {{.*}}[0, 0] [16, 128] // CHECK-DAG: %[[NEXT_B_PREFETCH:.*]] = triton_gpu.convert_layout %[[NEXT_B_PREFETCH_SMEM]] // CHECK: scf.yield {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[NEXT_A_PREFETCH]], %[[NEXT_B_PREFETCH]] func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr, %B : !tt.ptr) { %a_ptr_init = tt.broadcast %A : (!tt.ptr) -> tensor<128x32x!tt.ptr, #AL> %b_ptr_init = tt.broadcast %B : (!tt.ptr) -> tensor<32x128x!tt.ptr, #BL> %a_mask = arith.constant dense : tensor<128x32xi1, #AL> %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> %b_mask = arith.constant dense : tensor<32x128xi1, #BL> %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> %a_ = tt.load %a_ptr_init, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %a_init = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A> %b_ = tt.load %b_ptr_init, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> %b_init = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B> scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %a = %a_init, %b = %b_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x32xf16, #A>, tensor<32x128xf16, #B>, tensor<128x128xf32, #C>) { %a_op = triton_gpu.convert_layout %a : (tensor<128x32xf16, #A>) -> tensor<128x32xf16, #A_OP> %b_op = triton_gpu.convert_layout %b : (tensor<32x128xf16, #B>) -> tensor<32x128xf16, #B_OP> %c = tt.dot %a_op, %b_op, %prev_c {allowTF32 = true, transA = false, transB = false} : tensor<128x32xf16, #A_OP> * tensor<32x128xf16, #B_OP> -> tensor<128x128xf32, #C> %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> %next_a_ = tt.load %next_a_ptr, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL> %next_a = triton_gpu.convert_layout %next_a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A> %next_b_ = tt.load %next_b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL> %next_b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B> scf.yield %next_a_ptr, %next_b_ptr, %next_a, %next_b, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x32xf16, #A>, tensor<32x128xf16, #B>, tensor<128x128xf32, #C> } return } 
triton-2.0.0/test/TritonGPU/update-mma-for-volta.mlir000066400000000000000000000124331440023377100224740ustar00rootroot00000000000000// RUN: triton-opt %s -split-input-file -tritongpu-combine -tritongpu-update-mma-for-volta 2>&1 | FileCheck %s // ----- // check the UpdateMMAVersionMinorForVolta pattern #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 4], order = [1, 0]}> #shared0 = #triton_gpu.shared<{vec = 1, perPhase=2, maxPhase=8 ,order = [1, 0]}> #mma0 = #triton_gpu.mma<{versionMajor=1, versionMinor=0, warpsPerCTA=[4,4]}> // Here, the isMMAv1Row of a and b's dot_operands mismatch #mma0's versionMinor, // and the pattern should update the versionMinor. #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma0, isMMAv1Row=true}> #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma0, isMMAv1Row=false}> // It creates a new MMA layout to fit with $a and $b's dot_operand, and get the right warpsPerCTA // The ID of this MMA instance should be 0. // CHECK: [[new_mma:#mma.*]] = #triton_gpu.mma<{versionMajor = 1, versionMinor = 3, warpsPerCTA = [4, 2]}> module attributes {"triton_gpu.num-warps" = 16 : i32} { // CHECK-LABEL: dot_mmav1 func @dot_mmav1(%A: tensor<64x64xf16, #blocked0>, %B: tensor<64x64xf16, #blocked0>) -> tensor<64x64xf32, #blocked0> { %C = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked0> %AA = triton_gpu.convert_layout %A : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_a> %BB = triton_gpu.convert_layout %B : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_b> %CC = triton_gpu.convert_layout %C : (tensor<64x64xf32, #blocked0>) -> tensor<64x64xf32, #mma0> // CHECK: {{.*}} = tt.dot {{.*}}, {{.*}}, %cst {allowTF32 = true} : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = [[new_mma]], isMMAv1Row = true}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = [[new_mma]], isMMAv1Row = true}>> -> tensor<64x64xf32, [[new_mma]]> %D = tt.dot %AA, %BB, %CC {allowTF32 = true} : tensor<64x64xf16, #dot_operand_a> * tensor<64x64xf16, #dot_operand_b> -> tensor<64x64xf32, #mma0> %res = triton_gpu.convert_layout %D : (tensor<64x64xf32, #mma0>) -> tensor<64x64xf32, #blocked0> return %res : tensor<64x64xf32, #blocked0> } } // ----- // Check id in multiple MMA layout instances #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 4], order = [1, 0]}> #shared0 = #triton_gpu.shared<{vec = 1, perPhase=2, maxPhase=8 ,order = [1, 0]}> #mma0 = #triton_gpu.mma<{versionMajor=1, versionMinor=0, warpsPerCTA=[4,4]}> // mma id=1, with all other boolean flags be false, should get a versionMinor of 16(= 1 * 1<<4) #mma1 = #triton_gpu.mma<{versionMajor=1, versionMinor=16, warpsPerCTA=[4,4]}> // Will still get two MMA layouts // CHECK: [[new_mma:#mma.*]] = #triton_gpu.mma<{versionMajor = 1, versionMinor = 3, warpsPerCTA = [4, 2]}> // CHECK: [[new_mma1:#mma.*]] = #triton_gpu.mma<{versionMajor = 1, versionMinor = 19, warpsPerCTA = [4, 2]}> #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma0, isMMAv1Row=true}> #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma0, isMMAv1Row=false}> #dot_operand_a1 = #triton_gpu.dot_op<{opIdx=0, parent=#mma1, isMMAv1Row=true}> #dot_operand_b1 = #triton_gpu.dot_op<{opIdx=1, parent=#mma1, isMMAv1Row=false}> module attributes {"triton_gpu.num-warps" = 16 : i32} { // CHECK-LABEL: dot_mmav1 func @dot_mmav1(%A: tensor<64x64xf16, #blocked0>, %B: tensor<64x64xf16, #blocked0>) -> tensor<64x64xf32, #blocked0> { %C = 
arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked0> %AA = triton_gpu.convert_layout %A : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_a> %BB = triton_gpu.convert_layout %B : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_b> %CC = triton_gpu.convert_layout %C : (tensor<64x64xf32, #blocked0>) -> tensor<64x64xf32, #mma0> %AA1 = triton_gpu.convert_layout %A : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_a1> %BB1 = triton_gpu.convert_layout %B : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_b1> %CC1 = triton_gpu.convert_layout %C : (tensor<64x64xf32, #blocked0>) -> tensor<64x64xf32, #mma1> // CHECK: {{.*}} = tt.dot {{.*}}, {{.*}}, {{.*}} {allowTF32 = true} : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = [[new_mma]], isMMAv1Row = true}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = [[new_mma]], isMMAv1Row = true}>> -> tensor<64x64xf32, [[new_mma]]> // CHECK: {{.*}} = tt.dot {{.*}}, {{.*}}, {{.*}} {allowTF32 = true} : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = [[new_mma1]], isMMAv1Row = true}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = [[new_mma1]], isMMAv1Row = true}>> -> tensor<64x64xf32, [[new_mma1]]> %D = tt.dot %AA, %BB, %CC {allowTF32 = true} : tensor<64x64xf16, #dot_operand_a> * tensor<64x64xf16, #dot_operand_b> -> tensor<64x64xf32, #mma0> %D1 = tt.dot %AA1, %BB1, %CC1 {allowTF32 = true} : tensor<64x64xf16, #dot_operand_a1> * tensor<64x64xf16, #dot_operand_b1> -> tensor<64x64xf32, #mma1> %res = triton_gpu.convert_layout %D : (tensor<64x64xf32, #mma0>) -> tensor<64x64xf32, #blocked0> %res1 = triton_gpu.convert_layout %D1 : (tensor<64x64xf32, #mma1>) -> tensor<64x64xf32, #blocked0> %sum = arith.addf %res, %res1 : tensor<64x64xf32, #blocked0> return %sum : tensor<64x64xf32, #blocked0> } } triton-2.0.0/test/lib/000077500000000000000000000000001440023377100145565ustar00rootroot00000000000000triton-2.0.0/test/lib/Analysis/000077500000000000000000000000001440023377100163415ustar00rootroot00000000000000triton-2.0.0/test/lib/Analysis/CMakeLists.txt000066400000000000000000000002451440023377100211020ustar00rootroot00000000000000add_mlir_library(TritonTestAnalysis TestAlias.cpp TestAxisInfo.cpp TestAllocation.cpp TestMembar.cpp LINK_LIBS PUBLIC TritonAnalysis ${dialect_libs} )triton-2.0.0/test/lib/Analysis/TestAlias.cpp000066400000000000000000000054031440023377100207400ustar00rootroot00000000000000#include "mlir/IR/AsmState.h" #include "mlir/Pass/Pass.h" #include "triton/Analysis/Alias.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" using namespace mlir; namespace { struct TestAliasPass : public PassWrapper> { // LLVM15+ // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAliasPass); static void print(StringRef name, SmallVector &vals, raw_ostream &os) { if (vals.empty()) return; os << name << " -> "; size_t i = 0; for (auto val : vals) { if (i != 0) os << ","; os << val; ++i; } os << "\n"; } StringRef getArgument() const final { return "test-print-alias"; } StringRef getDescription() const final { return "print the result of the alias analysis pass"; } void runOnOperation() override { Operation *operation = getOperation(); auto &os = llvm::errs(); auto opName = SymbolTable::getSymbolName(operation).getValue().str(); os << opName << "\n"; SharedMemoryAliasAnalysis analysis(&getContext()); analysis.run(operation); AsmState state(operation->getParentOfType()); // Get operation ids of value's aliases auto 
getAllocOpNames = [&](Value value) {
      LatticeElement<AliasInfo> *latticeElement =
          analysis.lookupLatticeElement(value);
      SmallVector<std::string> opNames;
      if (latticeElement) {
        auto &info = latticeElement->getValue();
        if (!info.getAllocs().empty()) {
          for (auto &alias : info.getAllocs()) {
            auto opName =
                getValueOperandName(alias.getDefiningOp()->getResult(0), state);
            opNames.push_back(std::move(opName));
          }
        }
      }
      // Ensure deterministic output
      std::sort(opNames.begin(), opNames.end());
      return opNames;
    };

    operation->walk([&](Operation *op) {
      if (op->getNumResults() < 1)
        return;
      if (auto forOp = dyn_cast<scf::ForOp>(op)) {
        for (auto arg : llvm::enumerate(forOp.getRegionIterArgs())) {
          auto operand = forOp.getOpOperandForRegionIterArg(arg.value()).get();
          auto opNames = getAllocOpNames(operand);
          auto argName = getValueOperandName(arg.value(), state);
          print(argName, opNames, os);
        }
      }
      for (auto result : llvm::enumerate(op->getResults())) {
        auto opNames = getAllocOpNames(result.value());
        auto resultName = getValueOperandName(result.value(), state);
        print(resultName, opNames, os);
      }
    });
  }
};

} // namespace

namespace mlir {
namespace test {
void registerTestAliasPass() { PassRegistration<TestAliasPass>(); }
} // namespace test
} // namespace mlir
triton-2.0.0/test/lib/Analysis/TestAllocation.cpp000066400000000000000000000034171440023377100217770ustar00rootroot00000000000000#include "mlir/Pass/Pass.h"
#include "triton/Analysis/Allocation.h"

using namespace mlir;

namespace {

struct TestAllocationPass
    : public PassWrapper<TestAllocationPass, OperationPass<FuncOp>> {
  // LLVM15+
  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAllocationPass);

  StringRef getArgument() const final { return "test-print-allocation"; }
  StringRef getDescription() const final {
    return "print the result of the allocation pass";
  }

  void runOnOperation() override {
    Operation *operation = getOperation();
    auto &os = llvm::errs();
    // Convert to std::string can remove quotes from opName
    auto opName = SymbolTable::getSymbolName(operation).getValue().str();
    os << opName << "\n";
    Allocation allocation(operation);
    operation->walk([&](Operation *op) {
      auto scratchBufferId = allocation.getBufferId(op);
      if (scratchBufferId != Allocation::InvalidBufferId) {
        size_t offset = allocation.getOffset(scratchBufferId);
        size_t size = allocation.getAllocatedSize(scratchBufferId);
        os << "scratch offset = " << offset << ", size = " << size << "\n";
      }
      if (op->getNumResults() < 1)
        return;
      for (Value result : op->getResults()) {
        auto bufferId = allocation.getBufferId(result);
        if (bufferId != Allocation::InvalidBufferId) {
          size_t offset = allocation.getOffset(bufferId);
          size_t size = allocation.getAllocatedSize(bufferId);
          os << "offset = " << offset << ", size = " << size << "\n";
        }
      }
    });
    os << "size = " << allocation.getSharedMemorySize() << "\n";
  }
};

} // namespace

namespace mlir {
namespace test {
void registerTestAllocationPass() { PassRegistration<TestAllocationPass>(); }
} // namespace test
} // namespace mlir
triton-2.0.0/test/lib/Analysis/TestAxisInfo.cpp000066400000000000000000000041621440023377100214300ustar00rootroot00000000000000#include "mlir/Pass/Pass.h"
#include "triton/Analysis/AxisInfo.h"

using namespace mlir;

namespace {

struct TestAxisInfoPass
    : public PassWrapper<TestAxisInfoPass, OperationPass<FuncOp>> {
  // LLVM15+
  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAlignmentPass);

  void print(const std::string &name, raw_ostream &os,
             ArrayRef<int64_t> vals) {
    os << name << ": [";
    for (size_t d = 0; d < vals.size(); d++) {
      if (d != 0)
        os << ", ";
      os << vals[d];
    }
    os << "]";
  }

  StringRef getArgument() const final { return "test-print-alignment"; }
  StringRef getDescription() const final {
    return "print the result of the alignment analysis pass";
  }

  void runOnOperation() override {
    Operation *operation = getOperation();
    auto &os = llvm::errs();
    auto opName = SymbolTable::getSymbolName(operation).getValue().str();
    os << opName << "\n";
    AxisInfoAnalysis analysis(&getContext());
    analysis.run(operation);
    operation->walk([&](Operation *op) {
      if (op->getNumResults() < 1)
        return;
      for (Value result : op->getResults()) {
        // std::ostringstream oss;
        // result.print(oss);
        // os << " => ";
        LatticeElement<AxisInfo> *latticeElement =
            analysis.lookupLatticeElement(result);
        if (!latticeElement) {
          os << "None\n";
          return;
        }
        AxisInfo &info = latticeElement->getValue();
        print("Contiguity", os, info.getContiguity());
        os << " ; ";
        print("Divisibility", os, info.getDivisibility());
        os << " ; ";
        print("Constancy", os, info.getConstancy());
        os << " ; ";
        auto constantValue = info.getConstantValue();
        os << "ConstantValue: [";
        if (constantValue.has_value())
          os << constantValue.value();
        else
          os << "None";
        os << "] ( ";
        result.print(os);
        os << " ) ";
        os << "\n";
      }
    });
  }
};

} // namespace

namespace mlir {
namespace test {
void registerTestAlignmentPass() { PassRegistration<TestAxisInfoPass>(); }
} // namespace test
} // namespace mlir
triton-2.0.0/test/lib/Analysis/TestMembar.cpp000066400000000000000000000026261440023377100211160ustar00rootroot00000000000000#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Pass/Pass.h"
#include "triton/Analysis/Allocation.h"
#include "triton/Analysis/Membar.h"

using namespace mlir;

namespace {

struct TestMembarPass
    : public PassWrapper<TestMembarPass, OperationPass<FuncOp>> {
  // LLVM15+
  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMembarPass);

  StringRef getArgument() const final { return "test-print-membar"; }
  StringRef getDescription() const final {
    return "print the result of the allocation pass";
  }

  void runOnOperation() override {
    Operation *operation = getOperation();
    auto &os = llvm::errs();
    // Convert to std::string can remove quotes from op_name
    auto opName = SymbolTable::getSymbolName(operation).getValue().str();
    os << opName << "\n";
    Allocation allocation(operation);
    MembarAnalysis membarPass(&allocation);
    membarPass.run();

    size_t operationId = 0;
    operation->walk([&](Operation *op) {
      if (isa<gpu::BarrierOp>(op)) {
        os << "Membar " << operationId << "\n";
      }
      if (op->getNumRegions() == 0) {
        // Don't count parent Operation to simplify the test.
        operationId++;
      }
      return;
    });
  }
};

} // namespace

namespace mlir {
namespace test {
void registerTestMembarPass() { PassRegistration<TestMembarPass>(); }
} // namespace test
} // namespace mlir
triton-2.0.0/test/lib/CMakeLists.txt000066400000000000000000000000321440023377100173110ustar00rootroot00000000000000add_subdirectory(Analysis)
triton-2.0.0/test/lit.cfg.py000066400000000000000000000042051440023377100157110ustar00rootroot00000000000000# -*- Python -*-

import os
import platform
import re
import subprocess
import tempfile

import lit.formats
import lit.util
from lit.llvm import llvm_config
from lit.llvm.subst import ToolSubst
from lit.llvm.subst import FindTool

# Configuration file for the 'lit' test runner

# name: The name of this test suite
config.name = 'TRITON'

config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)

# suffixes: A list of file extensions to treat as test files.
config.suffixes = ['.mlir']

# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)

# test_exec_root: The root path where tests should be run.
config.test_exec_root = os.path.join(config.triton_obj_root, 'test')

config.substitutions.append(('%PATH%', config.environment['PATH']))
config.substitutions.append(('%shlibext', config.llvm_shlib_ext))

llvm_config.with_system_environment(
    ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])

# llvm_config.use_default_substitutions()

# excludes: A list of directories to exclude from the testsuite. The 'Inputs'
# subdirectories contain auxiliary inputs for various tests in their parent
# directories.
config.excludes = ['Inputs', 'Examples', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt']

# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)

# test_exec_root: The root path where tests should be run.
config.test_exec_root = os.path.join(config.triton_obj_root, 'test')
config.triton_tools_dir = os.path.join(config.triton_obj_root, 'bin')
config.filecheck_dir = os.path.join(config.triton_obj_root, 'bin', 'FileCheck')
tool_dirs = [config.triton_tools_dir, config.llvm_tools_dir, config.filecheck_dir]

# Tweak the PATH to include the tools dir.
for d in tool_dirs:
    llvm_config.with_environment('PATH', d, append_path=True)
tools = [
    'triton-opt',
    ToolSubst('%PYTHON', config.python_executable, unresolved='ignore'),
]

llvm_config.add_tool_substitutions(tools, tool_dirs)

# TODO: what's this?
llvm_config.with_environment('PYTHONPATH', [
    os.path.join(config.mlir_binary_dir, 'python_packages', 'triton'),
], append_path=True)
triton-2.0.0/test/lit.site.cfg.py.in000066400000000000000000000013341440023377100172610ustar00rootroot00000000000000@LIT_SITE_CFG_IN_HEADER@

import sys

config.triton_obj_root = "@TRITON_BINARY_DIR@"
config.llvm_src_root = "@LLVM_SOURCE_DIR@"
config.llvm_obj_root = "@LLVM_BINARY_DIR@"
config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
config.llvm_lib_dir = "@LLVM_LIBS_DIR@"
config.llvm_shlib_dir = "@SHLIBDIR@"
config.llvm_shlib_ext = "@SHLIBEXT@"
config.llvm_exe_ext = "@EXEEXT@"
config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
config.mlir_binary_dir = "@MLIR_BINARY_DIR@"
config.python_executable = "@Python3_EXECUTABLE@"
config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@

import lit.llvm
lit.llvm.initialize(lit_config, config)

# Let the main config do the real work
lit_config.load_config(config, "@TRITON_SOURCE_DIR@/test/lit.cfg.py")
triton-2.0.0/unittest/000077500000000000000000000000001440023377100147105ustar00rootroot00000000000000
triton-2.0.0/unittest/Analysis/000077500000000000000000000000001440023377100164735ustar00rootroot00000000000000
triton-2.0.0/unittest/Analysis/CMakeLists.txt000066400000000000000000000001301440023377100212250ustar00rootroot00000000000000add_triton_ut(
  NAME TestTritonAnalysis
  SRCS UtilityTest.cpp
  LIBS TritonAnalysis
)
triton-2.0.0/unittest/Analysis/UtilityTest.cpp000066400000000000000000000013441440023377100215040ustar00rootroot00000000000000//===- UtilityTest.cpp - Tests for
// Utility----------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "triton/Analysis/Utility.h"

#include <gtest/gtest.h>

namespace mlir {

TEST(Analysis, reorder) {
  SmallVector<int64_t> shape({10, 20, 30});
  {
    SmallVector<unsigned> order({2, 1, 0});
    auto reordered = reorder(shape, order);
    EXPECT_EQ(reordered[0], 30);
    EXPECT_EQ(reordered[1], 20);
    EXPECT_EQ(reordered[2], 10);
  }
  {
    SmallVector<unsigned> order({1, 0, 2});
    auto reordered = reorder(shape, order);
    EXPECT_EQ(reordered[0], 20);
    EXPECT_EQ(reordered[1], 10);
    EXPECT_EQ(reordered[2], 30);
  }
}

} // namespace mlir
triton-2.0.0/unittest/CMakeLists.txt000066400000000000000000000012131440023377100174450ustar00rootroot00000000000000 include (${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake) include(GoogleTest) enable_testing() function(add_triton_ut) set(options) set(oneValueArgs NAME) set(multiValueArgs SRCS LIBS) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${__NAME} COMMAND ${__NAME}) add_executable( ${__NAME} ${__SRCS}) target_link_libraries( ${__NAME} PRIVATE GTest::gtest_main gmock ${__LIBS}) gtest_discover_tests(${__NAME}) endfunction() add_subdirectory(Analysis) add_subdirectory(Conversion) add_subdirectory(Dialect) triton-2.0.0/unittest/Conversion/000077500000000000000000000000001440023377100170355ustar00rootroot00000000000000triton-2.0.0/unittest/Conversion/CMakeLists.txt000066400000000000000000000000421440023377100215710ustar00rootroot00000000000000add_subdirectory(TritonGPUToLLVM) triton-2.0.0/unittest/Conversion/TritonGPUToLLVM/000077500000000000000000000000001440023377100216665ustar00rootroot00000000000000triton-2.0.0/unittest/Conversion/TritonGPUToLLVM/CMakeLists.txt000066400000000000000000000001311440023377100244210ustar00rootroot00000000000000add_triton_ut( NAME TestPtxAsmFormat SRCS PTXAsmFormatTest.cpp LIBS TritonGPUToLLVM ) triton-2.0.0/unittest/Conversion/TritonGPUToLLVM/PTXAsmFormatTest.cpp000066400000000000000000000101371440023377100255210ustar00rootroot00000000000000#include "triton/Conversion/TritonGPUToLLVM/PTXAsmFormat.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/IR/Builders.h" #include "triton/Dialect/Triton/IR/Dialect.h" #include namespace mlir { namespace triton { class PTXAsmFormatTest : public ::testing::Test { protected: static constexpr int numValues = 4; PTXAsmFormatTest() { ctx.loadDialect(); createValues(); } // Creates the test values. void createValues() { OpBuilder builder(&ctx); builder.setInsertionPointToStart(&block); // a b1 value for predicate. v[0] = builder.create(builder.getUnknownLoc(), 1, 1); for (int i = 0; i < numValues; i++) { v[i + 1] = builder.create(builder.getUnknownLoc(), i, 32); } } MLIRContext ctx; Block block; Value v[numValues + 1]; }; TEST_F(PTXAsmFormatTest, basic) { PTXBuilder builder; // Create the operands needed by the instructions in the PTX code. 
auto *cst = builder.newConstantOperand(1); auto *val = builder.newOperand(v[1], "=r"); // create an instruction auto &mov = *builder.create("mov.b16"); mov(val, cst).predicate(v[0]); ASSERT_EQ(builder.dump(), "@$1 mov.b16 $0, 0x1;"); auto values = builder.getAllMLIRArgs(); ASSERT_EQ(values[0], v[1]); // $0 -> v[1] ASSERT_EQ(values[1], v[0]); // $1 -> v[0] auto constraints = builder.getConstraints(); ASSERT_EQ(constraints, "=r,b"); // $0 -> =r, $1 -> b } TEST_F(PTXAsmFormatTest, complexInstruction) { using triton::CacheModifier; using triton::EvictionPolicy; PTXBuilder builder; int width = 16; int nWords = 2; Value predicateVal = v[0]; Value addrVal = v[1]; auto addr = builder.newAddrOperand(addrVal, "l", 128 /*offset*/); bool isVolatile = false; auto cache = triton::CacheModifier::CA; auto cachePriority = triton::EvictionPolicy::EVICT_FIRST; bool hasL2EvictPolicy = true; auto &ld = builder .create<>("ld") // ->o("volatile", isVolatile) .global() .o("ca", cache == CacheModifier::CA) .o("cg", cache == CacheModifier::CG) .o("L1::evict_first", cachePriority == EvictionPolicy::EVICT_FIRST) .o("L1::evict_last", cachePriority == EvictionPolicy::EVICT_LAST) .o("L1::cache_hint", hasL2EvictPolicy) .v(nWords) .b(width); // Link the instruction to operands ld(addr).predicate(predicateVal); EXPECT_EQ( builder.dump(), "@$1 ld.global.ca.L1::evict_first.L1::cache_hint.v2.b16 [ $0 + 128 ];"); auto values = builder.getAllMLIRArgs(); EXPECT_EQ(values[0], addrVal); // $0 -> predicate EXPECT_EQ(values[1], predicateVal); // $1 -> addr EXPECT_EQ(builder.getConstraints(), "l,b"); } TEST_F(PTXAsmFormatTest, MultiLinePTX) { PTXBuilder builder; auto *constVal = builder.newConstantOperand(1); auto *valVal0 = builder.newOperand(v[1], "=r"); auto *valVal1 = builder.newOperand(v[2], "=r"); auto &mov = *builder.create("mov"); mov(valVal0, constVal); mov(valVal1, constVal); mov(valVal1, valVal0); EXPECT_EQ(builder.dump(), "mov $0, 0x1;\n\t" "mov $1, 0x1;\n\t" "mov $1, $0;"); auto values = builder.getAllMLIRArgs(); EXPECT_EQ(values[0], v[1]); // $0 -> v[1] EXPECT_EQ(values[1], v[2]); // $1 -> v[2] } TEST_F(PTXAsmFormatTest, onlyAttachMLIRArgs) { PTXBuilder builder; const char *ptxCode = ".param .b64 param0;\n" // prepare param0 (format string) "st.param.b64 [param0], %0;\n" "st.param.b64 [param0], %1;\n" "st.param.b64 [param0], %2;\n"; auto &ptxSnippet = *builder.create(ptxCode); auto *opr0 = builder.newOperand(v[0], "r"); auto *opr1 = builder.newOperand(v[1], "r"); auto *opr2 = builder.newOperand(v[2], "r"); ptxSnippet({opr1, opr2, opr0}, true); EXPECT_EQ(builder.dump(), ptxCode); ASSERT_EQ(builder.getAllMLIRArgs()[0], v[1]); ASSERT_EQ(builder.getAllMLIRArgs()[1], v[2]); ASSERT_EQ(builder.getAllMLIRArgs()[2], v[0]); ASSERT_EQ(builder.getAllMLIRArgs().size(), 3); } } // namespace triton } // namespace mlir triton-2.0.0/unittest/Dialect/000077500000000000000000000000001440023377100162555ustar00rootroot00000000000000triton-2.0.0/unittest/Dialect/CMakeLists.txt000066400000000000000000000000341440023377100210120ustar00rootroot00000000000000add_subdirectory(TritonGPU) triton-2.0.0/unittest/Dialect/TritonGPU/000077500000000000000000000000001440023377100201105ustar00rootroot00000000000000triton-2.0.0/unittest/Dialect/TritonGPU/CMakeLists.txt000066400000000000000000000001611440023377100226460ustar00rootroot00000000000000add_triton_ut( NAME TestSwizzling SRCS SwizzleTest.cpp LIBS TritonGPUIR ${dialect_libs} ${conversion_libs} ) 
triton-2.0.0/unittest/Dialect/TritonGPU/SwizzleTest.cpp000066400000000000000000000035071440023377100231300ustar00rootroot00000000000000#include "triton/Dialect/TritonGPU/IR/Dialect.h"

#include <gtest/gtest.h>

using namespace mlir;
using mlir::triton::gpu::SharedEncodingAttr;

struct swizzleParams {
  int vec;
  int perPhase;
  int maxPhase;
};

struct ParamT {
  std::array<int64_t, 2> shape;
  int opIdx;
  int typeWidth;
  swizzleParams refSwizzle;
};

class SwizzleDotOperandTestFixture
    : public ::testing::TestWithParam<ParamT> {
protected:
  ParamType param;
};

TEST_P(SwizzleDotOperandTestFixture, DotOperands) {
  auto params = GetParam();
  // init context
  MLIRContext ctx;
  ctx.loadDialect<triton::gpu::TritonGPUDialect>();
  // create encoding
  auto parent = triton::gpu::MmaEncodingAttr::get(&ctx, 2, 0, {1, 1});
  auto encoding =
      triton::gpu::DotOperandEncodingAttr::get(&ctx, params.opIdx, parent);

  // create element type
  Type eltType = IntegerType::get(&ctx, params.typeWidth);
  auto layout =
      SharedEncodingAttr::get(&ctx, encoding, params.shape, {1, 0}, eltType);

  ASSERT_EQ(layout.getVec(), params.refSwizzle.vec);
  ASSERT_EQ(layout.getPerPhase(), params.refSwizzle.perPhase);
  ASSERT_EQ(layout.getMaxPhase(), params.refSwizzle.maxPhase);
}

INSTANTIATE_TEST_SUITE_P(TestDotOperands, SwizzleDotOperandTestFixture,
                         ::testing::Values(ParamT{{128, 64}, 0, 16, {8, 1, 8}},
                                           ParamT{{64, 256}, 1, 16, {8, 1, 8}},
                                           ParamT{{128, 32}, 0, 16, {8, 2, 4}},
                                           ParamT{{32, 128}, 1, 16, {8, 1, 8}},
                                           ParamT{{32, 32}, 0, 16, {8, 2, 4}},
                                           ParamT{{32, 32}, 1, 16, {8, 2, 4}},
                                           ParamT{{16, 16}, 0, 16, {8, 4, 2}},
                                           ParamT{{16, 16}, 1, 16, {8, 4, 2}}));
triton-2.0.0/unittest/googletest.cmake000066400000000000000000000012151440023377100200650ustar00rootroot00000000000000include(FetchContent)

set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against")
if(GOOGLETEST_DIR)
  set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override")
endif()

FetchContent_Declare(
  googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG        release-1.12.1
)

FetchContent_GetProperties(googletest)
if(NOT googletest_POPULATED)
  FetchContent_Populate(googletest)
  if (MSVC)
    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
  endif()
  add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
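Note on extending the unit-test tree above: every suite is wired in through the add_triton_ut() helper defined in unittest/CMakeLists.txt (NAME/SRCS/LIBS arguments, add_test() and gtest_discover_tests() registration) plus an add_subdirectory() call in the parent list file, exactly as the Analysis, Conversion and Dialect suites do. The sketch below only illustrates that pattern; the TestFoo target, the FooTest.cpp source, and the choice of TritonAnalysis as the linked library are hypothetical placeholders, not files present in this archive.

# unittest/Foo/CMakeLists.txt -- hypothetical example of the add_triton_ut pattern
add_triton_ut(
  NAME TestFoo          # executable and CTest target name
  SRCS FooTest.cpp      # GoogleTest sources (TEST / TEST_F / TEST_P cases)
  LIBS TritonAnalysis   # libraries the test links against
)
# unittest/CMakeLists.txt would additionally need:
# add_subdirectory(Foo)

Because add_triton_ut() already calls add_test() and gtest_discover_tests(), a suite registered this way shows up under ctest with no further wiring.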